diff options
Diffstat (limited to 'fs')
531 files changed, 25674 insertions, 15413 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 7e051147679..814ac4e213a 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -9,6 +9,8 @@ config 9P_FS If unsure, say N. +if 9P_FS + config 9P_FSCACHE bool "Enable 9P client caching support (EXPERIMENTAL)" depends on EXPERIMENTAL @@ -20,7 +22,6 @@ config 9P_FSCACHE config 9P_FS_POSIX_ACL bool "9P POSIX Access Control Lists" - depends on 9P_FS select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -30,3 +31,5 @@ config 9P_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +endif diff --git a/fs/9p/Makefile b/fs/9p/Makefile index f8ba37effd1..ab8c1278063 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o 9p-objs := \ vfs_super.o \ vfs_inode.o \ + vfs_inode_dotl.o \ vfs_addr.o \ vfs_file.o \ vfs_dir.o \ diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 12d602351db..02a2cf61631 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -28,7 +28,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) { ssize_t size; void *value = NULL; - struct posix_acl *acl = NULL;; + struct posix_acl *acl = NULL; size = v9fs_fid_xattr_get(fid, name, NULL, 0); if (size > 0) { @@ -91,11 +91,14 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -int v9fs_check_acl(struct inode *inode, int mask) +int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; struct v9fs_session_info *v9ses; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + v9ses = v9fs_inode2v9ses(inode); if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { /* @@ -362,7 +365,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; if (!S_ISDIR(inode->i_mode)) { - retval = -EINVAL; + retval = acl ? 
-EINVAL : 0; goto err_out; } break; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 59e18c2e8c7..7ef3ac9f6d9 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -16,7 +16,7 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern int v9fs_check_acl(struct inode *inode, int mask); +extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, struct posix_acl *, struct posix_acl *); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index cb6396855e2..c4b5d8864f0 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -113,9 +113,27 @@ struct v9fs_session_info { struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, char *); -void v9fs_session_close(struct v9fs_session_info *v9ses); -void v9fs_session_cancel(struct v9fs_session_info *v9ses); -void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); +extern void v9fs_session_close(struct v9fs_session_info *v9ses); +extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); +extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); +extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nameidata); +extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); +extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); +extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *p); +extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb); + +extern const struct inode_operations v9fs_dir_inode_operations_dotl; +extern const struct inode_operations v9fs_file_inode_operations_dotl; +extern const struct inode_operations v9fs_symlink_inode_operations_dotl; +extern struct inode 
*v9fs_inode_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb); /* other default globals */ #define V9FS_PORT 564 @@ -138,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000L; } + +/** + * v9fs_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_dotl(v9ses, fid, sb); + else + return v9fs_inode(v9ses, fid, sb); +} diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index bab0eac873f..b789f8e597e 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -59,7 +59,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); -void v9fs_dentry_release(struct dentry *); int v9fs_uflags2omode(int uflags, int extended); ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index cbf4e50f393..233b7d4ffe5 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -51,7 +51,7 @@ * */ -static int v9fs_dentry_delete(struct dentry *dentry) +static int v9fs_dentry_delete(const struct dentry *dentry) { P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, dentry); @@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry) * */ -static int v9fs_cached_dentry_delete(struct dentry *dentry) +static int v9fs_cached_dentry_delete(const struct dentry *dentry) { struct inode *inode = dentry->d_inode; P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", 
dentry->d_name.name, @@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(struct dentry *dentry) * */ -void v9fs_dentry_release(struct dentry *dentry) +static void v9fs_dentry_release(struct dentry *dentry) { struct v9fs_dentry *dent; struct p9_fid *temp, *current_fid; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 34bf71b5654..b76a40bdf4c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -49,15 +49,8 @@ static const struct inode_operations v9fs_dir_inode_operations; static const struct inode_operations v9fs_dir_inode_operations_dotu; -static const struct inode_operations v9fs_dir_inode_operations_dotl; static const struct inode_operations v9fs_file_inode_operations; -static const struct inode_operations v9fs_file_inode_operations_dotl; static const struct inode_operations v9fs_symlink_inode_operations; -static const struct inode_operations v9fs_symlink_inode_operations_dotl; - -static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, - dev_t rdev); /** * unixmode2p9mode - convert unix mode bits to plan 9 @@ -237,46 +230,18 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) * */ -void v9fs_destroy_inode(struct inode *inode) +static void v9fs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); } -#endif -/** - * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a - * new file system object. This checks the S_ISGID to determine the owning - * group of the new file system object. - */ - -static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) -{ - BUG_ON(dir_inode == NULL); - - if (dir_inode->i_mode & S_ISGID) { - /* set_gid bit is set.*/ - return dir_inode->i_gid; - } - return current_fsgid(); -} - -/** - * v9fs_dentry_from_dir_inode - helper function to get the dentry from - * dir inode. 
- * - */ - -static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) +void v9fs_destroy_inode(struct inode *inode) { - struct dentry *dentry; - - spin_lock(&dcache_lock); - /* Directory should have only one entry. */ - BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); - dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); - spin_unlock(&dcache_lock); - return dentry; + call_rcu(&inode->i_rcu, v9fs_i_callback); } +#endif /** * v9fs_get_inode - helper function to setup an inode @@ -447,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode) #endif } -static struct inode * +struct inode * v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { @@ -482,60 +447,6 @@ error: return ERR_PTR(err); } -static struct inode * -v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) -{ - struct inode *ret = NULL; - int err; - struct p9_stat_dotl *st; - - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); - if (IS_ERR(st)) - return ERR_CAST(st); - - ret = v9fs_get_inode(sb, st->st_mode); - if (IS_ERR(ret)) { - err = PTR_ERR(ret); - goto error; - } - - v9fs_stat2inode_dotl(st, ret); - ret->i_ino = v9fs_qid2ino(&st->qid); -#ifdef CONFIG_9P_FSCACHE - v9fs_vcookie_set_qid(ret, &st->qid); - v9fs_cache_inode_get_cookie(ret); -#endif - err = v9fs_get_acl(ret, fid); - if (err) { - iput(ret); - goto error; - } - kfree(st); - return ret; -error: - kfree(st); - return ERR_PTR(err); -} - -/** - * v9fs_inode_from_fid - Helper routine to populate an inode by - * issuing a attribute request - * @v9ses: session information - * @fid: fid to issue attribute request for - * @sb: superblock on which to create inode - * - */ -static inline struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) -{ - if (v9fs_proto_dotl(v9ses)) - return v9fs_inode_dotl(v9ses, fid, sb); - else - return v9fs_inode(v9ses, fid, sb); -} - /** * 
v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted @@ -626,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - - if (v9ses->cache) - dentry->d_op = &v9fs_cached_dentry_operations; - else - dentry->d_op = &v9fs_dentry_operations; - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) @@ -650,144 +555,6 @@ error: } /** - * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. - * @dir: directory inode that is being created - * @dentry: dentry that is being deleted - * @mode: create permissions - * @nd: path information - * - */ - -static int -v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, - struct nameidata *nd) -{ - int err = 0; - char *name = NULL; - gid_t gid; - int flags; - mode_t mode; - struct v9fs_session_info *v9ses; - struct p9_fid *fid = NULL; - struct p9_fid *dfid, *ofid; - struct file *filp; - struct p9_qid qid; - struct inode *inode; - struct posix_acl *pacl = NULL, *dacl = NULL; - - v9ses = v9fs_inode2v9ses(dir); - if (nd && nd->flags & LOOKUP_OPEN) - flags = nd->intent.open.flags - 1; - else { - /* - * create call without LOOKUP_OPEN is due - * to mknod of regular files. So use mknod - * operation. 
- */ - return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); - } - - name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " - "mode:0x%x\n", name, flags, omode); - - dfid = v9fs_fid_lookup(dentry->d_parent); - if (IS_ERR(dfid)) { - err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - return err; - } - - /* clone a fid to use for creation */ - ofid = p9_client_walk(dfid, 0, NULL, 1); - if (IS_ERR(ofid)) { - err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - return err; - } - - gid = v9fs_get_fsgid_for_create(dir); - - mode = omode; - /* Update mode based on ACL value */ - err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); - if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in creat %d\n", err); - goto error; - } - err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); - if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_open_dotl failed in creat %d\n", - err); - goto error; - } - /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE || - (nd && nd->flags & LOOKUP_OPEN)) { - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - dentry->d_op = &v9fs_cached_dentry_operations; - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - /* The fid would get clunked via a dput */ - fid = NULL; - } else { - /* - * Not in cached mode. No need to populate - * inode with stat. 
We need to get an inode - * so that we can set the acl with dentry - */ - inode = v9fs_get_inode(dir->i_sb, mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - dentry->d_op = &v9fs_dentry_operations; - d_instantiate(dentry, inode); - } - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); - - /* if we are opening a file, assign the open fid to the file */ - if (nd && nd->flags & LOOKUP_OPEN) { - filp = lookup_instantiate_filp(nd, dentry, generic_file_open); - if (IS_ERR(filp)) { - p9_client_clunk(ofid); - return PTR_ERR(filp); - } - filp->private_data = ofid; - } else - p9_client_clunk(ofid); - - return 0; - -error: - if (ofid) - p9_client_clunk(ofid); - if (fid) - p9_client_clunk(fid); - return err; -} - -/** * v9fs_vfs_create - VFS hook to create files * @dir: directory inode that is being created * @dentry: dentry that is being deleted @@ -877,107 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return err; } - -/** - * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory - * @dir: inode that is being unlinked - * @dentry: dentry that is being unlinked - * @mode: mode for new directory - * - */ - -static int v9fs_vfs_mkdir_dotl(struct inode *dir, - struct dentry *dentry, int omode) -{ - int err; - struct v9fs_session_info *v9ses; - struct p9_fid *fid = NULL, *dfid = NULL; - gid_t gid; - char *name; - mode_t mode; - struct inode *inode; - struct p9_qid qid; - struct dentry *dir_dentry; - struct posix_acl *dacl = NULL, *pacl = NULL; - - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); - err = 0; - v9ses = v9fs_inode2v9ses(dir); - - omode |= S_IFDIR; - if (dir->i_mode & S_ISGID) - omode |= S_ISGID; - - dir_dentry = v9fs_dentry_from_dir_inode(dir); - dfid = v9fs_fid_lookup(dir_dentry); - if (IS_ERR(dfid)) { - err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - dfid = NULL; - goto error; - } - - gid = 
v9fs_get_fsgid_for_create(dir); - mode = omode; - /* Update mode based on ACL value */ - err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); - if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mkdir %d\n", err); - goto error; - } - name = (char *) dentry->d_name.name; - err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); - if (err < 0) - goto error; - - /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - dentry->d_op = &v9fs_cached_dentry_operations; - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - fid = NULL; - } else { - /* - * Not in cached mode. No need to populate - * inode with stat. 
We need to get an inode - * so that we can set the acl with dentry - */ - inode = v9fs_get_inode(dir->i_sb, mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - dentry->d_op = &v9fs_dentry_operations; - d_instantiate(dentry, inode); - } - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); - -error: - if (fid) - p9_client_clunk(fid); - return err; -} - /** * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode * @dir: inode that is being walked from @@ -986,7 +652,7 @@ error: * */ -static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, +struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nameidata) { struct super_block *sb; @@ -1033,11 +699,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto error_iput; inst_out: - if (v9ses->cache) - dentry->d_op = &v9fs_cached_dentry_operations; - else - dentry->d_op = &v9fs_dentry_operations; - d_add(dentry, inode); return NULL; @@ -1056,7 +717,7 @@ error: * */ -static int v9fs_vfs_unlink(struct inode *i, struct dentry *d) +int v9fs_vfs_unlink(struct inode *i, struct dentry *d) { return v9fs_remove(i, d, 0); } @@ -1068,7 +729,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d) * */ -static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) +int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) { return v9fs_remove(i, d, 1); } @@ -1082,7 +743,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) * */ -static int +int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -1189,42 +850,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int -v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - int err; - struct v9fs_session_info *v9ses; - struct p9_fid *fid; - struct p9_stat_dotl *st; - - 
P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); - err = -EPERM; - v9ses = v9fs_inode2v9ses(dentry->d_inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - return simple_getattr(mnt, dentry, stat); - - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); - - /* Ask for all the fields in stat structure. Server will return - * whatever it supports - */ - - st = p9_client_getattr_dotl(fid, P9_STATS_ALL); - if (IS_ERR(st)) - return PTR_ERR(st); - - v9fs_stat2inode_dotl(st, dentry->d_inode); - generic_fillattr(dentry->d_inode, stat); - /* Change block size to what the server returned */ - stat->blksize = st->st_blksize; - - kfree(st); - return 0; -} - /** * v9fs_vfs_setattr - set file metadata * @dentry: file whose metadata to set @@ -1284,64 +909,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /** - * v9fs_vfs_setattr_dotl - set file metadata - * @dentry: file whose metadata to set - * @iattr: metadata assignment structure - * - */ - -int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) -{ - int retval; - struct v9fs_session_info *v9ses; - struct p9_fid *fid; - struct p9_iattr_dotl p9attr; - - P9_DPRINTK(P9_DEBUG_VFS, "\n"); - - retval = inode_change_ok(dentry->d_inode, iattr); - if (retval) - return retval; - - p9attr.valid = iattr->ia_valid; - p9attr.mode = iattr->ia_mode; - p9attr.uid = iattr->ia_uid; - p9attr.gid = iattr->ia_gid; - p9attr.size = iattr->ia_size; - p9attr.atime_sec = iattr->ia_atime.tv_sec; - p9attr.atime_nsec = iattr->ia_atime.tv_nsec; - p9attr.mtime_sec = iattr->ia_mtime.tv_sec; - p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; - - retval = -EPERM; - v9ses = v9fs_inode2v9ses(dentry->d_inode); - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); - - retval = p9_client_setattr(fid, &p9attr); - if (retval < 0) - return retval; - - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(dentry->d_inode)) { - retval = 
vmtruncate(dentry->d_inode, iattr->ia_size); - if (retval) - return retval; - } - - setattr_copy(dentry->d_inode, iattr); - mark_inode_dirty(dentry->d_inode); - if (iattr->ia_valid & ATTR_MODE) { - /* We also want to update ACL when we update mode bits */ - retval = v9fs_acl_chmod(dentry); - if (retval < 0) - return retval; - } - return 0; -} - -/** * v9fs_stat2inode - populate an inode structure with mistat info * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate @@ -1419,77 +986,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, } /** - * v9fs_stat2inode_dotl - populate an inode structure with stat info - * @stat: stat structure - * @inode: inode to populate - * @sb: superblock of filesystem - * - */ - -void -v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) -{ - - if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { - inode->i_atime.tv_sec = stat->st_atime_sec; - inode->i_atime.tv_nsec = stat->st_atime_nsec; - inode->i_mtime.tv_sec = stat->st_mtime_sec; - inode->i_mtime.tv_nsec = stat->st_mtime_nsec; - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; - inode->i_uid = stat->st_uid; - inode->i_gid = stat->st_gid; - inode->i_nlink = stat->st_nlink; - inode->i_mode = stat->st_mode; - inode->i_rdev = new_decode_dev(stat->st_rdev); - - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, inode->i_rdev); - - i_size_write(inode, stat->st_size); - inode->i_blocks = stat->st_blocks; - } else { - if (stat->st_result_mask & P9_STATS_ATIME) { - inode->i_atime.tv_sec = stat->st_atime_sec; - inode->i_atime.tv_nsec = stat->st_atime_nsec; - } - if (stat->st_result_mask & P9_STATS_MTIME) { - inode->i_mtime.tv_sec = stat->st_mtime_sec; - inode->i_mtime.tv_nsec = stat->st_mtime_nsec; - } - if (stat->st_result_mask & P9_STATS_CTIME) { - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; - 
} - if (stat->st_result_mask & P9_STATS_UID) - inode->i_uid = stat->st_uid; - if (stat->st_result_mask & P9_STATS_GID) - inode->i_gid = stat->st_gid; - if (stat->st_result_mask & P9_STATS_NLINK) - inode->i_nlink = stat->st_nlink; - if (stat->st_result_mask & P9_STATS_MODE) { - inode->i_mode = stat->st_mode; - if ((S_ISBLK(inode->i_mode)) || - (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, - inode->i_rdev); - } - if (stat->st_result_mask & P9_STATS_RDEV) - inode->i_rdev = new_decode_dev(stat->st_rdev); - if (stat->st_result_mask & P9_STATS_SIZE) - i_size_write(inode, stat->st_size); - if (stat->st_result_mask & P9_STATS_BLOCKS) - inode->i_blocks = stat->st_blocks; - } - if (stat->st_result_mask & P9_STATS_GEN) - inode->i_generation = stat->st_gen; - - /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION - * because the inode structure does not have fields for them. - */ -} - -/** * v9fs_qid2ino - convert qid into inode number * @qid: qid to hash * @@ -1595,7 +1091,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) * */ -static void +void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); @@ -1639,94 +1135,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, } /** - * v9fs_vfs_symlink_dotl - helper function to create symlinks - * @dir: directory inode containing symlink - * @dentry: dentry for symlink - * @symname: symlink data - * - * See Also: 9P2000.L RFC for more information - * - */ - -static int -v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct v9fs_session_info *v9ses; - struct p9_fid *dfid; - struct p9_fid *fid = NULL; - struct inode *inode; - struct p9_qid qid; - char *name; - int err; - gid_t gid; - - name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", - dir->i_ino, name, symname); - v9ses = v9fs_inode2v9ses(dir); 
- - dfid = v9fs_fid_lookup(dentry->d_parent); - if (IS_ERR(dfid)) { - err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - return err; - } - - gid = v9fs_get_fsgid_for_create(dir); - - /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ - err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); - - if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); - goto error; - } - - if (v9ses->cache) { - /* Now walk from the parent so we can get an unopened fid. */ - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - - /* instantiate inode and assign the unopened fid to dentry */ - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - dentry->d_op = &v9fs_cached_dentry_operations; - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - fid = NULL; - } else { - /* Not in cached mode. 
No need to populate inode with stat */ - inode = v9fs_get_inode(dir->i_sb, S_IFLNK); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - dentry->d_op = &v9fs_dentry_operations; - d_instantiate(dentry, inode); - } - -error: - if (fid) - p9_client_clunk(fid); - - return err; -} - -/** * v9fs_vfs_symlink - helper function to create symlinks * @dir: directory inode containing symlink * @dentry: dentry for symlink @@ -1785,77 +1193,6 @@ clunk_fid: } /** - * v9fs_vfs_link_dotl - create a hardlink for dotl - * @old_dentry: dentry for file to link to - * @dir: inode destination for new link - * @dentry: dentry for link - * - */ - -static int -v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - int err; - struct p9_fid *dfid, *oldfid; - char *name; - struct v9fs_session_info *v9ses; - struct dentry *dir_dentry; - - P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", - dir->i_ino, old_dentry->d_name.name, - dentry->d_name.name); - - v9ses = v9fs_inode2v9ses(dir); - dir_dentry = v9fs_dentry_from_dir_inode(dir); - dfid = v9fs_fid_lookup(dir_dentry); - if (IS_ERR(dfid)) - return PTR_ERR(dfid); - - oldfid = v9fs_fid_lookup(old_dentry); - if (IS_ERR(oldfid)) - return PTR_ERR(oldfid); - - name = (char *) dentry->d_name.name; - - err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); - - if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); - return err; - } - - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - /* Get the latest stat info from server. */ - struct p9_fid *fid; - struct p9_stat_dotl *st; - - fid = v9fs_fid_lookup(old_dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); - - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); - if (IS_ERR(st)) - return PTR_ERR(st); - - v9fs_stat2inode_dotl(st, old_dentry->d_inode); - - kfree(st); - } else { - /* Caching disabled. No need to get upto date stat info. - * This dentry will be released immediately. 
So, just hold the - * inode - */ - ihold(old_dentry->d_inode); - } - - dentry->d_op = old_dentry->d_op; - d_instantiate(dentry, old_dentry->d_inode); - - return err; -} - -/** * v9fs_vfs_mknod - create a special file * @dir: inode destination for new link * @dentry: dentry for file @@ -1900,160 +1237,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } -/** - * v9fs_vfs_mknod_dotl - create a special file - * @dir: inode destination for new link - * @dentry: dentry for file - * @mode: mode for creation - * @rdev: device associated with special file - * - */ -static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, - dev_t rdev) -{ - int err; - char *name; - mode_t mode; - struct v9fs_session_info *v9ses; - struct p9_fid *fid = NULL, *dfid = NULL; - struct inode *inode; - gid_t gid; - struct p9_qid qid; - struct dentry *dir_dentry; - struct posix_acl *dacl = NULL, *pacl = NULL; - - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); - - if (!new_valid_dev(rdev)) - return -EINVAL; - - v9ses = v9fs_inode2v9ses(dir); - dir_dentry = v9fs_dentry_from_dir_inode(dir); - dfid = v9fs_fid_lookup(dir_dentry); - if (IS_ERR(dfid)) { - err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - dfid = NULL; - goto error; - } - - gid = v9fs_get_fsgid_for_create(dir); - mode = omode; - /* Update mode based on ACL value */ - err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); - if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mknod %d\n", err); - goto error; - } - name = (char *) dentry->d_name.name; - - err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); - if (err < 0) - goto error; - - /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - fid = p9_client_walk(dfid, 1, &name, 1); - if 
(IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - dentry->d_op = &v9fs_cached_dentry_operations; - d_instantiate(dentry, inode); - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - fid = NULL; - } else { - /* - * Not in cached mode. No need to populate inode with stat. - * socket syscall returns a fd, so we need instantiate - */ - inode = v9fs_get_inode(dir->i_sb, mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - dentry->d_op = &v9fs_dentry_operations; - d_instantiate(dentry, inode); - } - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); -error: - if (fid) - p9_client_clunk(fid); - return err; -} - -static int -v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen) -{ - int retval; - struct p9_fid *fid; - char *target = NULL; - - P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); - retval = -EPERM; - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); - - retval = p9_client_readlink(fid, &target); - if (retval < 0) - return retval; - - strncpy(buffer, target, buflen); - P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer); - - retval = strnlen(buffer, buflen); - return retval; -} - -/** - * v9fs_vfs_follow_link_dotl - follow a symlink path - * @dentry: dentry for symlink - * @nd: nameidata - * - */ - -static void * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) -{ - int len = 0; - char *link = __getname(); - - P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); - - if (!link) - link = ERR_PTR(-ENOMEM); - else { - len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX); - if (len < 0) { - __putname(link); - link = ERR_PTR(len); - } else 
- link[min(len, PATH_MAX-1)] = 0; - } - nd_set_link(nd, link); - - return NULL; -} - static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, @@ -2068,25 +1251,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { .setattr = v9fs_vfs_setattr, }; -static const struct inode_operations v9fs_dir_inode_operations_dotl = { - .create = v9fs_vfs_create_dotl, - .lookup = v9fs_vfs_lookup, - .link = v9fs_vfs_link_dotl, - .symlink = v9fs_vfs_symlink_dotl, - .unlink = v9fs_vfs_unlink, - .mkdir = v9fs_vfs_mkdir_dotl, - .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod_dotl, - .rename = v9fs_vfs_rename, - .getattr = v9fs_vfs_getattr_dotl, - .setattr = v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, - .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, -}; - static const struct inode_operations v9fs_dir_inode_operations = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, @@ -2104,16 +1268,6 @@ static const struct inode_operations v9fs_file_inode_operations = { .setattr = v9fs_vfs_setattr, }; -static const struct inode_operations v9fs_file_inode_operations_dotl = { - .getattr = v9fs_vfs_getattr_dotl, - .setattr = v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, - .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, -}; - static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, @@ -2122,14 +1276,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = { .setattr = v9fs_vfs_setattr, }; -static const struct inode_operations v9fs_symlink_inode_operations_dotl = { - .readlink = v9fs_vfs_readlink_dotl, - .follow_link = v9fs_vfs_follow_link_dotl, - .put_link = v9fs_vfs_put_link, - .getattr = v9fs_vfs_getattr_dotl, - .setattr = 
v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, - .listxattr = v9fs_listxattr, -}; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c new file mode 100644 index 00000000000..fe3ffa9aace --- /dev/null +++ b/fs/9p/vfs_inode_dotl.c @@ -0,0 +1,824 @@ +/* + * linux/fs/9p/vfs_inode_dotl.c + * + * This file contains vfs inode ops for the 9P2000.L protocol. + * + * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/pagemap.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/inet.h> +#include <linux/namei.h> +#include <linux/idr.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> + +#include "v9fs.h" +#include "v9fs_vfs.h" +#include "fid.h" +#include "cache.h" +#include "xattr.h" +#include "acl.h" + +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev); + +/** + * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * new file system object. 
This checks the S_ISGID to determine the owning + * group of the new file system object. + */ + +static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +{ + BUG_ON(dir_inode == NULL); + + if (dir_inode->i_mode & S_ISGID) { + /* set_gid bit is set.*/ + return dir_inode->i_gid; + } + return current_fsgid(); +} + +/** + * v9fs_dentry_from_dir_inode - helper function to get the dentry from + * dir inode. + * + */ + +static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + /* Directory should have only one entry. */ + BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); + dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); + spin_unlock(&inode->i_lock); + return dentry; +} + +struct inode * +v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + struct inode *ret = NULL; + int err; + struct p9_stat_dotl *st; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return ERR_CAST(st); + + ret = v9fs_get_inode(sb, st->st_mode); + if (IS_ERR(ret)) { + err = PTR_ERR(ret); + goto error; + } + + v9fs_stat2inode_dotl(st, ret); + ret->i_ino = v9fs_qid2ino(&st->qid); +#ifdef CONFIG_9P_FSCACHE + v9fs_vcookie_set_qid(ret, &st->qid); + v9fs_cache_inode_get_cookie(ret); +#endif + err = v9fs_get_acl(ret, fid); + if (err) { + iput(ret); + goto error; + } + kfree(st); + return ret; +error: + kfree(st); + return ERR_PTR(err); +} + +/** + * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. 
+ * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, + struct nameidata *nd) +{ + int err = 0; + char *name = NULL; + gid_t gid; + int flags; + mode_t mode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL; + struct p9_fid *dfid, *ofid; + struct file *filp; + struct p9_qid qid; + struct inode *inode; + struct posix_acl *pacl = NULL, *dacl = NULL; + + v9ses = v9fs_inode2v9ses(dir); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else { + /* + * create call without LOOKUP_OPEN is due + * to mknod of regular files. So use mknod + * operation. + */ + return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + } + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " + "mode:0x%x\n", name, flags, omode); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in creat %d\n", err); + goto error; + } + err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_open_dotl failed in creat %d\n", + err); + goto error; + } + + /* instantiate inode and assign the unopened fid to the dentry */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, 
"p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); + + /* Since we are opening a file, assign the open fid to the file */ + filp = lookup_instantiate_filp(nd, dentry, generic_file_open); + if (IS_ERR(filp)) { + p9_client_clunk(ofid); + return PTR_ERR(filp); + } + filp->private_data = ofid; + return 0; + +error: + if (ofid) + p9_client_clunk(ofid); + if (fid) + p9_client_clunk(fid); + return err; +} + +/** + * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir_dotl(struct inode *dir, + struct dentry *dentry, int omode) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + gid_t gid; + char *name; + mode_t mode; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + + omode |= S_IFDIR; + if (dir->i_mode & S_ISGID) + omode |= S_ISGID; + + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mkdir %d\n", err); + goto error; + } + name = 
(char *) dentry->d_name.name; + err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate + * inode with stat. We need to get an inode + * so that we can set the acl with dentry + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); + +error: + if (fid) + p9_client_clunk(fid); + return err; +} + +static int +v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat_dotl *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + return simple_getattr(mnt, dentry, stat); + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Ask for all the fields in stat structure. 
Server will return + * whatever it supports + */ + + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, dentry->d_inode); + generic_fillattr(dentry->d_inode, stat); + /* Change block size to what the server returned */ + stat->blksize = st->st_blksize; + + kfree(st); + return 0; +} + +/** + * v9fs_vfs_setattr_dotl - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + p9attr.valid = iattr->ia_valid; + p9attr.mode = iattr->ia_mode; + p9attr.uid = iattr->ia_uid; + p9attr.gid = iattr->ia_gid; + p9attr.size = iattr->ia_size; + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + + retval = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + retval = p9_client_setattr(fid, &p9attr); + if (retval < 0) + return retval; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(dentry->d_inode)) { + retval = vmtruncate(dentry->d_inode, iattr->ia_size); + if (retval) + return retval; + } + + setattr_copy(dentry->d_inode, iattr); + mark_inode_dirty(dentry->d_inode); + if (iattr->ia_valid & ATTR_MODE) { + /* We also want to update ACL when we update mode bits */ + retval = v9fs_acl_chmod(dentry); + if (retval < 0) + return retval; + } + return 0; +} + +/** + * v9fs_stat2inode_dotl - populate an inode structure with stat info + * @stat: stat structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void 
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +{ + + if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode->i_uid = stat->st_uid; + inode->i_gid = stat->st_gid; + inode->i_nlink = stat->st_nlink; + inode->i_mode = stat->st_mode; + inode->i_rdev = new_decode_dev(stat->st_rdev); + + if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + + i_size_write(inode, stat->st_size); + inode->i_blocks = stat->st_blocks; + } else { + if (stat->st_result_mask & P9_STATS_ATIME) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + } + if (stat->st_result_mask & P9_STATS_MTIME) { + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + } + if (stat->st_result_mask & P9_STATS_CTIME) { + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + } + if (stat->st_result_mask & P9_STATS_UID) + inode->i_uid = stat->st_uid; + if (stat->st_result_mask & P9_STATS_GID) + inode->i_gid = stat->st_gid; + if (stat->st_result_mask & P9_STATS_NLINK) + inode->i_nlink = stat->st_nlink; + if (stat->st_result_mask & P9_STATS_MODE) { + inode->i_mode = stat->st_mode; + if ((S_ISBLK(inode->i_mode)) || + (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } + if (stat->st_result_mask & P9_STATS_RDEV) + inode->i_rdev = new_decode_dev(stat->st_rdev); + if (stat->st_result_mask & P9_STATS_SIZE) + i_size_write(inode, stat->st_size); + if (stat->st_result_mask & P9_STATS_BLOCKS) + inode->i_blocks = stat->st_blocks; + } + if (stat->st_result_mask & P9_STATS_GEN) + 
inode->i_generation = stat->st_gen; + + /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION + * because the inode structure does not have fields for them. + */ +} + +static int +v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct inode *inode; + struct p9_qid qid; + char *name; + int err; + gid_t gid; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", + dir->i_ino, name, symname); + v9ses = v9fs_inode2v9ses(dir); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ + err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + goto error; + } + + if (v9ses->cache) { + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* Not in cached mode. 
No need to populate inode with stat */ + inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** + * v9fs_vfs_link_dotl - create a hardlink for dotl + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err; + struct p9_fid *dfid, *oldfid; + char *name; + struct v9fs_session_info *v9ses; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, + dentry->d_name.name); + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) + return PTR_ERR(dfid); + + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = (char *) dentry->d_name.name; + + err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + return err; + } + + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Get the latest stat info from server. */ + struct p9_fid *fid; + struct p9_stat_dotl *st; + + fid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, old_dentry->d_inode); + + kfree(st); + } else { + /* Caching disabled. No need to get upto date stat info. + * This dentry will be released immediately. 
So, just hold the + * inode + */ + ihold(old_dentry->d_inode); + } + d_instantiate(dentry, old_dentry->d_inode); + + return err; +} + +/** + * v9fs_vfs_mknod_dotl - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, + dev_t rdev) +{ + int err; + char *name; + mode_t mode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + struct inode *inode; + gid_t gid; + struct p9_qid qid; + struct dentry *dir_dentry; + struct posix_acl *dacl = NULL, *pacl = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + mode = omode; + /* Update mode based on ACL value */ + err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, + "Failed to get acl values in mknod %d\n", err); + goto error; + } + name = (char *) dentry->d_name.name; + + err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode 
creation failed %d\n", + err); + goto error; + } + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate inode with stat. + * socket syscall returns a fd, so we need instantiate + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + d_instantiate(dentry, inode); + } + /* Now set the ACL based on the default value */ + v9fs_set_create_acl(dentry, dacl, pacl); +error: + if (fid) + p9_client_clunk(fid); + return err; +} + +/** + * v9fs_vfs_follow_link_dotl - follow a symlink path + * @dentry: dentry for symlink + * @nd: nameidata + * + */ + +static void * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +{ + int retval; + struct p9_fid *fid; + char *link = __getname(); + char *target; + + P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + + if (!link) { + link = ERR_PTR(-ENOMEM); + goto ndset; + } + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + __putname(link); + link = ERR_PTR(PTR_ERR(fid)); + goto ndset; + } + retval = p9_client_readlink(fid, &target); + if (!retval) { + strcpy(link, target); + kfree(target); + goto ndset; + } + __putname(link); + link = ERR_PTR(retval); +ndset: + nd_set_link(nd, link); + return NULL; +} + +const struct inode_operations v9fs_dir_inode_operations_dotl = { + .create = v9fs_vfs_create_dotl, + .lookup = v9fs_vfs_lookup, + .link = v9fs_vfs_link_dotl, + .symlink = v9fs_vfs_symlink_dotl, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir_dotl, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod_dotl, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .check_acl = v9fs_check_acl, +}; + +const struct inode_operations v9fs_file_inode_operations_dotl 
= { + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + .check_acl = v9fs_check_acl, +}; + +const struct inode_operations v9fs_symlink_inode_operations_dotl = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link_dotl, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, +}; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index c55c614500a..dbaabe3b813 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -141,6 +141,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } v9fs_fill_super(sb, v9ses, flags, data); + if (v9ses->cache) + sb->s_d_op = &v9fs_cached_dentry_operations; + else + sb->s_d_op = &v9fs_dentry_operations; + inode = v9fs_get_inode(sb, S_IFDIR | mode); if (IS_ERR(inode)) { retval = PTR_ERR(inode); @@ -217,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); - if (s->s_root) - v9fs_dentry_release(s->s_root); /* clunk root */ - kill_anon_super(s); v9fs_session_cancel(v9ses); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 43ec7df8433..d288773871b 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -133,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, "p9_client_xattrcreate failed %d\n", retval); goto error; } - msize = fid->clnt->msize;; + msize = fid->clnt->msize; while (value_len) { if (value_len > (msize - P9_IOHDRSZ)) write_count = msize - P9_IOHDRSZ; diff --git a/fs/Kconfig b/fs/Kconfig index 771f457402d..3db9caa57ed 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -30,15 +30,6 @@ config FS_MBCACHE source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" -config FS_POSIX_ACL -# Posix ACL utility 
routines (for now, only ext2/ext3/jfs/reiserfs/nfs4) -# -# NOTE: you can implement Posix ACLs without these helpers (XFS does). -# Never use this symbol for ifdefs. -# - bool - default n - source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" @@ -47,11 +38,19 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +# Posix ACL utility routines +# +# Note: Posix ACLs can be implemented without these helpers. Never use +# this symbol for ifdefs in core code. +# +config FS_POSIX_ACL + def_bool n + config EXPORTFS tristate config FILE_LOCKING - bool "Enable POSIX file locking API" if EMBEDDED + bool "Enable POSIX file locking API" if EXPERT default y help This option enables standard file locking support, required diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index f4287e4de74..3b4a764ed78 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = { }; static int -adfs_hash(struct dentry *parent, struct qstr *qstr) +adfs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr) { const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; const unsigned char *name; @@ -237,17 +238,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr) * requirements of the underlying filesystem. 
*/ static int -adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name) +adfs_compare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { int i; - if (entry->len != name->len) + if (len != name->len) return 1; for (i = 0; i < name->len; i++) { char a, b; - a = entry->name[i]; + a = str[i]; b = name->name[i]; if (a >= 'A' && a <= 'Z') @@ -273,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) struct object_info obj; int error; - dentry->d_op = &adfs_dentry_operations; lock_kernel(); error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); if (error == 0) { diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 959dbff2d42..2d7954049fb 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void adfs_destroy_inode(struct inode *inode) +static void adfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } +static void adfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, adfs_i_callback); +} + static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; @@ -466,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) asb->s_namelen = ADFS_F_NAME_LEN; } + sb->s_d_op = &adfs_dentry_operations; root = adfs_iget(sb, &root_obj); sb->s_root = d_alloc_root(root); if (!sb->s_root) { @@ -476,8 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) kfree(asb->s_map); adfs_error(sb, "get root inode failed\n"); goto error; - } else - sb->s_root->d_op = &adfs_dentry_operations; + } unlock_kernel(); return 0; diff --git a/fs/affs/affs.h 
b/fs/affs/affs.h index a8cbdeb3402..0e95f73a702 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -201,6 +201,7 @@ extern const struct address_space_operations affs_aops; extern const struct address_space_operations affs_aops_ofs; extern const struct dentry_operations affs_dentry_operations; +extern const struct dentry_operations affs_intl_dentry_operations; static inline void affs_set_blocksize(struct super_block *sb, int size) diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 7d0f0a30f7a..3a4557e8325 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -128,7 +128,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino) void *data = dentry->d_fsdata; struct list_head *head, *next; - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); head = &inode->i_dentry; next = head->next; while (next != head) { @@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino) } next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 914d1c0bc07..e3e9efc1fdd 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -13,18 +13,26 @@ typedef int (*toupper_t)(int); static int affs_toupper(int ch); -static int affs_hash_dentry(struct dentry *, struct qstr *); -static int affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int affs_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); +static int affs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); static int affs_intl_toupper(int ch); -static int affs_intl_hash_dentry(struct dentry *, struct qstr *); -static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +static int affs_intl_hash_dentry(const struct dentry *, + const struct inode *, struct qstr *); +static int affs_intl_compare_dentry(const struct dentry 
*parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); const struct dentry_operations affs_dentry_operations = { .d_hash = affs_hash_dentry, .d_compare = affs_compare_dentry, }; -static const struct dentry_operations affs_intl_dentry_operations = { +const struct dentry_operations affs_intl_dentry_operations = { .d_hash = affs_intl_hash_dentry, .d_compare = affs_intl_compare_dentry, }; @@ -58,13 +66,13 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) { const u8 *name = qstr->name; unsigned long hash; int i; - i = affs_check_name(qstr->name,qstr->len); + i = affs_check_name(qstr->name, qstr->len); if (i) return i; @@ -78,39 +86,41 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) } static int -affs_hash_dentry(struct dentry *dentry, struct qstr *qstr) +affs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - return __affs_hash_dentry(dentry, qstr, affs_toupper); + return __affs_hash_dentry(qstr, affs_toupper); } static int -affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr) +affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - return __affs_hash_dentry(dentry, qstr, affs_intl_toupper); + return __affs_hash_dentry(qstr, affs_intl_toupper); } -static inline int -__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper) +static inline int __affs_compare_dentry(unsigned int len, + const char *str, const struct qstr *name, toupper_t toupper) { - const u8 *aname = a->name; - const u8 *bname = b->name; - int len; + const u8 *aname = str; + const u8 *bname = name->name; - /* 'a' is the qstr of an 
already existing dentry, so the name - * must be valid. 'b' must be validated first. + /* + * 'str' is the name of an already existing dentry, so the name + * must be valid. 'name' must be validated first. */ - if (affs_check_name(b->name,b->len)) + if (affs_check_name(name->name, name->len)) return 1; - /* If the names are longer than the allowed 30 chars, + /* + * If the names are longer than the allowed 30 chars, * the excess is ignored, so their length may differ. */ - len = a->len; if (len >= 30) { - if (b->len < 30) + if (name->len < 30) return 1; len = 30; - } else if (len != b->len) + } else if (len != name->len) return 1; for (; len > 0; len--) @@ -121,14 +131,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou } static int -affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(dentry, a, b, affs_toupper); + return __affs_compare_dentry(len, str, name, affs_toupper); } static int -affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(dentry, a, b, affs_intl_toupper); + return __affs_compare_dentry(len, str, name, affs_intl_toupper); } /* @@ -226,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (IS_ERR(inode)) return ERR_CAST(inode); } - dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? 
&affs_intl_dentry_operations : &affs_dentry_operations; d_add(dentry, inode); return NULL; } diff --git a/fs/affs/super.c b/fs/affs/super.c index 0cf7f4384cb..b31507d0f9b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -95,11 +95,18 @@ static struct inode *affs_alloc_inode(struct super_block *sb) return &i->vfs_inode; } -static void affs_destroy_inode(struct inode *inode) +static void affs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } +static void affs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, affs_i_callback); +} + static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; @@ -470,12 +477,16 @@ got_root: goto out_error_noinode; } + if (AFFS_SB(sb)->s_flags & SF_INTL) + sb->s_d_op = &affs_intl_dentry_operations; + else + sb->s_d_op = &affs_dentry_operations; + sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); goto out_error; } - sb->s_root->d_op = &affs_dentry_operations; pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); return 0; diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index a3bcec75c54..1c8c6cc6de3 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, call->server = server; INIT_WORK(&call->work, SRXAFSCB_CallBack); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, call->server = server; INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, call->server = server; INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); - 
schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_Probe); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } @@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); - schedule_work(&call->work); + queue_work(afs_wq, &call->work); return 0; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 5439e1bc9a8..20c106f2492 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/namei.h> #include <linux/pagemap.h> #include <linux/ctype.h> #include <linux/sched.h> @@ -23,7 +24,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); -static int afs_d_delete(struct dentry *dentry); +static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); @@ -61,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = { .setattr = afs_setattr, }; -static const struct dentry_operations afs_fs_dentry_operations = { +const struct dentry_operations afs_fs_dentry_operations = { .d_revalidate = afs_d_revalidate, .d_delete = 
afs_d_delete, .d_release = afs_d_release, + .d_automount = afs_d_automount, }; #define AFS_DIR_HASHTBL_SIZE 128 @@ -581,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, } success: - dentry->d_op = &afs_fs_dentry_operations; - d_add(dentry, inode); _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", fid.vnode, @@ -607,6 +607,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) void *dir_version; int ret; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + vnode = AFS_FS_I(dentry->d_inode); if (dentry->d_inode) @@ -730,7 +733,7 @@ out_bad: * - called from dput() when d_count is going to 0. * - return 1 to request dentry be unhashed, 0 otherwise */ -static int afs_d_delete(struct dentry *dentry) +static int afs_d_delete(const struct dentry *dentry) { _enter("%s", dentry->d_name.name); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0747339011c..db66c520147 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_generation = 0; set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); - inode->i_flags |= S_NOATIME; + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; unlock_new_inode(inode); _leave(" = %p", inode); return inode; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index cca8eef736f..5a9b6843bac 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *); * dir.c */ extern const struct inode_operations afs_dir_inode_operations; +extern const struct dentry_operations afs_fs_dentry_operations; extern const struct file_operations afs_dir_file_operations; /* @@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *); /* * main.c */ +extern struct workqueue_struct *afs_wq; extern struct afs_uuid afs_uuid; /* @@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations; extern const 
struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; +extern struct vfsmount *afs_d_automount(struct path *); extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); extern void afs_mntpt_kill_timer(void); @@ -624,7 +627,7 @@ extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, long); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); -extern int afs_permission(struct inode *, int); +extern int afs_permission(struct inode *, int, unsigned int); /* * server.c diff --git a/fs/afs/main.c b/fs/afs/main.c index cfd1cbe25b2..42dd2e499ed 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -30,6 +30,7 @@ module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); struct afs_uuid afs_uuid; +struct workqueue_struct *afs_wq; /* * get a client UUID @@ -87,10 +88,16 @@ static int __init afs_init(void) if (ret < 0) return ret; + /* create workqueue */ + ret = -ENOMEM; + afs_wq = alloc_workqueue("afs", 0, 0); + if (!afs_wq) + return ret; + /* register the /proc stuff */ ret = afs_proc_init(); if (ret < 0) - return ret; + goto error_proc; #ifdef CONFIG_AFS_FSCACHE /* we want to be able to cache */ @@ -140,6 +147,8 @@ error_cell_init: error_cache: #endif afs_proc_cleanup(); +error_proc: + destroy_workqueue(afs_wq); rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); return ret; @@ -163,7 +172,7 @@ static void __exit afs_exit(void) afs_purge_servers(); afs_callback_update_kill(); afs_vlocation_purge(); - flush_scheduled_work(); + destroy_workqueue(afs_wq); afs_cell_purge(); #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 6153417caf5..aa59184151d 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct 
inode *dir, struct dentry *dentry, struct nameidata *nd); static int afs_mntpt_open(struct inode *inode, struct file *file); -static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd); static void afs_mntpt_expiry_timed_out(struct work_struct *work); const struct file_operations afs_mntpt_file_operations = { @@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = { const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, - .follow_link = afs_mntpt_follow_link, .readlink = page_readlink, .getattr = afs_getattr, }; const struct inode_operations afs_autocell_inode_operations = { - .follow_link = afs_mntpt_follow_link, .getattr = afs_getattr, }; @@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) _debug("symlink is a mountpoint"); spin_lock(&vnode->lock); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + vnode->vfs_inode.i_flags |= S_AUTOMOUNT; spin_unlock(&vnode->lock); } @@ -238,52 +236,24 @@ error_no_devname: } /* - * follow a link from a mountpoint directory, thus causing it to be mounted + * handle an automount point */ -static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - int err; - _enter("%p{%s},{%s:%p{%s},}", - dentry, - dentry->d_name.name, - nd->path.mnt->mnt_devname, - dentry, - nd->path.dentry->d_name.name); - - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); + _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); - newmnt = afs_mntpt_do_automount(nd->path.dentry); - if (IS_ERR(newmnt)) { - path_put(&nd->path); - return (void *)newmnt; - } - - mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts); - switch (err) { - case 0: - path_put(&nd->path); - nd->path.mnt = newmnt; - nd->path.dentry = dget(newmnt->mnt_root); - schedule_delayed_work(&afs_mntpt_expiry_timer, - 
afs_mntpt_expiry_timeout * HZ); - break; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - default: - mntput(newmnt); - break; - } + newmnt = afs_mntpt_do_automount(path->dentry); + if (IS_ERR(newmnt)) + return newmnt; - _leave(" = %d", err); - return ERR_PTR(err); + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + return newmnt; } /* @@ -295,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work) if (!list_empty(&afs_vfsmounts)) { mark_mounts_for_expiry(&afs_vfsmounts); - schedule_delayed_work(&afs_mntpt_expiry_timer, - afs_mntpt_expiry_timeout * HZ); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); } _leave(""); @@ -310,6 +280,5 @@ void afs_mntpt_kill_timer(void) _enter(""); ASSERT(list_empty(&afs_vfsmounts)); - cancel_delayed_work(&afs_mntpt_expiry_timer); - flush_scheduled_work(); + cancel_delayed_work_sync(&afs_mntpt_expiry_timer); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 654d8fdbf01..e45a323aebb 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, if (!call) { /* its an incoming call for our callback service */ skb_queue_tail(&afs_incoming_calls, skb); - schedule_work(&afs_collect_incoming_call_work); + queue_work(afs_wq, &afs_collect_incoming_call_work); } else { /* route the messages directly to the appropriate call */ skb_queue_tail(&call->rx_queue, skb); diff --git a/fs/afs/security.c b/fs/afs/security.c index bb4ed144d0e..f44b9d35537 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached 
to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask) +int afs_permission(struct inode *inode, int mask, unsigned int flags) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t uninitialized_var(access); struct key *key; int ret; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + _enter("{{%x:%u},%lx},%x,", vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); @@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask) } key_put(key); - ret = generic_permission(inode, mask, NULL); + ret = generic_permission(inode, mask, flags, NULL); _leave(" = %d", ret); return ret; diff --git a/fs/afs/server.c b/fs/afs/server.c index 9fdc7fe3a7b..d59b7516e94 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server) if (atomic_read(&server->usage) == 0) { list_move_tail(&server->grave, &afs_server_graveyard); server->time_of_death = get_seconds(); - schedule_delayed_work(&afs_server_reaper, - afs_server_timeout * HZ); + queue_delayed_work(afs_wq, &afs_server_reaper, + afs_server_timeout * HZ); } spin_unlock(&afs_server_graveyard_lock); _leave(" [dead]"); @@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work) expiry = server->time_of_death + afs_server_timeout; if (expiry > now) { delay = (expiry - now) * HZ; - if (!schedule_delayed_work(&afs_server_reaper, delay)) { + if (!queue_delayed_work(afs_wq, &afs_server_reaper, + delay)) { cancel_delayed_work(&afs_server_reaper); - schedule_delayed_work(&afs_server_reaper, - delay); + queue_delayed_work(afs_wq, &afs_server_reaper, + delay); } break; } @@ -323,5 +324,5 @@ void __exit afs_purge_servers(void) { afs_server_timeout = 0; cancel_delayed_work(&afs_server_reaper); - schedule_delayed_work(&afs_server_reaper, 0); + queue_delayed_work(afs_wq, &afs_server_reaper, 0); } diff --git a/fs/afs/super.c b/fs/afs/super.c index 27201cffece..fb240e8766d 100644 --- 
a/fs/afs/super.c +++ b/fs/afs/super.c @@ -336,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data) if (!root) goto error; + sb->s_d_op = &afs_fs_dentry_operations; sb->s_root = root; _leave(" = 0"); @@ -498,6 +499,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb) return &vnode->vfs_inode; } +static void afs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct afs_vnode *vnode = AFS_FS_I(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(afs_inode_cachep, vnode); +} + /* * destroy an AFS inode struct */ @@ -511,7 +520,7 @@ static void afs_destroy_inode(struct inode *inode) ASSERTCMP(vnode->server, ==, NULL); - kmem_cache_free(afs_inode_cachep, vnode); + call_rcu(&inode->i_rcu, afs_i_callback); atomic_dec(&afs_count_active_inodes); } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 9ac260d1361..431984d2e37 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl) _debug("buried"); list_move_tail(&vl->grave, &afs_vlocation_graveyard); vl->time_of_death = get_seconds(); - schedule_delayed_work(&afs_vlocation_reap, - afs_vlocation_timeout * HZ); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + afs_vlocation_timeout * HZ); /* suspend updates on this record */ if (!list_empty(&vl->update)) { @@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work) if (expiry > now) { delay = (expiry - now) * HZ; _debug("delay %lu", delay); - if (!schedule_delayed_work(&afs_vlocation_reap, - delay)) { + if (!queue_delayed_work(afs_wq, &afs_vlocation_reap, + delay)) { cancel_delayed_work(&afs_vlocation_reap); - schedule_delayed_work(&afs_vlocation_reap, - delay); + queue_delayed_work(afs_wq, &afs_vlocation_reap, + delay); } break; } @@ -620,7 +620,7 @@ void afs_vlocation_purge(void) destroy_workqueue(afs_vlocation_update_worker); cancel_delayed_work(&afs_vlocation_reap); - 
schedule_delayed_work(&afs_vlocation_reap, 0); + queue_delayed_work(afs_wq, &afs_vlocation_reap, 0); } /* @@ -87,7 +87,7 @@ static int __init aio_setup(void) aio_wq = create_workqueue("aio"); abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); - BUG_ON(!abe_pool); + BUG_ON(!aio_wq || !abe_pool); pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); @@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx) queue_delayed_work(aio_wq, &ctx->wq, timeout); } - -/* - * aio_run_iocbs: - * Process all pending retries queued on the ioctx - * run list. - * Assumes it is operating within the aio issuer's mm - * context. - */ -static inline void aio_run_iocbs(struct kioctx *ctx) -{ - int requeue; - - spin_lock_irq(&ctx->ctx_lock); - - requeue = __aio_run_iocbs(ctx); - spin_unlock_irq(&ctx->ctx_lock); - if (requeue) - aio_queue_work(ctx); -} - /* - * just like aio_run_iocbs, but keeps running them until - * the list stays empty + * aio_run_all_iocbs: + * Process all pending retries queued on the ioctx + * run list, and keep running them until the list + * stays empty. + * Assumes it is operating within the aio issuer's mm context. 
*/ static inline void aio_run_all_iocbs(struct kioctx *ctx) { @@ -1839,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long ret = -EINVAL; if (likely(ioctx)) { - if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) + if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, timeout); put_ioctx(ioctx); } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 57ce55b2564..c5567cb7843 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -26,12 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly; static struct inode *anon_inode_inode; static const struct file_operations anon_inode_fops; -static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC); -} - /* * anon_inodefs_dname() is called from d_path(). */ @@ -41,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) dentry->d_name.name); } +static const struct dentry_operations anon_inodefs_dentry_operations = { + .d_dname = anon_inodefs_dname, +}; + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "anon_inode:", NULL, + &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); +} + static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", .mount = anon_inodefs_mount, .kill_sb = kill_anon_super, }; -static const struct dentry_operations anon_inodefs_dentry_operations = { - .d_dname = anon_inodefs_dname, -}; /* * nop .set_page_dirty method so that people can use .page_mkwrite on @@ -64,9 +66,9 @@ static const struct address_space_operations anon_aops = { }; /** - * anon_inode_getfd - creates a new file instance by hooking it up to an - * anonymous inode, and a dentry that describe the "class" - * of the file + * anon_inode_getfile - creates a new file instance by 
hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -102,7 +104,7 @@ struct file *anon_inode_getfile(const char *name, this.name = name; this.len = strlen(name); this.hash = 0; - path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); if (!path.dentry) goto err_module; @@ -113,7 +115,6 @@ struct file *anon_inode_getfile(const char *name, */ ihold(anon_inode_inode); - path.dentry->d_op = &anon_inodefs_dentry_operations; d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 3d283abf67d..54f92379272 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -16,6 +16,7 @@ #include <linux/auto_fs4.h> #include <linux/auto_dev-ioctl.h> #include <linux/mutex.h> +#include <linux/spinlock.h> #include <linux/list.h> /* This is the range of ioctl() numbers we claim as ours */ @@ -60,6 +61,8 @@ do { \ current->pid, __func__, ##args); \ } while (0) +extern spinlock_t autofs4_lock; + /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. 
It holds a reference to the dentry, so dentries are never @@ -85,18 +88,9 @@ struct autofs_info { uid_t uid; gid_t gid; - - mode_t mode; - size_t size; - - void (*free)(struct autofs_info *); - union { - const char *symlink; - } u; }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ -#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ struct autofs_wait_queue { @@ -173,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry) return 0; } -static inline void autofs4_copy_atime(struct file *src, struct file *dst) -{ - dst->f_path.dentry->d_inode->i_atime = - src->f_path.dentry->d_inode->i_atime; - return; -} - -struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *); +struct inode *autofs4_get_inode(struct super_block *, mode_t); void autofs4_free_ino(struct autofs_info *); /* Expiration */ @@ -209,16 +196,89 @@ void autofs_dev_ioctl_exit(void); extern const struct inode_operations autofs4_symlink_inode_operations; extern const struct inode_operations autofs4_dir_inode_operations; -extern const struct inode_operations autofs4_root_inode_operations; -extern const struct inode_operations autofs4_indirect_root_inode_operations; -extern const struct inode_operations autofs4_direct_root_inode_operations; extern const struct file_operations autofs4_dir_operations; extern const struct file_operations autofs4_root_operations; +extern const struct dentry_operations autofs4_dentry_operations; + +/* VFS automount flags management functions */ + +static inline void __managed_dentry_set_automount(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_set_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_automount(struct dentry *dentry) +{ + 
dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_clear_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_transit(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_set_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_transit(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_clear_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_managed(struct dentry *dentry) +{ + dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_set_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_managed(struct dentry *dentry) +{ + dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_clear_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_managed(dentry); + spin_unlock(&dentry->d_lock); +} /* Initializing function */ int autofs4_fill_super(struct super_block *, void *, int); -struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode); +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); +void autofs4_clean_ino(struct autofs_info *); /* Queue management functions */ @@ -226,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info 
*,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct path *path) -{ - int res = 0; - - while (d_mountpoint(path->dentry)) { - int followed = follow_down(path); - if (!followed) - break; - res = 1; - } - return res; -} - static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) { return new_encode_dev(sbi->sb->s_dev); @@ -254,17 +301,15 @@ static inline int simple_positive(struct dentry *dentry) return dentry->d_inode && !d_unhashed(dentry); } -static inline int __simple_empty(struct dentry *dentry) +static inline void __autofs4_add_expiring(struct dentry *dentry) { - struct dentry *child; - int ret = 0; - - list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) - if (simple_positive(child)) - goto out; - ret = 1; -out: - return ret; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + } + return; } static inline void autofs4_add_expiring(struct dentry *dentry) @@ -293,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) return; } -void autofs4_dentry_release(struct dentry *); extern void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index eff9a419469..1442da4860e 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); - if (follow_down(&path)) + if (follow_down_one(&path)) magic = path.mnt->mnt_sb->s_magic; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index a796c9417fb..f43100b9662 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry, if (ino == NULL) return 0; - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - 
return 0; - if (!do_now) { /* Too young to die */ if (!timeout || time_after(ino->last_used + timeout, now)) @@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) path_get(&path); - if (!follow_down(&path)) + if (!follow_down_one(&path)) goto done; if (is_autofs4_dentry(path.dentry)) { @@ -91,24 +87,64 @@ done: } /* - * Calculate next entry in top down tree traversal. - * From next_mnt in namespace.c - elegant. + * Calculate and dget next entry in top down tree traversal. */ -static struct dentry *next_dentry(struct dentry *p, struct dentry *root) +static struct dentry *get_next_positive_dentry(struct dentry *prev, + struct dentry *root) { - struct list_head *next = p->d_subdirs.next; + struct list_head *next; + struct dentry *p, *ret; + + if (prev == NULL) + return dget(root); + spin_lock(&autofs4_lock); +relock: + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_subdirs.next; if (next == &p->d_subdirs) { while (1) { - if (p == root) + struct dentry *parent; + + if (p == root) { + spin_unlock(&p->d_lock); + spin_unlock(&autofs4_lock); + dput(prev); return NULL; + } + + parent = p->d_parent; + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&p->d_lock); + cpu_relax(); + goto relock; + } + spin_unlock(&p->d_lock); next = p->d_u.d_child.next; - if (next != &p->d_parent->d_subdirs) + p = parent; + if (next != &parent->d_subdirs) break; - p = p->d_parent; } } - return list_entry(next, struct dentry, d_u.d_child); + ret = list_entry(next, struct dentry, d_u.d_child); + + spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(ret)) { + spin_unlock(&p->d_lock); + p = ret; + goto again; + } + dget_dlock(ret); + spin_unlock(&ret->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&autofs4_lock); + + dput(prev); + + return ret; } /* @@ -158,18 +194,11 @@ static int autofs4_tree_busy(struct vfsmount *mnt, if (!simple_positive(top)) return 1; - spin_lock(&dcache_lock); - 
for (p = top; p; p = next_dentry(p, top)) { - /* Negative dentry - give up */ - if (!simple_positive(p)) - continue; - + p = NULL; + while ((p = get_next_positive_dentry(p, top))) { DPRINTK("dentry %p %.*s", p, (int) p->d_name.len, p->d_name.name); - p = dget(p); - spin_unlock(&dcache_lock); - /* * Is someone visiting anywhere in the subtree ? * If there's no mount we need to check the usage @@ -198,16 +227,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt, else ino_count++; - if (atomic_read(&p->d_count) > ino_count) { + if (p->d_count > ino_count) { top_ino->last_used = jiffies; dput(p); return 1; } } - dput(p); - spin_lock(&dcache_lock); } - spin_unlock(&dcache_lock); /* Timeout of a tree mount is ultimately determined by its top dentry */ if (!autofs4_can_expire(top, timeout, do_now)) @@ -226,32 +252,21 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, DPRINTK("parent %p %.*s", parent, (int)parent->d_name.len, parent->d_name.name); - spin_lock(&dcache_lock); - for (p = parent; p; p = next_dentry(p, parent)) { - /* Negative dentry - give up */ - if (!simple_positive(p)) - continue; - + p = NULL; + while ((p = get_next_positive_dentry(p, parent))) { DPRINTK("dentry %p %.*s", p, (int) p->d_name.len, p->d_name.name); - p = dget(p); - spin_unlock(&dcache_lock); - if (d_mountpoint(p)) { /* Can we umount this guy */ if (autofs4_mount_busy(mnt, p)) - goto cont; + continue; /* Can we expire this guy */ if (autofs4_can_expire(p, timeout, do_now)) return p; } -cont: - dput(p); - spin_lock(&dcache_lock); } - spin_unlock(&dcache_lock); return NULL; } @@ -264,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, unsigned long timeout; struct dentry *root = dget(sb->s_root); int do_now = how & AUTOFS_EXP_IMMEDIATE; + struct autofs_info *ino; if (!root) return NULL; @@ -272,17 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, timeout = sbi->exp_timeout; spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); 
+ /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + return NULL; + } + managed_dentry_set_transit(root); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { struct autofs_info *ino = autofs4_dentry_ino(root); - if (d_mountpoint(root)) { - ino->flags |= AUTOFS_INF_MOUNTPOINT; - root->d_mounted--; - } ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } + managed_dentry_clear_transit(root); spin_unlock(&sbi->fs_lock); dput(root); @@ -302,8 +322,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, { unsigned long timeout; struct dentry *root = sb->s_root; + struct dentry *dentry; struct dentry *expired = NULL; - struct list_head *next; int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; struct autofs_info *ino; @@ -315,25 +335,14 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - spin_lock(&dcache_lock); - next = root->d_subdirs.next; - - /* On exit from the loop expire is set to a dgot dentry - * to expire or it's NULL */ - while ( next != &root->d_subdirs ) { - struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); - - /* Negative dentry - give up */ - if (!simple_positive(dentry)) { - next = next->next; - continue; - } - - dentry = dget(dentry); - spin_unlock(&dcache_lock); - + dentry = NULL; + while ((dentry = get_next_positive_dentry(dentry, root))) { spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + goto cont; + managed_dentry_set_transit(dentry); /* * Case 1: (i) indirect mount or top level pseudo direct mount @@ -347,7 +356,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, /* Path walk currently on this dentry? 
*/ ino_count = atomic_read(&ino->count) + 2; - if (atomic_read(&dentry->d_count) > ino_count) + if (dentry->d_count > ino_count) goto next; /* Can we umount this guy */ @@ -369,7 +378,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, if (!exp_leaves) { /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 1; - if (atomic_read(&dentry->d_count) > ino_count) + if (dentry->d_count > ino_count) goto next; if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { @@ -383,7 +392,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, } else { /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 1; - if (atomic_read(&dentry->d_count) > ino_count) + if (dentry->d_count > ino_count) goto next; expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); @@ -393,12 +402,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, } } next: + managed_dentry_clear_transit(dentry); +cont: spin_unlock(&sbi->fs_lock); - dput(dentry); - spin_lock(&dcache_lock); - next = next->next; } - spin_unlock(&dcache_lock); return NULL; found: @@ -408,9 +415,13 @@ found: ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&expired->d_parent->d_lock); + spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); - spin_unlock(&dcache_lock); + spin_unlock(&expired->d_lock); + spin_unlock(&expired->d_parent->d_lock); + spin_unlock(&autofs4_lock); return expired; } @@ -473,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); ino->flags &= ~AUTOFS_INF_EXPIRING; + if (!d_unhashed(dentry)) + managed_dentry_clear_transit(dentry); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -498,11 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, 
struct vfsmount *mnt, ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_MOUNTPOINT) { - sb->s_root->d_mounted++; - ino->flags &= ~AUTOFS_INF_MOUNTPOINT; - } ino->flags &= ~AUTOFS_INF_EXPIRING; + spin_lock(&dentry->d_lock); + if (ret) + __managed_dentry_clear_transit(dentry); + else { + if ((IS_ROOT(dentry) || + (autofs_type_indirect(sbi->type) && + IS_ROOT(dentry->d_parent))) && + !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + __managed_dentry_set_automount(dentry); + } + spin_unlock(&dentry->d_lock); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index ac87e49fa70..180fa2425e4 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -22,77 +22,27 @@ #include "autofs_i.h" #include <linux/module.h> -static void ino_lnkfree(struct autofs_info *ino) +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) { - if (ino->u.symlink) { - kfree(ino->u.symlink); - ino->u.symlink = NULL; - } -} - -struct autofs_info *autofs4_init_ino(struct autofs_info *ino, - struct autofs_sb_info *sbi, mode_t mode) -{ - int reinit = 1; - - if (ino == NULL) { - reinit = 0; - ino = kmalloc(sizeof(*ino), GFP_KERNEL); - } - - if (ino == NULL) - return NULL; - - if (!reinit) { - ino->flags = 0; - ino->inode = NULL; - ino->dentry = NULL; - ino->size = 0; + struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); + if (ino) { INIT_LIST_HEAD(&ino->active); - ino->active_count = 0; INIT_LIST_HEAD(&ino->expiring); - atomic_set(&ino->count, 0); + ino->last_used = jiffies; + ino->sbi = sbi; } + return ino; +} +void autofs4_clean_ino(struct autofs_info *ino) +{ ino->uid = 0; ino->gid = 0; - ino->mode = mode; ino->last_used = jiffies; - - ino->sbi = sbi; - - if (reinit && ino->free) - (ino->free)(ino); - - memset(&ino->u, 0, sizeof(ino->u)); - - ino->free = NULL; - - if (S_ISLNK(mode)) - ino->free = ino_lnkfree; - - return ino; } void 
autofs4_free_ino(struct autofs_info *ino) { - struct autofs_info *p_ino; - - if (ino->dentry) { - ino->dentry->d_fsdata = NULL; - if (ino->dentry->d_inode) { - struct dentry *parent = ino->dentry->d_parent; - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(parent); - if (p_ino && parent != ino->dentry) - atomic_dec(&p_ino->count); - } - dput(ino->dentry); - } - ino->dentry = NULL; - } - if (ino->free) - (ino->free)(ino); kfree(ino); } @@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } +static void autofs4_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + static const struct super_operations autofs4_sops = { .statfs = simple_statfs, .show_options = autofs4_show_options, + .evict_inode = autofs4_evict_inode, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, @@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, return (*pipefd < 0); } -static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi) -{ - struct autofs_info *ino; - - ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755); - if (!ino) - return NULL; - - return ino; -} - -static const struct dentry_operations autofs4_sb_dentry_operations = { - .d_release = autofs4_dentry_release, -}; - int autofs4_fill_super(struct super_block *s, void *data, int silent) { struct inode * root_inode; @@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; s->s_op = &autofs4_sops; + s->s_d_op = &autofs4_dentry_operations; s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. 
*/ - ino = autofs4_mkroot(sbi); + ino = autofs4_new_ino(sbi); if (!ino) goto fail_free; - root_inode = autofs4_get_inode(s, ino); + root_inode = autofs4_get_inode(s, S_IFDIR | 0755); if (!root_inode) goto fail_ino; @@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_iput; pipe = NULL; - root->d_op = &autofs4_sb_dentry_operations; root->d_fsdata = ino; /* Can this call block? */ @@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_dput; } + if (autofs_type_trigger(sbi->type)) + __managed_dentry_set_managed(root); + root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = autofs_type_trigger(sbi->type) ? - &autofs4_direct_root_inode_operations : - &autofs4_indirect_root_inode_operations; + root_inode->i_op = &autofs4_dir_inode_operations; /* Couldn't this be tested earlier? */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || @@ -383,16 +326,14 @@ fail_unlock: return -EINVAL; } -struct inode *autofs4_get_inode(struct super_block *sb, - struct autofs_info *inf) +struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) { struct inode *inode = new_inode(sb); if (inode == NULL) return NULL; - inf->inode = inode; - inode->i_mode = inf->mode; + inode->i_mode = mode; if (sb->s_root) { inode->i_uid = sb->s_root->d_inode->i_uid; inode->i_gid = sb->s_root->d_inode->i_gid; @@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_ino = get_next_ino(); - if (S_ISDIR(inf->mode)) { + if (S_ISDIR(mode)) { inode->i_nlink = 2; inode->i_op = &autofs4_dir_inode_operations; inode->i_fop = &autofs4_dir_operations; - } else if (S_ISLNK(inf->mode)) { - inode->i_size = inf->size; + } else if (S_ISLNK(mode)) { inode->i_op = &autofs4_symlink_inode_operations; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d34896cfb19..014e7aba3b0 100644 --- a/fs/autofs4/root.c +++ 
b/fs/autofs4/root.c @@ -23,6 +23,8 @@ #include "autofs_i.h" +DEFINE_SPINLOCK(autofs4_lock); + static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); @@ -33,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); #endif static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); -static void *autofs4_follow_link(struct dentry *, struct nameidata *); - -#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) -#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE) +static struct vfsmount *autofs4_d_automount(struct path *); +static int autofs4_d_manage(struct dentry *, bool, bool); +static void autofs4_dentry_release(struct dentry *); const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, @@ -58,7 +59,7 @@ const struct file_operations autofs4_dir_operations = { .llseek = dcache_dir_lseek, }; -const struct inode_operations autofs4_indirect_root_inode_operations = { +const struct inode_operations autofs4_dir_inode_operations = { .lookup = autofs4_lookup, .unlink = autofs4_dir_unlink, .symlink = autofs4_dir_symlink, @@ -66,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = { .rmdir = autofs4_dir_rmdir, }; -const struct inode_operations autofs4_direct_root_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, - .follow_link = autofs4_follow_link, -}; - -const struct inode_operations autofs4_dir_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .symlink = autofs4_dir_symlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, +const struct dentry_operations autofs4_dentry_operations = { + .d_automount = 
autofs4_d_automount, + .d_manage = autofs4_d_manage, + .d_release = autofs4_dentry_release, }; static void autofs4_add_active(struct dentry *dentry) @@ -114,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry) return; } -static unsigned int autofs4_need_mount(unsigned int flags) -{ - unsigned int res = 0; - if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS)) - res = 1; - return res; -} - static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; @@ -142,275 +125,41 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * autofs file system so just let the libfs routines handle * it. */ - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&dentry->d_lock); if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&autofs4_lock); return -ENOENT; } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&autofs4_lock); out: return dcache_dir_open(inode, file); } -static int try_to_fill_dentry(struct dentry *dentry, int flags) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - int status; - - DPRINTK("dentry=%p %.*s ino=%p", - dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - - /* - * Wait for a pending mount, triggering one if there - * isn't one already - */ - if (dentry->d_inode == NULL) { - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_MOUNT); - - DPRINTK("mount done status=%d", status); - - /* Turn this into a real negative dentry? 
*/ - if (status == -ENOENT) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - return status; - } else if (status) { - /* Return a negative dentry, but leave it "pending" */ - return status; - } - /* Trigger mount for path component or follow link */ - } else if (ino->flags & AUTOFS_INF_PENDING || - autofs4_need_mount(flags)) { - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); - - spin_lock(&sbi->fs_lock); - ino->flags |= AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); - - DPRINTK("mount done status=%d", status); - - if (status) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - return status; - } - } - - /* Initialize expiry counter after successful mount */ - ino->last_used = jiffies; - - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - - return 0; -} - -/* For autofs direct mounts the follow link triggers the mount */ -static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - int oz_mode = autofs4_oz_mode(sbi); - unsigned int lookup_type; - int status; - - DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d", - dentry, dentry->d_name.len, dentry->d_name.name, oz_mode, - nd->flags); - /* - * For an expire of a covered direct or offset mount we need - * to break out of follow_down() at the autofs mount trigger - * (d_mounted--), so we can see the expiring flag, and manage - * the blocking and following here until the expire is completed. - */ - if (oz_mode) { - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { - spin_unlock(&sbi->fs_lock); - /* Follow down to our covering mount. 
*/ - if (!follow_down(&nd->path)) - goto done; - goto follow; - } - spin_unlock(&sbi->fs_lock); - goto done; - } - - /* If an expire request is pending everyone must wait. */ - autofs4_expire_wait(dentry); - - /* We trigger a mount for almost all flags */ - lookup_type = autofs4_need_mount(nd->flags); - spin_lock(&sbi->fs_lock); - spin_lock(&dcache_lock); - if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) { - spin_unlock(&dcache_lock); - spin_unlock(&sbi->fs_lock); - goto follow; - } - - /* - * If the dentry contains directories then it is an autofs - * multi-mount with no root mount offset. So don't try to - * mount it again. - */ - if (ino->flags & AUTOFS_INF_PENDING || - (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { - spin_unlock(&dcache_lock); - spin_unlock(&sbi->fs_lock); - - status = try_to_fill_dentry(dentry, nd->flags); - if (status) - goto out_error; - - goto follow; - } - spin_unlock(&dcache_lock); - spin_unlock(&sbi->fs_lock); -follow: - /* - * If there is no root mount it must be an autofs - * multi-mount with no root offset so we don't need - * to follow it. - */ - if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path)) { - status = -ENOENT; - goto out_error; - } - } - -done: - return NULL; - -out_error: - path_put(&nd->path); - return ERR_PTR(status); -} - -/* - * Revalidate is called on every cache lookup. Some of those - * cache lookups may actually happen while the dentry is not - * yet completely filled in, and revalidate has to delay such - * lookups.. - */ -static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct inode *dir = dentry->d_parent->d_inode; - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - int oz_mode = autofs4_oz_mode(sbi); - int flags = nd ? 
nd->flags : 0; - int status = 1; - - /* Pending dentry */ - spin_lock(&sbi->fs_lock); - if (autofs4_ispending(dentry)) { - /* The daemon never causes a mount to trigger */ - spin_unlock(&sbi->fs_lock); - - if (oz_mode) - return 1; - - /* - * If the directory has gone away due to an expire - * we have been called as ->d_revalidate() and so - * we need to return false and proceed to ->lookup(). - */ - if (autofs4_expire_wait(dentry) == -EAGAIN) - return 0; - - /* - * A zero status is success otherwise we have a - * negative error code. - */ - status = try_to_fill_dentry(dentry, flags); - if (status == 0) - return 1; - - return status; - } - spin_unlock(&sbi->fs_lock); - - /* Negative dentry.. invalidate if "old" */ - if (dentry->d_inode == NULL) - return 0; - - /* Check for a non-mountpoint directory with no contents */ - spin_lock(&dcache_lock); - if (S_ISDIR(dentry->d_inode->i_mode) && - !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - DPRINTK("dentry=%p %.*s, emptydir", - dentry, dentry->d_name.len, dentry->d_name.name); - spin_unlock(&dcache_lock); - - /* The daemon never causes a mount to trigger */ - if (oz_mode) - return 1; - - /* - * A zero status is success otherwise we have a - * negative error code. 
- */ - status = try_to_fill_dentry(dentry, flags); - if (status == 0) - return 1; - - return status; - } - spin_unlock(&dcache_lock); - - return 1; -} - -void autofs4_dentry_release(struct dentry *de) +static void autofs4_dentry_release(struct dentry *de) { - struct autofs_info *inf; + struct autofs_info *ino = autofs4_dentry_ino(de); + struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); DPRINTK("releasing %p", de); - inf = autofs4_dentry_ino(de); - de->d_fsdata = NULL; - - if (inf) { - struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); - - if (sbi) { - spin_lock(&sbi->lookup_lock); - if (!list_empty(&inf->active)) - list_del(&inf->active); - if (!list_empty(&inf->expiring)) - list_del(&inf->expiring); - spin_unlock(&sbi->lookup_lock); - } - - inf->dentry = NULL; - inf->inode = NULL; + if (!ino) + return; - autofs4_free_ino(inf); + if (sbi) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del(&ino->active); + if (!list_empty(&ino->expiring)) + list_del(&ino->expiring); + spin_unlock(&sbi->lookup_lock); } -} - -/* For dentries of directories in the root dir */ -static const struct dentry_operations autofs4_root_dentry_operations = { - .d_revalidate = autofs4_revalidate, - .d_release = autofs4_dentry_release, -}; -/* For other dentries */ -static const struct dentry_operations autofs4_dentry_operations = { - .d_revalidate = autofs4_revalidate, - .d_release = autofs4_dentry_release, -}; + autofs4_free_ino(ino); +} static struct dentry *autofs4_lookup_active(struct dentry *dentry) { @@ -422,7 +171,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->active_list; list_for_each(p, head) { @@ -436,7 +185,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? 
*/ - if (atomic_read(&active->d_count) == 0) + if (active->d_count == 0) goto next; qstr = &active->d_name; @@ -452,17 +201,17 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) goto next; if (d_unhashed(active)) { - dget(active); + dget_dlock(active); spin_unlock(&active->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return active; } next: spin_unlock(&active->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return NULL; } @@ -477,7 +226,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->expiring_list; list_for_each(p, head) { @@ -507,66 +256,261 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) goto next; if (d_unhashed(expiring)) { - dget(expiring); + dget_dlock(expiring); spin_unlock(&expiring->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return expiring; } next: spin_unlock(&expiring->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); + + return NULL; +} + +static int autofs4_mount_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + if (ino->flags & AUTOFS_INF_PENDING) { + DPRINTK("waiting for mount name=%.*s", + dentry->d_name.len, dentry->d_name.name); + status = autofs4_wait(sbi, dentry, NFY_MOUNT); + DPRINTK("mount wait done status=%d", status); + ino->last_used = jiffies; + return status; + } + return 0; +} + +static int do_expire_wait(struct dentry *dentry) +{ + struct dentry *expiring; + + expiring = autofs4_lookup_expiring(dentry); + if (!expiring) + return autofs4_expire_wait(dentry); + else { + /* + * If we 
are racing with expire the request might not + * be quite complete, but the directory has been removed + * so it must have been successful, just wait for it. + */ + autofs4_expire_wait(expiring); + autofs4_del_expiring(expiring); + dput(expiring); + } + return 0; +} + +static struct dentry *autofs4_mountpoint_changed(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + /* + * If this is an indirect mount the dentry could have gone away + * as a result of an expire and a new one created. + */ + if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + struct dentry *new = d_lookup(parent, &dentry->d_name); + if (!new) + return NULL; + dput(path->dentry); + path->dentry = new; + } + return path->dentry; +} + +static struct vfsmount *autofs4_d_automount(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* + * Someone may have manually umounted this or it was a submount + * that has gone away. + */ + spin_lock(&dentry->d_lock); + if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { + if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) && + (dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + __managed_dentry_set_transit(path->dentry); + } + spin_unlock(&dentry->d_lock); + + /* The daemon never triggers a mount. */ + if (autofs4_oz_mode(sbi)) + return NULL; + + /* + * If an expire request is pending everyone must wait. + * If the expire fails we're still mounted so continue + * the follow and return. A return of -EAGAIN (which only + * happens with indirect mounts) means the expire completed + * and the directory was removed, so just go ahead and try + * the mount. 
+ */ + status = do_expire_wait(dentry); + if (status && status != -EAGAIN) + return NULL; + + /* Callback to the daemon to perform the mount or wait */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + goto done; + } + + /* + * If the dentry is a symlink it's equivalent to a directory + * having d_mountpoint() true, so there's no need to call back + * to the daemon. + */ + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) + goto done; + if (!d_mountpoint(dentry)) { + /* + * It's possible that user space hasn't removed directories + * after umounting a rootless multi-mount, although it + * should. For v5 have_submounts() is sufficient to handle + * this because the leaves of the directory tree under the + * mount never trigger mounts themselves (they have an autofs + * trigger mount mounted on them). But v4 pseudo direct mounts + * do need the leaves to trigger mounts. In this case we + * have no choice but to use the list_empty() check and + * require user space behave. + */ + if (sbi->version > 4) { + if (have_submounts(dentry)) + goto done; + } else { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + goto done; + } + spin_unlock(&dentry->d_lock); + } + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + } +done: + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path updated + * so turn this into a normal dentry so we don't continually + * call ->d_automount() and ->d_manage(). 
+ */ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_transit(dentry); + /* + * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and + * symlinks as in all other cases the dentry will be covered by + * an actual mount so ->d_automount() won't be called during + * the follow. + */ + if ((!d_mountpoint(dentry) && + !list_empty(&dentry->d_subdirs)) || + (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); + } + spin_unlock(&sbi->fs_lock); + + /* Mount succeeded, check if we ended up with a new dentry */ + dentry = autofs4_mountpoint_changed(path); + if (!dentry) + return ERR_PTR(-ENOENT); return NULL; } +int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* The daemon never waits. */ + if (autofs4_oz_mode(sbi) || mounting_here) { + if (!d_mountpoint(dentry)) + return -EISDIR; + return 0; + } + + /* We need to sleep, so we need pathwalk to be in ref-mode */ + if (rcu_walk) + return -ECHILD; + + /* Wait for pending expires */ + do_expire_wait(dentry); + + /* + * This dentry may be under construction so wait on mount + * completion. 
+ */ + return autofs4_mount_wait(dentry); +} + /* Lookups in the root directory */ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; struct autofs_info *ino; - struct dentry *expiring, *active; - int oz_mode; + struct dentry *active; - DPRINTK("name = %.*s", - dentry->d_name.len, dentry->d_name.name); + DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); sbi = autofs4_sbi(dir->i_sb); - oz_mode = autofs4_oz_mode(sbi); DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", - current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs4_oz_mode(sbi)); active = autofs4_lookup_active(dentry); if (active) { - dentry = active; - ino = autofs4_dentry_ino(dentry); + return active; } else { /* - * Mark the dentry incomplete but don't hash it. We do this - * to serialize our inode creation operations (symlink and - * mkdir) which prevents deadlock during the callback to - * the daemon. Subsequent user space lookups for the same - * dentry are placed on the wait queue while the daemon - * itself is allowed passage unresticted so the create - * operation itself can then hash the dentry. Finally, - * we check for the hashed dentry and return the newly - * hashed dentry. + * A dentry that is not within the root can never trigger a + * mount operation, unless the directory already exists, so we + * can return fail immediately. The daemon however does need + * to create directories within the file system. */ - dentry->d_op = &autofs4_root_dentry_operations; + if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + return ERR_PTR(-ENOENT); - /* - * And we need to ensure that the same dentry is used for - * all following lookup calls until it is hashed so that - * the dentry flags are persistent throughout the request. 
- */ - ino = autofs4_init_ino(NULL, sbi, 0555); + /* Mark entries in the root as mount triggers */ + if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) + __managed_dentry_set_managed(dentry); + + ino = autofs4_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); @@ -577,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s d_instantiate(dentry, NULL); } - - if (!oz_mode) { - mutex_unlock(&dir->i_mutex); - expiring = autofs4_lookup_expiring(dentry); - if (expiring) { - /* - * If we are racing with expire the request might not - * be quite complete but the directory has been removed - * so it must have been successful, so just wait for it. - */ - autofs4_expire_wait(expiring); - autofs4_del_expiring(expiring); - dput(expiring); - } - - spin_lock(&sbi->fs_lock); - ino->flags |= AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - if (dentry->d_op && dentry->d_op->d_revalidate) - (dentry->d_op->d_revalidate)(dentry, nd); - mutex_lock(&dir->i_mutex); - } - - /* - * If we are still pending, check if we had to handle - * a signal. If so we can force a restart.. - */ - if (ino->flags & AUTOFS_INF_PENDING) { - /* See if we were interrupted */ - if (signal_pending(current)) { - sigset_t *sigset = &current->pending.signal; - if (sigismember (sigset, SIGKILL) || - sigismember (sigset, SIGQUIT) || - sigismember (sigset, SIGINT)) { - if (active) - dput(active); - return ERR_PTR(-ERESTARTNOINTR); - } - } - if (!oz_mode) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - } - } - - /* - * If this dentry is unhashed, then we shouldn't honour this - * lookup. Returning ENOENT here doesn't do the right thing - * for all system calls, but it should be OK for the operations - * we permit from an autofs. - */ - if (!oz_mode && d_unhashed(dentry)) { - /* - * A user space application can (and has done in the past) - * remove and re-create this directory during the callback. 
- * This can leave us with an unhashed dentry, but a - * successful mount! So we need to perform another - * cached lookup in case the dentry now exists. - */ - struct dentry *parent = dentry->d_parent; - struct dentry *new = d_lookup(parent, &dentry->d_name); - if (new != NULL) - dentry = new; - else - dentry = ERR_PTR(-ENOENT); - - if (active) - dput(active); - - return dentry; - } - - if (active) - return active; - return NULL; } @@ -664,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir, struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; + size_t size = strlen(symname); char *cp; DPRINTK("%s <- %.*s", symname, @@ -672,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir, if (!autofs4_oz_mode(sbi)) return -EACCES; - ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); - if (!ino) - return -ENOMEM; + BUG_ON(!ino); + + autofs4_clean_ino(ino); autofs4_del_active(dentry); - ino->size = strlen(symname); - cp = kmalloc(ino->size + 1, GFP_KERNEL); - if (!cp) { - if (!dentry->d_fsdata) - kfree(ino); + cp = kmalloc(size + 1, GFP_KERNEL); + if (!cp) return -ENOMEM; - } strcpy(cp, symname); - inode = autofs4_get_inode(dir->i_sb, ino); + inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); if (!dentry->d_fsdata) kfree(ino); return -ENOMEM; } + inode->i_private = cp; + inode->i_size = size; d_add(dentry, inode); - if (dir == dir->i_sb->s_root->d_inode) - dentry->d_op = &autofs4_root_dentry_operations; - else - dentry->d_op = &autofs4_dentry_operations; - - dentry->d_fsdata = ino; - ino->dentry = dget(dentry); + dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_inc(&p_ino->count); - ino->inode = inode; - ino->u.symlink = cp; dir->i_mtime = CURRENT_TIME; return 0; @@ -753,16 +612,68 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; - 
spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); autofs4_add_expiring(dentry); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return 0; } +/* + * Version 4 of autofs provides a pseudo direct mount implementation + * that relies on directories at the leaves of a directory tree under + * an indirect mount to trigger mounts. To allow for this we need to + * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves + * of the directory tree. There is no need to clear the automount flag + * following a mount or restore it after an expire because these mounts + * are always covered. However, it is necessary to ensure that these + * flags are clear on non-empty directories to avoid unnecessary calls + * during path walks. + */ +static void autofs_set_leaf_automount_flags(struct dentry *dentry) +{ + struct dentry *parent; + + /* root and dentrys in the root are already handled */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_set_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + managed_dentry_clear_managed(parent); + return; +} + +static void autofs_clear_leaf_automount_flags(struct dentry *dentry) +{ + struct list_head *d_child; + struct dentry *parent; + + /* flags for dentrys in the root are handled elsewhere */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_clear_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + d_child = &dentry->d_u.d_child; + /* Set parent managed if it's becoming empty */ + if (d_child->next == &parent->d_subdirs && + d_child->prev == &parent->d_subdirs) + managed_dentry_set_managed(parent); + return; +} + static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) { struct autofs_sb_info *sbi = 
autofs4_sbi(dir->i_sb); @@ -775,16 +686,23 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi)) return -EACCES; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&sbi->lookup_lock); + spin_lock(&dentry->d_lock); if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&autofs4_lock); return -ENOTEMPTY; } - autofs4_add_expiring(dentry); - spin_lock(&dentry->d_lock); + __autofs4_add_expiring(dentry); + spin_unlock(&sbi->lookup_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); + + if (sbi->version < 5) + autofs_clear_leaf_automount_flags(dentry); if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); @@ -814,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) DPRINTK("dentry %p, creating %.*s", dentry, dentry->d_name.len, dentry->d_name.name); - ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); - if (!ino) - return -ENOMEM; + BUG_ON(!ino); + + autofs4_clean_ino(ino); autofs4_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, ino); - if (!inode) { - if (!dentry->d_fsdata) - kfree(ino); + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + if (!inode) return -ENOMEM; - } d_add(dentry, inode); - if (dir == dir->i_sb->s_root->d_inode) - dentry->d_op = &autofs4_root_dentry_operations; - else - dentry->d_op = &autofs4_dentry_operations; + if (sbi->version < 5) + autofs_set_leaf_automount_flags(dentry); - dentry->d_fsdata = ino; - ino->dentry = dget(dentry); + dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_inc(&p_ino->count); - ino->inode = inode; inc_nlink(dir); dir->i_mtime = CURRENT_TIME; @@ -921,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) int 
is_autofs4_dentry(struct dentry *dentry) { return dentry && dentry->d_inode && - (dentry->d_op == &autofs4_root_dentry_operations || - dentry->d_op == &autofs4_dentry_operations) && + dentry->d_op == &autofs4_dentry_operations && dentry->d_fsdata != NULL; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index b4ea82934d2..f27c094a191 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -14,8 +14,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); - nd_set_link(nd, (char *)ino->u.symlink); + nd_set_link(nd, dentry->d_inode->i_private); return NULL; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 2341375386f..56010056b2e 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -186,16 +186,26 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, { struct dentry *root = sbi->sb->s_root; struct dentry *tmp; - char *buf = *name; + char *buf; char *p; - int len = 0; + int len; + unsigned seq; - spin_lock(&dcache_lock); +rename_retry: + buf = *name; + len = 0; + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + spin_lock(&autofs4_lock); for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) len += tmp->d_name.len + 1; if (!len || --len > NAME_MAX) { - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return 0; } @@ -208,7 +218,10 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, p -= tmp->d_name.len; strncpy(p, tmp->d_name.name, tmp->d_name.len); } - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return len; } @@ -296,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait, * completed while we waited on the mutex ... 
*/ if (notify == NFY_MOUNT) { + struct dentry *new = NULL; + int valid = 1; + /* * If the dentry was successfully mounted while we slept * on the wait queue mutex we can return success. If it @@ -303,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait, * a multi-mount with no mount at it's base) we can * continue on and create a new request. */ + if (!IS_ROOT(dentry)) { + if (dentry->d_inode && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + new = d_lookup(parent, &dentry->d_name); + if (new) + dentry = new; + } + } if (have_submounts(dentry)) - return 0; + valid = 0; + + if (new) + dput(new); + return valid; } return 1; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index f024d8aadde..9ad2369d9e3 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct inode *inode, int mask) +static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + return -EIO; } diff --git a/fs/befs/endian.h b/fs/befs/endian.h index 6cb84d896d0..27223878ba9 100644 --- a/fs/befs/endian.h +++ b/fs/befs/endian.h @@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n) } static inline befs_data_stream -fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n) +fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n) { befs_data_stream data; int i; for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) - data.direct[i] = fsrun_to_cpu(sb, n.direct[i]); + data.direct[i] = fsrun_to_cpu(sb, n->direct[i]); - data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range); - data.indirect = fsrun_to_cpu(sb, n.indirect); - data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range); - data.double_indirect = fsrun_to_cpu(sb, n.double_indirect); + data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range); + 
data.indirect = fsrun_to_cpu(sb, n->indirect); + data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range); + data.double_indirect = fsrun_to_cpu(sb, n->double_indirect); data.max_double_indirect_range = fs64_to_cpu(sb, - n. + n-> max_double_indirect_range); - data.size = fs64_to_cpu(sb, n.size); + data.size = fs64_to_cpu(sb, n->size); return data; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index aa4e7c7ae3c..b1d0c794747 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb) return &bi->vfs_inode; } -static void -befs_destroy_inode(struct inode *inode) +static void befs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } +static void befs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, befs_i_callback); +} + static void init_once(void *foo) { struct befs_inode_info *bi = (struct befs_inode_info *) foo; @@ -384,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) int num_blks; befs_ino->i_data.ds = - fsds_to_cpu(sb, raw_inode->data.datastream); + fsds_to_cpu(sb, &raw_inode->data.datastream); num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); inode->i_blocks = diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 76db6d7d49b..a8e37f81d09 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -248,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb) return &bi->vfs_inode; } -static void bfs_destroy_inode(struct inode *inode) +static void bfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } +static void bfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bfs_i_callback); +} + static void init_once(void *foo) { struct 
bfs_inode_info *bi = foo; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6884e198e0c..d5b640ba6cb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) static struct linux_binfmt elf_format = { - .module = THIS_MODULE, - .load_binary = load_elf_binary, - .load_shlib = load_elf_library, - .core_dump = elf_core_dump, - .min_coredump = ELF_EXEC_PAGESIZE, - .hasvdso = 1 + .module = THIS_MODULE, + .load_binary = load_elf_binary, + .load_shlib = load_elf_library, + .core_dump = elf_core_dump, + .min_coredump = ELF_EXEC_PAGESIZE, }; #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) @@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return 0; } -#ifndef elf_map - static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -#endif /* !elf_map */ - static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; @@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, goto out; retval = kernel_read(interpreter, interp_elf_ex->e_phoff, - (char *)elf_phdata,size); + (char *)elf_phdata, size); error = -EIO; if (retval != size) { if (retval < 0) @@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out; if (!elf_check_arch(&loc->elf_ex)) goto out; - if (!bprm->file->f_op||!bprm->file->f_op->mmap) + if (!bprm->file->f_op || !bprm->file->f_op->mmap) goto out; /* Now read in all of the header information */ @@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. 
Map anonymous pages, if needed, and clear the area. */ - retval = set_brk (elf_bss + load_bias, - elf_brk + load_bias); + retval = set_brk(elf_bss + load_bias, + elf_brk + load_bias); if (retval) { send_sig(SIGKILL, current, 0); goto out_free_dentry; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 4d0ff5ee27b..e49cce234c6 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -782,7 +782,12 @@ void __init bio_integrity_init(void) { unsigned int i; - kintegrityd_wq = create_workqueue("kintegrityd"); + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 4230252fd68..333a7bb4cb9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -409,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void bdev_destroy_inode(struct inode *inode) +static void bdev_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bdev_cachep, bdi); } +static void bdev_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bdev_i_callback); +} + static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; @@ -426,7 +433,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_list); + INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. 
*/ @@ -466,7 +473,7 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); } static struct file_system_type bd_type = { @@ -662,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, else if (bdev->bd_contains == bdev) return true; /* is a whole device which isn't held */ - else if (whole->bd_holder == bd_claim) + else if (whole->bd_holder == bd_may_claim) return true; /* is a partition of a device that is being partitioned */ else if (whole->bd_holder != NULL) return false; /* is a partition of a held device */ @@ -774,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, } } -/* releases bdev_lock */ -static void __bd_abort_claiming(struct block_device *whole, void *holder) -{ - BUG_ON(whole->bd_claiming != holder); - whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); - - spin_unlock(&bdev_lock); - bdput(whole); -} - -/** - * bd_abort_claiming - abort claiming a block device - * @whole: whole block device returned by bd_start_claiming() - * @holder: holder trying to claim @bdev - * - * Abort a claiming block started by bd_start_claiming(). Note that - * @whole is not the block device to be claimed but the whole device - * returned by bd_start_claiming(). - * - * CONTEXT: - * Grabs and releases bdev_lock. - */ -static void bd_abort_claiming(struct block_device *whole, void *holder) -{ - spin_lock(&bdev_lock); - __bd_abort_claiming(whole, holder); /* releases bdev_lock */ -} - -/* increment holders when we have a legitimate claim. 
requires bdev_lock */ -static void __bd_claim(struct block_device *bdev, struct block_device *whole, - void *holder) -{ - /* note that for a whole device bd_holders - * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; -} - -/** - * bd_finish_claiming - finish claiming a block device - * @bdev: block device of interest (passed to bd_start_claiming()) - * @whole: whole block device returned by bd_start_claiming() - * @holder: holder trying to claim @bdev - * - * Finish a claiming block started by bd_start_claiming(). - * - * CONTEXT: - * Grabs and releases bdev_lock. - */ -static void bd_finish_claiming(struct block_device *bdev, - struct block_device *whole, void *holder) -{ - spin_lock(&bdev_lock); - BUG_ON(!bd_may_claim(bdev, whole, holder)); - __bd_claim(bdev, whole, holder); - __bd_abort_claiming(whole, holder); /* not actually an abort */ -} +#ifdef CONFIG_SYSFS +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; -/** - * bd_claim - claim a block device - * @bdev: block device to claim - * @holder: holder trying to claim @bdev - * - * Try to claim @bdev which must have been opened successfully. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 if successful, -EBUSY if @bdev is already claimed. 
- */ -int bd_claim(struct block_device *bdev, void *holder) +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) { - struct block_device *whole = bdev->bd_contains; - int res; + struct bd_holder_disk *holder; - might_sleep(); - - spin_lock(&bdev_lock); - res = bd_prepare_to_claim(bdev, whole, holder); - if (res == 0) - __bd_claim(bdev, whole, holder); - spin_unlock(&bdev_lock); - - return res; -} -EXPORT_SYMBOL(bd_claim); - -void bd_release(struct block_device *bdev) -{ - spin_lock(&bdev_lock); - if (!--bdev->bd_contains->bd_holders) - bdev->bd_contains->bd_holder = NULL; - if (!--bdev->bd_holders) - bdev->bd_holder = NULL; - spin_unlock(&bdev_lock); + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; } -EXPORT_SYMBOL(bd_release); - -#ifdef CONFIG_SYSFS -/* - * Functions for bd_claim_by_kobject / bd_release_from_kobject - * - * If a kobject is passed to bd_claim_by_kobject() - * and the kobject has a parent directory, - * following symlinks are created: - * o from the kobject to the claimed bdev - * o from "holders" directory of the bdev to the parent of the kobject - * bd_release_from_kobject() removes these symlinks. - * - * Example: - * If /dev/dm-0 maps to /dev/sda, kobject corresponding to - * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - */ - static int add_symlink(struct kobject *from, struct kobject *to) { - if (!from || !to) - return 0; return sysfs_create_link(from, to, kobject_name(to)); } static void del_symlink(struct kobject *from, struct kobject *to) { - if (!from || !to) - return; sysfs_remove_link(from, kobject_name(to)); } -/* - * 'struct bd_holder' contains pointers to kobjects symlinked by - * bd_claim_by_kobject. - * It's connected to bd_holder_list which is protected by bdev->bd_sem. 
- */ -struct bd_holder { - struct list_head list; /* chain of holders of the bdev */ - int count; /* references from the holder */ - struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ - struct kobject *hdev; /* e.g. "/block/dm-0" */ - struct kobject *hdir; /* e.g. "/block/sda/holders" */ - struct kobject *sdev; /* e.g. "/block/sda" */ -}; - -/* - * Get references of related kobjects at once. - * Returns 1 on success. 0 on failure. - * - * Should call bd_holder_release_dirs() after successful use. - */ -static int bd_holder_grab_dirs(struct block_device *bdev, - struct bd_holder *bo) -{ - if (!bdev || !bo) - return 0; - - bo->sdir = kobject_get(bo->sdir); - if (!bo->sdir) - return 0; - - bo->hdev = kobject_get(bo->sdir->parent); - if (!bo->hdev) - goto fail_put_sdir; - - bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); - if (!bo->sdev) - goto fail_put_hdev; - - bo->hdir = kobject_get(bdev->bd_part->holder_dir); - if (!bo->hdir) - goto fail_put_sdev; - - return 1; - -fail_put_sdev: - kobject_put(bo->sdev); -fail_put_hdev: - kobject_put(bo->hdev); -fail_put_sdir: - kobject_put(bo->sdir); - - return 0; -} - -/* Put references of related kobjects at once. 
*/ -static void bd_holder_release_dirs(struct bd_holder *bo) -{ - kobject_put(bo->hdir); - kobject_put(bo->sdev); - kobject_put(bo->hdev); - kobject_put(bo->sdir); -} - -static struct bd_holder *alloc_bd_holder(struct kobject *kobj) -{ - struct bd_holder *bo; - - bo = kzalloc(sizeof(*bo), GFP_KERNEL); - if (!bo) - return NULL; - - bo->count = 1; - bo->sdir = kobj; - - return bo; -} - -static void free_bd_holder(struct bd_holder *bo) -{ - kfree(bo); -} - /** - * find_bd_holder - find matching struct bd_holder from the block device + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk * - * @bdev: struct block device to be searched - * @bo: target struct bd_holder + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. * - * Returns matching entry with @bo in @bdev->bd_holder_list. - * If found, increment the reference count and return the pointer. - * If not found, returns NULL. - */ -static struct bd_holder *find_bd_holder(struct block_device *bdev, - struct bd_holder *bo) -{ - struct bd_holder *tmp; - - list_for_each_entry(tmp, &bdev->bd_holder_list, list) - if (tmp->sdir == bo->sdir) { - tmp->count++; - return tmp; - } - - return NULL; -} - -/** - * add_bd_holder - create sysfs symlinks for bd_claim() relationship + * This function creates the following sysfs symlinks. 
+ * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 * - * @bdev: block device to be bd_claimed - * @bo: preallocated and initialized by alloc_bd_holder() + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. * - * Add @bo to @bdev->bd_holder_list, create symlinks. + * CONTEXT: + * Might sleep. * - * Returns 0 if symlinks are created. - * Returns -ve if something fails. + * RETURNS: + * 0 on success, -errno on failure. */ -static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { - int err; + struct bd_holder_disk *holder; + int ret = 0; - if (!bo) - return -EINVAL; + mutex_lock(&bdev->bd_mutex); - if (!bd_holder_grab_dirs(bdev, bo)) - return -EBUSY; + WARN_ON_ONCE(!bdev->bd_holder); - err = add_symlink(bo->sdir, bo->sdev); - if (err) - return err; + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + goto out_unlock; - err = add_symlink(bo->hdir, bo->hdev); - if (err) { - del_symlink(bo->sdir, bo->sdev); - return err; + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; } - list_add_tail(&bo->list, &bdev->bd_holder_list); - return 0; -} - -/** - * del_bd_holder - delete sysfs symlinks for bd_claim() relationship - * - * @bdev: block device to be bd_claimed - * @kobj: holder's kobject - * - * If there is matching entry with @kobj in @bdev->bd_holder_list - * and no other bd_claim() from the same kobject, - * remove the struct 
bd_holder from the list, delete symlinks for it. - * - * Returns a pointer to the struct bd_holder when it's removed from the list - * and ready to be freed. - * Returns NULL if matching claim isn't found or there is other bd_claim() - * by the same kobject. - */ -static struct bd_holder *del_bd_holder(struct block_device *bdev, - struct kobject *kobj) -{ - struct bd_holder *bo; - - list_for_each_entry(bo, &bdev->bd_holder_list, list) { - if (bo->sdir == kobj) { - bo->count--; - BUG_ON(bo->count < 0); - if (!bo->count) { - list_del(&bo->list); - del_symlink(bo->sdir, bo->sdev); - del_symlink(bo->hdir, bo->hdev); - bd_holder_release_dirs(bo); - return bo; - } - break; - } + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; } - return NULL; -} + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; -/** - * bd_claim_by_kobject - bd_claim() with additional kobject signature - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @kobj: holder's kobject - * - * Do bd_claim() and if it succeeds, create sysfs symlinks between - * the bdev and the holder's kobject. - * Use bd_release_from_kobject() when relesing the claimed bdev. - * - * Returns 0 on success. (same as bd_claim()) - * Returns errno on failure. 
- */ -static int bd_claim_by_kobject(struct block_device *bdev, void *holder, - struct kobject *kobj) -{ - int err; - struct bd_holder *bo, *found; + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + goto out_free; - if (!kobj) - return -EINVAL; + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; - bo = alloc_bd_holder(kobj); - if (!bo) - return -ENOMEM; + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; - mutex_lock(&bdev->bd_mutex); - - err = bd_claim(bdev, holder); - if (err) - goto fail; - - found = find_bd_holder(bdev, bo); - if (found) - goto fail; - - err = add_bd_holder(bdev, bo); - if (err) - bd_release(bdev); - else - bo = NULL; -fail: +out_del: + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +out_free: + kfree(holder); +out_unlock: mutex_unlock(&bdev->bd_mutex); - free_bd_holder(bo); - return err; + return ret; } +EXPORT_SYMBOL_GPL(bd_link_disk_holder); /** - * bd_release_from_kobject - bd_release() with additional kobject signature + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the claimed slave bdev + * @disk: the holding disk * - * @bdev: block device to be released - * @kobj: holder's kobject + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. * - * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). + * CONTEXT: + * Might sleep. 
*/ -static void bd_release_from_kobject(struct block_device *bdev, - struct kobject *kobj) +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { - if (!kobj) - return; + struct bd_holder_disk *holder; mutex_lock(&bdev->bd_mutex); - bd_release(bdev); - free_bd_holder(del_bd_holder(bdev, kobj)); - mutex_unlock(&bdev->bd_mutex); -} -/** - * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @disk: holder's gendisk - * - * Call bd_claim_by_kobject() with getting @disk->slave_dir. - */ -int bd_claim_by_disk(struct block_device *bdev, void *holder, - struct gendisk *disk) -{ - return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); -} -EXPORT_SYMBOL_GPL(bd_claim_by_disk); + holder = bd_find_holder_disk(bdev, disk); -/** - * bd_release_from_disk - wrapper function for bd_release_from_kobject() - * - * @bdev: block device to be claimed - * @disk: holder's gendisk - * - * Call bd_release_from_kobject() and put @disk->slave_dir. - */ -void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) -{ - bd_release_from_kobject(bdev, disk->slave_dir); - kobject_put(disk->slave_dir); -} -EXPORT_SYMBOL_GPL(bd_release_from_disk); -#endif + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, + &disk_to_dev(disk)->kobj); + list_del_init(&holder->list); + kfree(holder); + } -/* - * Tries to open block device by device number. Use it ONLY if you - * really do not have anything better - i.e. when you are behind a - * truly sucky interface and all you are given is a device number. _Never_ - * to be used for internal purposes. If you ever need it - reconsider - * your API. 
- */ -struct block_device *open_by_devnum(dev_t dev, fmode_t mode) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, mode); - return err ? ERR_PTR(err) : bdev; + mutex_unlock(&bdev->bd_mutex); } - -EXPORT_SYMBOL(open_by_devnum); +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); +#endif /** * flush_disk - invalidates all buffer-cache entries on a disk @@ -1302,10 +1012,11 @@ int check_disk_change(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; const struct block_device_operations *bdops = disk->fops; + unsigned int events; - if (!bdops->media_changed) - return 0; - if (!bdops->media_changed(bdev->bd_disk)) + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; flush_disk(bdev); @@ -1468,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } -int blkdev_get(struct block_device *bdev, fmode_t mode) +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access. Specifying %FMODE_EXCL with %NULL + * @holder is invalid. Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged. On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - return __blkdev_get(bdev, mode, 0); + struct block_device *whole = NULL; + int res; + + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + + if ((mode & FMODE_EXCL) && holder) { + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + + res = __blkdev_get(bdev, mode, 0); + + /* __blkdev_get() may alter read only status, check it afterwards */ + if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { + __blkdev_put(bdev, mode, 0); + res = -EACCES; + } + + if (whole) { + /* finish claiming */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + if (!res) { + BUG_ON(!bd_may_claim(bdev, whole, holder)); + /* + * Note that for a whole device bd_holders + * will be incremented twice, and bd_holder + * will be set to bd_may_claim before being + * set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + } + + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + + /* + * Block event polling for write claims. Any write + * holder makes the write_holder state stick until all + * are released. This is good enough and tracking + * individual writeable reference is too fragile given + * the way @mode is used in blkdev_get/put(). + */ + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + bdev->bd_write_holder = true; + disk_block_events(bdev->bd_disk); + } + + mutex_unlock(&bdev->bd_mutex); + bdput(whole); + } + + return res; } EXPORT_SYMBOL(blkdev_get); +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path. 
@mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + int err; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev. @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number. _Never_ to be used for internal purposes. If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. 
+ */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int err; + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); + static int blkdev_open(struct inode * inode, struct file * filp) { - struct block_device *whole = NULL; struct block_device *bdev; - int res; /* * Preserve backwards compatibility and allow large file access @@ -1499,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; - if (filp->f_mode & FMODE_EXCL) { - whole = bd_start_claiming(bdev, filp); - if (IS_ERR(whole)) { - bdput(bdev); - return PTR_ERR(whole); - } - } - filp->f_mapping = bdev->bd_inode->i_mapping; - res = blkdev_get(bdev, filp->f_mode); - - if (whole) { - if (res == 0) - bd_finish_claiming(bdev, whole, filp); - else - bd_abort_claiming(whole, filp); - } - - return res; + return blkdev_get(bdev, filp->f_mode, filp); } static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) @@ -1532,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_part_count--; if (!--bdev->bd_openers) { + WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); } @@ -1562,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + if (mode & FMODE_EXCL) { + bool bdev_free; + + /* + * Release a claim on the device. The holder fields + * are protected with bdev_lock. bd_mutex is to + * synchronize disk_holder unlinking. 
+ */ + mutex_lock(&bdev->bd_mutex); + spin_lock(&bdev_lock); + + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + + /* bd_contains might point to self, check in a separate step */ + if ((bdev_free = !bdev->bd_holders)) + bdev->bd_holder = NULL; + if (!bdev->bd_contains->bd_holders) + bdev->bd_contains->bd_holder = NULL; + + spin_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and + * unblock evpoll if it was a write holder. + */ + if (bdev_free) { + if (bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } else + disk_check_events(bdev->bd_disk); + } + + mutex_unlock(&bdev->bd_mutex); + } else + disk_check_events(bdev->bd_disk); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); @@ -1569,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); - if (bdev->bd_holder == filp) - bd_release(bdev); + return blkdev_put(bdev, filp->f_mode); } @@ -1715,67 +1601,6 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -/** - * open_bdev_exclusive - open a block device by name and set it up for use - * - * @path: special file representing the block device - * @mode: FMODE_... combination to pass be used - * @holder: owner for exclusion - * - * Open the blockdevice described by the special file at @path, claim it - * for the @holder. 
- */ -struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) -{ - struct block_device *bdev, *whole; - int error; - - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; - - whole = bd_start_claiming(bdev, holder); - if (IS_ERR(whole)) { - bdput(bdev); - return whole; - } - - error = blkdev_get(bdev, mode); - if (error) - goto out_abort_claiming; - - error = -EACCES; - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto out_blkdev_put; - - bd_finish_claiming(bdev, whole, holder); - return bdev; - -out_blkdev_put: - blkdev_put(bdev, mode); -out_abort_claiming: - bd_abort_claiming(whole, holder); - return ERR_PTR(error); -} - -EXPORT_SYMBOL(open_bdev_exclusive); - -/** - * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() - * - * @bdev: blockdevice to close - * @mode: mode, must match that used to open. - * - * This is the counterpart to open_bdev_exclusive(). - */ -void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) -{ - bd_release(bdev); - blkdev_put(bdev, mode); -} - -EXPORT_SYMBOL(close_bdev_exclusive); - int __invalidate_device(struct block_device *bdev) { struct super_block *sb = get_super(bdev); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 7bb3c020e57..ecb9fd3be14 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,6 +4,8 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. 
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a35eb36b32f..31610ea73ae 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o \ + export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 2222d161c7b..15b5ca2a260 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) + if (IS_ERR(acl)) { + kfree(value); return acl; + } set_cached_acl(inode, type, acl); } kfree(value); @@ -185,18 +187,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, return ret; } -int btrfs_check_acl(struct inode *inode, int mask) +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl; int error = -EAGAIN; - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + error = -ECHILD; - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); + } else { + struct posix_acl *acl; + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } } return error; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6ad63f17eca..ccc991c542d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -157,7 +157,7 @@ struct btrfs_inode { 
/* * always compress this one file */ - unsigned force_compress:1; + unsigned force_compress:4; struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b50bc4bd5c5..f745287fbf2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -62,6 +62,9 @@ struct compressed_bio { /* number of bytes on disk */ unsigned long compressed_len; + /* the compression algorithm for this bio */ + int compress_type; + /* number of compressed pages in the array */ unsigned long nr_pages; @@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err) /* ok, we're the last bio for this extent, lets start * the decompression. */ - ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, - cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, - cb->compressed_len); + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); csum_failed: if (ret) cb->errors = 1; @@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / @@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_put(comp_bio); return 0; } + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +struct btrfs_compress_op *btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +int __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + 
INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } + return 0; +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. + */ +static struct list_head *find_workspace(int type) +{ + struct list_head *workspace; + int cpus = num_online_cpus(); + int idx = type - 1; + + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; +again: + spin_lock(workspace_lock); + if (!list_empty(idle_workspace)) { + workspace = idle_workspace->next; + list_del(workspace); + (*num_workspace)--; + spin_unlock(workspace_lock); + return workspace; + + } + if (atomic_read(alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(workspace_lock); + prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + schedule(); + finish_wait(workspace_wait, &wait); + goto again; + } + atomic_inc(alloc_workspace); + spin_unlock(workspace_lock); + + workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (IS_ERR(workspace)) { + atomic_dec(alloc_workspace); + wake_up(workspace_wait); + } + return workspace; +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static void free_workspace(int type, struct list_head *workspace) +{ + int idx = type - 1; + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; + + 
spin_lock(workspace_lock); + if (*num_workspace < num_online_cpus()) { + list_add_tail(workspace, idle_workspace); + (*num_workspace)++; + spin_unlock(workspace_lock); + goto wake; + } + spin_unlock(workspace_lock); + + btrfs_compress_op[idx]->free_workspace(workspace); + atomic_dec(alloc_workspace); +wake: + if (waitqueue_active(workspace_wait)) + wake_up(workspace_wait); +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct list_head *workspace; + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + while (!list_empty(&comp_idle_workspace[i])) { + workspace = comp_idle_workspace[i].next; + list_del(workspace); + btrfs_compress_op[i]->free_workspace(workspace); + atomic_dec(&comp_alloc_workspace[i]); + } + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller than len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. 
+ * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + /* no workspace: fail with the same errno the decompress paths use */ + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + start, len, pages, + nr_dest_pages, out_pages, + total_in, total_out, + max_out); + free_workspace(type, workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, + disk_start, + bvec, vcnt, srclen); + free_workspace(type, workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. 
+ * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + dest_page, start_byte, + srclen, destlen); + + free_workspace(type, workspace); + return ret; +} + +void __exit btrfs_exit_compress(void) +{ + free_workspaces(); +} + +/* + * Copy uncompressed data from working buffer to pages. + * + * buf_start is the byte offset of the start of our workspace buffer. + * + * total_out is the last byte of the buffer + */ +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset) +{ + unsigned long buf_offset; + unsigned long current_buf_start; + unsigned long start_byte; + unsigned long working_bytes = total_out - buf_start; + unsigned long bytes; + char *kaddr; + struct page *page_out = bvec[*page_index].bv_page; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. 
+ */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*page_index)++; + if (*page_index >= vcnt) + return 0; + + page_out = bvec[*page_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. 
bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 421f5b4aa71..51000174b9d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,24 +19,27 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen); -void btrfs_zlib_exit(void); +int btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset); + int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, @@ -44,4 +47,37 @@ int 
btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); + +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern struct btrfs_compress_op btrfs_zlib_compress; +extern struct btrfs_compress_op btrfs_lzo_compress; + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9ac17159925..b5baff0dccf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { + if (!p) + return; btrfs_release_path(NULL, p); kmem_cache_free(btrfs_path_cachep, p); } @@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + btrfs_tree_lock(left); 
btrfs_set_lock_blocking(left); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af52f6d7a4d..2c98b3af605 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/kobject.h> #include <asm/kmap_types.h> #include "extent_io.h" #include "extent_map.h" @@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FSID_SIZE 16 #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ + +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) @@ -398,13 +407,15 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* * A leaf is full of items. 
offset and size tell us where to find @@ -551,9 +562,11 @@ struct btrfs_timespec { } __attribute__ ((__packed__)); enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LAST = 2, + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, }; struct btrfs_inode_item { @@ -597,6 +610,8 @@ struct btrfs_dir_item { u8 type; } __attribute__ ((__packed__)); +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -895,7 +910,8 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; u64 open_ioctl_trans; - unsigned long mount_opt; + unsigned long mount_opt:20; + unsigned long compress_type:4; u64 max_inline; u64 alloc_start; struct btrfs_transaction *running_transaction; @@ -1050,6 +1066,9 @@ struct btrfs_fs_info { unsigned metadata_ratio; void *bdev_holder; + + /* filesystem state */ + u64 fs_state; }; /* @@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; +} + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); @@ -2188,6 +2213,12 @@ int 
btrfs_set_block_group_ro(struct btrfs_root *root, int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes); + /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2541,10 +2572,18 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno); + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ +} while (0) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -int btrfs_check_acl(struct inode *inode, int mask); +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags); #else #define btrfs_check_acl NULL #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 51d2e4de34e..b531c36455d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,20 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static int btrfs_destroy_ordered_operations(struct btrfs_root *root); +static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static int 
btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + WARN_ON(1); + goto out; + } ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); @@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + ret = -EIO; + goto out; + } found_start = btrfs_header_bytenr(eb); if (found_start != start) { @@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } btrfs_free_path(path); if (ret) { + kfree(root); if (ret > 0) ret = -ENOENT; return ERR_PTR(ret); @@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info, BTRFS_ROOT_TREE_OBJECTID); bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) + if (!bh) { + err = -EINVAL; goto fail_iput; + } memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); memcpy(&fs_info->super_for_commit, &fs_info->super_copy, @@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!btrfs_super_root(disk_super)) goto fail_iput; + /* check FS state, whether FS is broken. 
*/ + fs_info->fs_state |= btrfs_super_flags(disk_super); + + btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; @@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, } features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - btrfs_set_super_incompat_flags(disk_super, features); - } + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; @@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_set_opt(fs_info->mount_opt, SSD); } - if (btrfs_super_log_root(disk_super) != 0) { + /* do not make disk changes in broken FS */ + if (btrfs_super_log_root(disk_super) != 0 && + !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { @@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root) smp_mb(); btrfs_put_block_group_cache(fs_info); + + /* + * Here come 2 situations when btrfs is broken to flip readonly: + * + * 1. when btrfs flips readonly somewhere else before + * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, + * and btrfs will skip to write sb directly to keep + * ERROR state on disk. + * + * 2. when btrfs flips readonly just in btrfs_commit_super, + * and in such case, btrfs cannot write sb via btrfs_commit_super, + * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, + * btrfs will cleanup all FS resources first and write sb then. 
+ */ if (!(fs_info->sb->s_flags & MS_RDONLY)) { - ret = btrfs_commit_super(root); + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + ret = btrfs_error_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } @@ -2619,6 +2671,352 @@ out: return 0; } +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only) +{ + if (read_only) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + printk(KERN_WARNING "warning: mount fs with errors, " + "running btrfsck is recommended\n"); +} + +int btrfs_error_commit_super(struct btrfs_root *root) +{ + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(root); + + ret = write_ctree_super(NULL, root, 0); + + return ret; +} + +static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_operations, &splice); + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + list_del_init(&btrfs_inode->ordered_operations); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct list_head splice; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + 
spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + ordered = list_entry(splice.next, struct btrfs_ordered_extent, + root_extent_list); + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* the inode may be getting freed (in sys_unlink path). */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + if (inode) + iput(inode); + + atomic_set(&ordered->refs, 1); + btrfs_put_ordered_extent(ordered); + + spin_lock(&root->fs_info->ordered_extent_lock); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (delayed_refs->num_entries == 0) { + printk(KERN_INFO "delayed_refs has NO entry\n"); + return ret; + } + + node = rb_first(&delayed_refs->root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + atomic_set(&ref->refs, 1); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + mutex_lock(&head->mutex); + kfree(head->extent_op); + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + list_del_init(&head->cluster); + mutex_unlock(&head->mutex); + } + + spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref(ref); + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + + return ret; +} + +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +{ + struct 
btrfs_pending_snapshot *snapshot; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&t->pending_snapshots, &splice); + + while (!list_empty(&splice)) { + snapshot = list_entry(splice.next, + struct btrfs_pending_snapshot, + list); + + list_del_init(&snapshot->list); + + kfree(snapshot); + } + + return 0; +} + +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + + spin_lock(&root->fs_info->delalloc_lock); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->delalloc_lock); + + return 0; +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) +{ + int ret; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + u64 start = 0; + u64 end; + u64 offset; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + offset = page_offset(page); + + spin_lock(&dirty_pages->buffer_lock); + eb = radix_tree_lookup( + &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, + offset >> PAGE_CACHE_SHIFT); + spin_unlock(&dirty_pages->buffer_lock); + if (eb) { + ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags); + atomic_set(&eb->refs, 1); + } + if (PageWriteback(page)) + end_page_writeback(page); + + lock_page(page); + if (PageDirty(page)) { + 
clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&page->mapping->tree_lock); + } + + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + } + } + + return ret; +} + +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + unpin = pinned_extents; + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + /* opt_discard */ + ret = btrfs_error_discard_extent(root, start, end + 1 - start); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + return 0; +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + LIST_HEAD(list); + + WARN_ON(1); + + mutex_lock(&root->fs_info->trans_mutex); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + list_splice_init(&root->fs_info->trans_list, &list); + while (!list_empty(&list)) { + t = list_entry(list.next, struct btrfs_transaction, list); + if (!t) + break; + + btrfs_destroy_ordered_operations(root); + + btrfs_destroy_ordered_extents(root); + + btrfs_destroy_delayed_refs(t, root); + + btrfs_block_rsv_release(root, + &root->fs_info->trans_block_rsv, + t->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + t->in_commit = 1; + t->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + t->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + t->commit_done = 1; + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + 
mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + + btrfs_destroy_pending_snapshots(t); + + btrfs_destroy_delalloc_inodes(root); + + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + + btrfs_destroy_marked_extents(root, &t->dirty_pages, + EXTENT_DIRTY); + + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + t->use_count = 0; + list_del_init(&t->list); + memset(t, 0, sizeof(*t)); + kmem_cache_free(btrfs_transaction_cachep, t); + } + + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; +} + static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 88e825a0bf2..07b20dc2fd9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); +int btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 659f532d26a..9786963b07e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; struct btrfs_root *root; - struct dentry *dentry; struct inode *inode; struct btrfs_key key; int index; @@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, return ERR_PTR(-ESTALE); } - dentry = 
d_obtain_alias(inode); - if (!IS_ERR(dentry)) - dentry->d_op = &btrfs_dentry_operations; - return dentry; + return d_obtain_alias(inode); fail: srcu_read_unlock(&fs_info->subvol_srcu, index); return ERR_PTR(err); @@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; - struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -223,10 +218,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); - if (!IS_ERR(dentry)) - dentry->d_op = &btrfs_dentry_operations; - return dentry; + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); fail: btrfs_free_path(path); return ERR_PTR(ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 227e5815d83..b55269340ce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) return btrfs_reduce_alloc_profile(root, flags); } -static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; @@ -3161,8 +3161,12 @@ alloc: bytes + 2 * 1024 * 1024, alloc_target, 0); btrfs_end_transaction(trans, root); - if (ret < 0) - return ret; + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else + goto commit_trans; + } if (!data_sinfo) { btrfs_set_inode_space_info(root, inode); @@ -3173,6 +3177,7 @@ alloc: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ +commit_trans: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); @@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 
return 0; } - WARN_ON(1); - printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", - block_rsv->size, block_rsv->reserved, - block_rsv->freed[0], block_rsv->freed[1]); - return -ENOSPC; } @@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache) if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; sinfo->bytes_reserved += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } + spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); return ret; @@ -8012,6 +8013,62 @@ out: return ret; } +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. + */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +{ + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; + + list_for_each_entry(block_group, groups_list, list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; + + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; + + spin_unlock(&block_group->lock); + } + + return free_bytes; +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. 
+ */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + int i; + u64 free_bytes = 0; + + spin_lock(&sinfo->lock); + + for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + if (!list_empty(&sinfo->block_groups[i])) + free_bytes += __btrfs_get_ro_block_group_free_space( + &sinfo->block_groups[i]); + + spin_unlock(&sinfo->lock); + + return free_bytes; +} + int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 min_free = btrfs_block_group_used(&block_group->item); - u64 dev_offset, max_avail; + u64 dev_offset; /* * check to make sure we can actually find a chunk with enough @@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free) { ret = find_free_dev_extent(NULL, device, min_free, - &dev_offset, &max_avail); + &dev_offset, NULL); if (!ret) break; ret = -1; @@ -8584,3 +8641,14 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes); +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e86b9f3650..2e993cf1766 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { this_bio_flag = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } iosize = min(extent_map_end(em) - 
cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); @@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, #endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); + if (eb == NULL) + return NULL; eb->start = start; eb->len = len; spin_lock_init(&eb->lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4183c8178f0..7083cfafd06 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,8 +20,12 @@ #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) -/* flags for bio submission */ +/* + * flags for bio submission. The high bits indicate the compression + * type for this bio + */ #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 @@ -135,6 +139,17 @@ struct extent_buffer { wait_queue_head_t lock_wq; }; +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} + +static inline int extent_compress_type(unsigned long bio_flags) +{ + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; +} + struct extent_map_tree; static inline struct extent_state *extent_state_next(struct extent_state *state) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 23cb8da3ff6..b0e1fce1253 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/hardirq.h> +#include "ctree.h" #include "extent_map.h" @@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask) return em; em->in_tree = 0; em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; atomic_set(&em->refs, 1); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ab6d74b6e64..28b44dbd1e3 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,7 +26,8 @@ 
struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - int in_tree; + unsigned int in_tree:1; + unsigned int compress_type:4; }; struct extent_map_tree { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66836d85763..c800d58f301 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/statfs.h> @@ -224,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); BUG_ON(ret); free_extent_map(split); @@ -238,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->len = em->start + em->len - (start + len); split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; if (compressed) { split->block_len = em->block_len; @@ -890,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err) goto out; + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. 
+ */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + err = -EROFS; + goto out; + } + file_update_time(file); BTRFS_I(inode)->sequence++; @@ -1237,6 +1251,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent 
lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -1248,6 +1373,7 @@ const struct file_operations btrfs_file_operations = { .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 72f31ecb5c9..160b55b3e13 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; size_t datasize; unsigned long offset; - int use_compress = 0; + int compress_type = BTRFS_COMPRESS_NONE; if (compressed_size && compressed_pages) { - use_compress = 1; + compress_type = root->fs_info->compress_type; cur_size = compressed_size; } @@ -159,7 +159,7 @@ static 
noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); - if (use_compress) { + if (compress_type != BTRFS_COMPRESS_NONE) { struct page *cpage; int i = 0; while (compressed_size > 0) { @@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, - BTRFS_COMPRESS_ZLIB); + compress_type); } else { page = find_get_page(inode->i_mapping, start >> PAGE_CACHE_SHIFT); @@ -263,6 +263,7 @@ struct async_extent { u64 compressed_size; struct page **pages; unsigned long nr_pages; + int compress_type; struct list_head list; }; @@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, - unsigned long nr_pages) + unsigned long nr_pages, + int compress_type) { struct async_extent *async_extent; @@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow, async_extent->compressed_size = compressed_size; async_extent->pages = pages; async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } @@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode, unsigned long max_uncompressed = 128 * 1024; int i; int will_compress; + int compress_type = root->fs_info->compress_type; actual_end = min_t(u64, isize, end + 1); again: @@ -381,12 +385,16 @@ again: WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - ret = btrfs_zlib_compress_pages(inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, - &total_in, - &total_compressed, - max_compressed); + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; + + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, 
pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); if (!ret) { unsigned long offset = total_compressed & @@ -493,7 +501,8 @@ again: * and will submit them to the elevator. */ add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret); + total_compressed, pages, nr_pages_ret, + compress_type); if (start + num_bytes < end) { start += num_bytes; @@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; } @@ -640,6 +650,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -656,11 +667,13 @@ retry: async_extent->ram_size - 1, 0); } - ret = btrfs_add_ordered_extent(inode, async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - BTRFS_ORDERED_COMPRESSED); + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); BUG_ON(ret); /* @@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - int compressed = 0; + int compress_type = 0; int ret; bool nolock = false; @@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) - compressed = 1; + compress_type = 
ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - BUG_ON(compressed); + BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + @@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, - compressed, 0, 0, + compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, @@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { logical = em->block_start; failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); } failrec->logical = logical; free_extent_map(em); @@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; int err; + if (btrfs_root_readonly(root)) + return -EROFS; + err = inode_change_ok(inode, attr); if (err) return err; @@ -4084,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) int index; int ret; - dentry->d_op = &btrfs_dentry_operations; - if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -4127,7 +4144,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } -static int btrfs_dentry_delete(struct dentry *dentry) +static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; @@ -4930,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path, size_t max_size; unsigned long inline_size; unsigned long ptr; + int compress_type; WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); 
max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(leaf, path->slots[0])); @@ -4941,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); - ret = btrfs_zlib_decompress(tmp, page, extent_offset, - inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); if (ret) { char *kaddr = kmap_atomic(page, KM_USER0); unsigned long copy_size = min_t(u64, @@ -4984,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compressed; + int compress_type; again: read_lock(&em_tree->lock); @@ -5043,7 +5062,7 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compressed = btrfs_file_extent_compression(leaf, item); + compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + @@ -5089,8 +5108,9 @@ again: em->block_start = EXTENT_MAP_HOLE; goto insert; } - if (compressed) { + if (compress_type != BTRFS_COMPRESS_NONE) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; em->block_start = bytenr; em->block_len = btrfs_file_extent_disk_num_bytes(leaf, item); @@ -5124,12 +5144,14 @@ again: em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); em->orig_start = EXTENT_MAP_INLINE; - if (compressed) + if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { - if 
(btrfs_file_extent_compression(leaf, item) == - BTRFS_COMPRESS_ZLIB) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); @@ -6479,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; - ei->force_compress = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree, GFP_NOFS); @@ -6495,6 +6517,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return inode; } +static void btrfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; @@ -6564,7 +6593,7 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + call_rcu(&inode->i_rcu, btrfs_i_callback); } int btrfs_drop_inode(struct inode *inode) @@ -7093,122 +7122,20 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. 
- */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; - } - - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - if (ret < 0) { - free_extent_map(em); - break; - } - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - 
&cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); } -static int btrfs_permission(struct inode *inode, int mask) +static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) + return -EROFS; if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; - return generic_permission(inode, mask, btrfs_check_acl); + return generic_permission(inode, mask, flags, btrfs_check_acl); } static const struct inode_operations btrfs_dir_inode_operations = { @@ -7301,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; static const struct inode_operations btrfs_special_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f87552a1d7e..a506a22b522 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int flags, oldflags; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -360,7 +363,8 @@ fail: } static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid) + char *name, int namelen, u64 *async_transid, + bool readonly) { struct inode *inode; struct dentry *parent; @@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_init_block_rsv(&pending_snapshot->block_rsv); pending_snapshot->dentry = dentry; pending_snapshot->root = root; + pending_snapshot->readonly = 
readonly; trans = btrfs_start_transaction(root->fs_info->extent_root, 5); if (IS_ERR(trans)) { @@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid) + u64 *async_transid, bool readonly) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent, if (snap_src) { error = create_snapshot(snap_src, dentry, - name, namelen, async_transid); + name, namelen, async_transid, readonly); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, name, namelen, async_transid); @@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct page *page; + struct btrfs_super_block *disk_super; unsigned long last_index; unsigned long ra_pages = root->fs_info->bdi.ra_pages; unsigned long total_read = 0; + u64 features; u64 page_start; u64 page_end; u64 last_len = 0; @@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file, u64 defrag_end = 0; unsigned long i; int ret; + int compress_type = BTRFS_COMPRESS_ZLIB; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } if (inode->i_size == 0) return 0; @@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file, total_read++; mutex_lock(&inode->i_mutex); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = 1; + BTRFS_I(inode)->force_compress = compress_type; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) @@ -781,10 +796,17 @@ loop_unlock: atomic_dec(&root->fs_info->async_submit_draining); mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = 0; + 
BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; mutex_unlock(&inode->i_mutex); } + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (range->compress_type == BTRFS_COMPRESS_LZO) { + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + } + return 0; err_reservations: @@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, - u64 *transid) + u64 *transid, + bool readonly) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct file *src_file; @@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid); + NULL, transid, readonly); } else { struct inode *src_inode; src_file = fget(fd); @@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid); + transid, readonly); fput(src_file); } out: @@ -946,58 +969,139 @@ out: } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol, - int v2) + void __user *arg, int subvol) { - struct btrfs_ioctl_vol_args *vol_args = NULL; - struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL; - char *name; - u64 fd; + struct btrfs_ioctl_vol_args *vol_args; int ret; - if (v2) { - u64 transid = 0; - u64 *ptr = NULL; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); - if (IS_ERR(vol_args_v2)) - return PTR_ERR(vol_args_v2); + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false); - if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { - ret = -EINVAL; - goto out; - } - - name = 
vol_args_v2->name; - fd = vol_args_v2->fd; - vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + kfree(vol_args); + return ret; +} - if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) - ptr = &transid; +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, ptr); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) - ret = -EFAULT; - } else { - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - name = vol_args->name; - fd = vol_args->fd; - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, NULL); + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ret = -EOPNOTSUPP; + goto out; } + + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + ptr, readonly); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; out: kfree(vol_args); - kfree(vol_args_v2); + return ret; +} +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= 
BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) + return -EINVAL; + + if (flags & ~BTRFS_SUBVOL_RDONLY) + return -EOPNOTSUPP; + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out; + + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + else + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out: + up_write(&root->fs_info->subvol_sem); return ret; } @@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; + if 
(btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file) if (file->private_data) goto out; + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + ret = mnt_want_write(file->f_path.mnt); if (ret) goto out; @@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0, 0); + return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create(file, argp, 0, 1); + return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1, 0); + return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index c344d12c646..8fb382167b1 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args { }; #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_NAME_MAX 4039 struct btrfs_ioctl_vol_args_v2 { @@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args { */ __u32 extent_thresh; + /* + * which compression method to use if turning on compression + * for this defrag operation. 
If unspecified, zlib will + * be used + */ + __u32 compress_type; + /* spare for later */ - __u32 unused[5]; + __u32 unused[4]; }; struct btrfs_ioctl_space_info { @@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args { #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 00000000000..cc9b450399d --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,420 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/lzo.h> +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where compressed data goes */ + void *cbuf; /* where decompressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page 
*in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + ret = -1; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. 
+ */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) + goto out; + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long bytes; + unsigned long 
working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + ret = -1; + data_in = NULL; + goto done; + } + data_in = kmap(pages_in[page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed\n"); + ret = -1; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, 
buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + if (data_in) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed!\n"); + ret = -1; + goto out; + } + + if (out_len < start_byte) { + ret = -1; + goto out; + } + + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr, workspace->buf + start_byte, bytes); + kunmap_atomic(kaddr, KM_USER0); +out: + return ret; +} + +struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ae7737e352c..2b61e1ddcd9 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int dio) + int type, int dio, int compress_type) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -189,6 +189,7 @@ static int 
__btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = inode; + entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0); + disk_len, type, 0, + BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1); + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 61dca83119d..ff1f69aa188 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -68,7 +68,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ @@ -93,6 +93,9 @@ struct btrfs_ordered_extent { /* flags (described above) */ unsigned long flags; + /* compression algorithm */ + int compress_type; + /* reference count */ atomic_t refs; @@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); 
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 883c6fa1367..b2130c46fdb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,90 @@ static const struct super_operations btrfs_super_ops; +static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + default: + if (nbuf) { + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +static void __save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. Long term we'll + * also send it down to the disk + */ + fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; +} + +/* NOTE: + * We move write_super stuff at umount in order to avoid deadlock + * for umount hold all lock. + */ +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + __save_error_info(fs_info); +} + +/* btrfs handle error by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + sb->s_flags |= MS_RDONLY; + printk(KERN_INFO "btrfs is forced readonly\n"); + } +} + +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the approciate error response. 
+ */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno) +{ + struct super_block *sb = fs_info->sb; + char nbuf[16]; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(fs_info); + + btrfs_handle_error(fs_info); +} + static void btrfs_put_super(struct super_block *sb) { struct btrfs_root *root = btrfs_sb(sb); @@ -69,9 +153,9 @@ enum { Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, - Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, - Opt_user_subvol_rm_allowed, + Opt_compress_type, Opt_compress_force, Opt_compress_force_type, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, }; static match_table_t tokens = { @@ -86,7 +170,9 @@ static match_table_t tokens = { {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, @@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) char *p, *num, *orig; int intarg; int ret = 0; + char *compress_type; + bool compress_force = false; if (!options) return 0; @@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATACOW); 
btrfs_set_opt(info->mount_opt, NODATASUM); break; - case Opt_compress: - printk(KERN_INFO "btrfs: use compression\n"); - btrfs_set_opt(info->mount_opt, COMPRESS); - break; case Opt_compress_force: - printk(KERN_INFO "btrfs: forcing compression\n"); - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + case Opt_compress_force_type: + compress_force = true; + case Opt_compress: + case Opt_compress_type: + if (token == Opt_compress || + token == Opt_compress_force || + strcmp(args[0].from, "zlib") == 0) { + compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + } else if (strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; + } else { + ret = -EINVAL; + goto out; + } + btrfs_set_opt(info->mount_opt, COMPRESS); + if (compress_force) { + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + pr_info("btrfs: force %s compression\n", + compress_type); + } else + pr_info("btrfs: use %s compression\n", + compress_type); break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); @@ -460,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_d_op = &btrfs_dentry_operations; sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; @@ -752,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; } +/* + * The helper to calc the free space on the devices that can be used to store + * file data. 
+ */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1; + int i = 0, nr_devices; + int ret; + + nr_devices = fs_info->fs_devices->rw_devices; + BUG_ON(!nr_devices); + + devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space alloction */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + min_stripes = 4; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!device->in_fs_metadata) + continue; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwritting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs can not use the free space in [0, skip_space - 1], + * we must subtract it from the total. In order to implement + * it, we account the used space in this range first. 
+ */ + ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + return ret; + } + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total. + */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * min_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - min_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); @@ -759,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 total_used_data = 0; + u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; + int ret; + /* holding chunk_muext to avoid allocating new chunks */ + mutex_lock(&root->fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | - BTRFS_BLOCK_GROUP_SYSTEM)) - total_used_data += found->disk_total; - else - total_used_data += found->disk_used; + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + total_free_data += found->disk_total - 
found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + } + total_used += found->disk_used; } rcu_read_unlock(); @@ -777,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (total_used_data >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bavail = total_free_data; + ret = btrfs_calc_avail_data_space(root, &total_free_data); + if (ret) { + mutex_unlock(&root->fs_info->chunk_mutex); + return ret; + } + buf->f_bavail += total_free_data; + buf->f_bavail = buf->f_bavail >> bits; + mutex_unlock(&root->fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -896,10 +1136,14 @@ static int __init init_btrfs_fs(void) if (err) return err; - err = btrfs_init_cachep(); + err = btrfs_init_compress(); if (err) goto free_sysfs; + err = btrfs_init_cachep(); + if (err) + goto free_compress; + err = extent_io_init(); if (err) goto free_cachep; @@ -927,6 +1171,8 @@ free_extent_io: extent_io_exit(); free_cachep: btrfs_destroy_cachep(); +free_compress: + btrfs_exit_compress(); free_sysfs: btrfs_exit_sysfs(); return err; @@ -941,7 +1187,7 @@ static void __exit exit_btrfs_fs(void) unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); - btrfs_zlib_exit(); + btrfs_exit_compress(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f50e931fc21..bae5c7b8bbe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; int ret; + + if (root->fs_info->fs_state & 
BTRFS_SUPER_FLAG_ERROR) + return ERR_PTR(-EROFS); again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 to_reserve = 0; u64 index = 0; u64 objectid; + u64 root_flags; new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { @@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); btrfs_set_lock_blocking(old); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f104b57ad4e..229a594cacd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -62,6 +62,7 @@ struct btrfs_pending_snapshot { struct btrfs_block_rsv block_rsv; /* extra metadata reseration for relocation */ int error; + bool readonly; struct list_head list; }; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b988450783..d158530233b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/random.h> #include <linux/iocontext.h> +#include <linux/capability.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -493,7 +494,7 @@ again: continue; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; fs_devices->open_devices--; } @@ -527,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { - 
close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); fs_devices->open_devices--; } if (device->writeable) { @@ -584,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; + flags |= FMODE_EXCL; + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) continue; - bdev = open_bdev_exclusive(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name, flags, holder); if (IS_ERR(bdev)) { printk(KERN_INFO "open %s failed\n", device->name); goto error; @@ -598,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) + if (!bh) { + ret = -EINVAL; goto error_close; + } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -642,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); error_close: - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, flags); error: continue; } @@ -688,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); - bdev = open_bdev_exclusive(path, flags, holder); + flags |= FMODE_EXCL; + bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); @@ -700,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error_close; bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -720,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - close_bdev_exclusive(bdev, flags); + blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); return ret; } +/* helper to account the used device space in the range */ +int 
btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* + * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that 
we need + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size of the max + * free space if we don't find suitable free space + * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number * of extents + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. */ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail) + u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; - u64 hole_size = 0; - u64 last_byte = 0; - u64 search_start = 0; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; u64 search_end = device->total_bytes; int ret; - int slot = 0; - int start_found; + int slot; struct extent_buffer *l; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - start_found = 0; - /* FIXME use last free of some kind */ /* we don't want to overwrite the superblock on the drive, * so we make sure to start at an offset of at least 1MB */ - search_start = max((u64)1024 * 1024, search_start); + search_start = 1024 * 1024; - if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + if (root->fs_info->alloc_start + num_bytes <= search_end) search_start = max(root->fs_info->alloc_start, search_start); + max_hole_start = search_start; + max_hole_size = 0; + + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + path = 
btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + path->reada = 2; + key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) - goto error; + goto out; if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, key.type); if (ret < 0) - goto error; - if (ret > 0) - start_found = 1; + goto out; } - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -787,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, if (ret == 0) continue; if (ret < 0) - goto error; -no_more_items: - if (!start_found) { - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - *start = search_start; - start_found = 1; - goto check_pending; - } - *start = last_byte > search_start ? - last_byte : search_start; - if (search_end <= *start) { - ret = -ENOSPC; - goto error; - } - goto check_pending; + goto out; + + break; } btrfs_item_key_to_cpu(l, &key, slot); @@ -812,48 +911,62 @@ no_more_items: goto next; if (key.objectid > device->devid) - goto no_more_items; + break; - if (key.offset >= search_start && key.offset > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - hole_size = key.offset - last_byte; + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; - if (hole_size > *max_avail) - *max_avail = hole_size; + if (key.offset > search_start) { + hole_size = key.offset - search_start; - if (key.offset > last_byte && - hole_size >= num_bytes) { - *start = last_byte; - goto check_pending; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free 
space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; } } - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - start_found = 1; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; next: path->slots[0]++; cond_resched(); } -check_pending: - /* we have to make sure we didn't find an extent that has already - * been allocated by the map tree or the original allocation - */ - BUG_ON(*start < search_start); - if (*start + num_bytes > search_end) { - ret = -ENOSPC; - goto error; + hole_size = search_end- search_start; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; } - /* check for pending inserts here */ - ret = 0; -error: + /* See above. 
*/ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: btrfs_free_path(path); +error: + *start = max_hole_start; + if (len) + *len = max_hole_size; return ret; } @@ -1183,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = open_bdev_exclusive(device_path, FMODE_READ, - root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -1193,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -1251,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; device->fs_devices->open_devices--; } @@ -1294,7 +1407,7 @@ error_brelse: brelse(bh); error_close: if (bdev) - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); @@ -1446,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -1572,7 +1686,7 @@ out: mutex_unlock(&root->fs_info->volume_mutex); return ret; error: - close_bdev_exclusive(bdev, 0); + blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -1912,6 +2026,9 @@ int btrfs_balance(struct btrfs_root 
*dev_root) if (dev_root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&dev_root->fs_info->volume_mutex); dev_root = dev_root->fs_info->dev_root; @@ -2150,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, return calc_size * num_stripes; } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes, u64 *stripe_size, - u64 start, u64 type) +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2) { - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct list_head *cur; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct list_head private_devs; - int min_stripe_size = 1 * 1024 * 1024; - u64 calc_size = 1024 * 1024 * 1024; - u64 max_chunk_size = calc_size; - u64 min_free; - u64 avail; - u64 max_avail = 0; - u64 dev_offset; - int num_stripes = 1; - int min_stripes = 1; - int sub_stripes = 0; - int looped = 0; - int ret; - int index; - int stripe_len = 64 * 1024; + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } - if (list_empty(&fs_devices->alloc_list)) - return -ENOSPC; +static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type, + int *num_stripes, int *min_stripes, + int *sub_stripes) +{ + *num_stripes = 1; + *min_stripes = 1; + *sub_stripes = 0; if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 
- num_stripes = fs_devices->rw_devices; - min_stripes = 2; + *num_stripes = fs_devices->rw_devices; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID1)) { if (fs_devices->rw_devices < 2) return -ENOSPC; - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes = fs_devices->rw_devices; - if (num_stripes < 4) + *num_stripes = fs_devices->rw_devices; + if (*num_stripes < 4) return -ENOSPC; - num_stripes &= ~(u32)1; - sub_stripes = 2; - min_stripes = 4; + *num_stripes &= ~(u32)1; + *sub_stripes = 2; + *min_stripes = 4; } + return 0; +} + +static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices, + u64 proposed_size, u64 type, + int num_stripes, int small_stripe) +{ + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = proposed_size; + u64 max_chunk_size = calc_size; + int ncopies = 1; + + if (type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) + ncopies = 2; + if (type & BTRFS_BLOCK_GROUP_DATA) { max_chunk_size = 10 * calc_size; min_stripe_size = 64 * 1024 * 1024; @@ -2226,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); -again: - max_avail = 0; - if (!map || map->num_stripes != num_stripes) { - kfree(map); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) - return -ENOMEM; - map->num_stripes = num_stripes; - } - - if (calc_size * num_stripes > max_chunk_size) { - calc_size = max_chunk_size; + if (calc_size * num_stripes > max_chunk_size * ncopies) { + calc_size = max_chunk_size * ncopies; do_div(calc_size, num_stripes); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; } /* we don't want tiny stripes */ 
- if (!looped) + if (!small_stripe) calc_size = max_t(u64, min_stripe_size, calc_size); /* - * we're about to do_div by the stripe_len so lets make sure + * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure * we end up with something bigger than a stripe */ - calc_size = max_t(u64, calc_size, stripe_len * 4); + calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN); + + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; + + return calc_size; +} + +static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map, + int num_stripes) +{ + struct map_lookup *new; + size_t len = map_lookup_size(num_stripes); + + BUG_ON(map->num_stripes < num_stripes); + + if (map->num_stripes == num_stripes) + return map; + + new = kmalloc(len, GFP_NOFS); + if (!new) { + /* just change map->num_stripes */ + map->num_stripes = num_stripes; + return map; + } + + memcpy(new, map, len); + new->num_stripes = num_stripes; + kfree(map); + return new; +} + +/* + * helper to allocate device space from btrfs_device_info, in which we stored + * max free space information of every device. It is used when we can not + * allocate chunks by default size. + * + * By this helper, we can allocate a new chunk as larger as possible. 
+ */ +static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_devices *fs_devices, + struct btrfs_device_info *devices, + int nr_device, u64 type, + struct map_lookup **map_lookup, + int min_stripes, u64 *stripe_size) +{ + int i, index, sort_again = 0; + int min_devices = min_stripes; + u64 max_avail, min_free; + struct map_lookup *map = *map_lookup; + int ret; + + if (nr_device < min_stripes) + return -ENOSPC; + + btrfs_descending_sort_devices(devices, nr_device); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + max_avail = devices[0].max_avail; + if (!max_avail) + return -ENOSPC; + + for (i = 0; i < nr_device; i++) { + /* + * if dev_offset = 0, it means the free space of this device + * is less than what we need, and we didn't search max avail + * extent on this device, so do it now. + */ + if (!devices[i].dev_offset) { + ret = find_free_dev_extent(trans, devices[i].dev, + max_avail, + &devices[i].dev_offset, + &devices[i].max_avail); + if (ret != 0 && ret != -ENOSPC) + return ret; + sort_again = 1; + } + } + + /* we update the max avail free extent of each devices, sort again */ + if (sort_again) + btrfs_descending_sort_devices(devices, nr_device); + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_devices = 1; + + if (!devices[min_devices - 1].max_avail) + return -ENOSPC; + + max_avail = devices[min_devices - 1].max_avail; + if (type & BTRFS_BLOCK_GROUP_DUP) + do_div(max_avail, 2); + + max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type, + min_stripes, 1); + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = max_avail * 2; + else + min_free = max_avail; + + if (min_free > devices[min_devices - 1].max_avail) + return -ENOSPC; + + map = __shrink_map_lookup_stripes(map, min_stripes); + *stripe_size = max_avail; + + index = 0; + for (i = 0; i < min_stripes; i++) { + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset; + if (type & BTRFS_BLOCK_GROUP_DUP) { + i++; + 
map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset + + max_avail; + } + index++; + } + *map_lookup = map; + + return 0; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info; + struct list_head private_devs; + u64 calc_size = 1024 * 1024 * 1024; + u64 min_free; + u64 avail; + u64 dev_offset; + int num_stripes; + int min_stripes; + int sub_stripes; + int min_devices; /* the min number of devices we need */ + int i; + int ret; + int index; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes, + &min_stripes, &sub_stripes); + if (ret) + return ret; + + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; cur = fs_devices->alloc_list.next; index = 0; + i = 0; - if (type & BTRFS_BLOCK_GROUP_DUP) + calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type, + num_stripes, 0); + + if (type & BTRFS_BLOCK_GROUP_DUP) { min_free = calc_size * 2; - else + min_devices = 1; + } else { min_free = calc_size; - - /* - * we add 1MB because we never use the first 1MB of the device, unless - * we've looped, then we are likely allocating the maximum amount of - * space left already - */ - if 
(!looped) - min_free += 1024 * 1024; + min_devices = min_stripes; + } INIT_LIST_HEAD(&private_devs); while (index < num_stripes) { @@ -2283,27 +2559,39 @@ again: cur = cur->next; if (device->in_fs_metadata && avail >= min_free) { - ret = find_free_dev_extent(trans, device, - min_free, &dev_offset, - &max_avail); + ret = find_free_dev_extent(trans, device, min_free, + &devices_info[i].dev_offset, + &devices_info[i].max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); map->stripes[index].dev = device; - map->stripes[index].physical = dev_offset; + map->stripes[index].physical = + devices_info[i].dev_offset; index++; if (type & BTRFS_BLOCK_GROUP_DUP) { map->stripes[index].dev = device; map->stripes[index].physical = - dev_offset + calc_size; + devices_info[i].dev_offset + + calc_size; index++; } - } - } else if (device->in_fs_metadata && avail > max_avail) - max_avail = avail; + } else if (ret != -ENOSPC) + goto error; + + devices_info[i].dev = device; + i++; + } else if (device->in_fs_metadata && + avail >= BTRFS_STRIPE_LEN) { + devices_info[i].dev = device; + devices_info[i].max_avail = avail; + i++; + } + if (cur == &fs_devices->alloc_list) break; } + list_splice(&private_devs, &fs_devices->alloc_list); if (index < num_stripes) { if (index >= min_stripes) { @@ -2312,34 +2600,36 @@ again: num_stripes /= sub_stripes; num_stripes *= sub_stripes; } - looped = 1; - goto again; - } - if (!looped && max_avail > 0) { - looped = 1; - calc_size = max_avail; - goto again; + + map = __shrink_map_lookup_stripes(map, num_stripes); + } else if (i >= min_devices) { + ret = __btrfs_alloc_tiny_space(trans, fs_devices, + devices_info, i, type, + &map, min_stripes, + &calc_size); + if (ret) + goto error; + } else { + ret = -ENOSPC; + goto error; } - kfree(map); - return -ENOSPC; } map->sector_size = extent_root->sectorsize; - map->stripe_len = stripe_len; - map->io_align = stripe_len; - map->io_width = stripe_len; + map->stripe_len = BTRFS_STRIPE_LEN; + 
map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->num_stripes = num_stripes; map->sub_stripes = sub_stripes; *map_ret = map; *stripe_size = calc_size; *num_bytes = chunk_bytes_by_type(type, calc_size, - num_stripes, sub_stripes); + map->num_stripes, sub_stripes); em = alloc_extent_map(GFP_NOFS); if (!em) { - kfree(map); - return -ENOMEM; + ret = -ENOMEM; + goto error; } em->bdev = (struct block_device *)map; em->start = start; @@ -2372,7 +2662,13 @@ again: index++; } + kfree(devices_info); return 0; + +error: + kfree(map); + kfree(devices_info); + return ret; } static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2740db49eb0..7fb59d45fe8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -20,8 +20,11 @@ #define __BTRFS_VOLUMES_ #include <linux/bio.h> +#include <linux/sort.h> #include "async-thread.h" +#define BTRFS_STRIPE_LEN (64 * 1024) + struct buffer_head; struct btrfs_pending_bios { struct bio *head; @@ -50,7 +53,7 @@ struct btrfs_device { struct block_device *bdev; - /* the mode sent to open_bdev_exclusive */ + /* the mode sent to blkdev_get */ fmode_t mode; char *name; @@ -136,6 +139,30 @@ struct btrfs_multi_bio { struct btrfs_bio_stripe stripes[]; }; +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; +}; + +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length); + #define 
btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 698fdd2c739..a5776531dc2 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, int btrfs_removexattr(struct dentry *dentry, const char *name) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b9cd5445f71..f5ec2d44150 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -32,15 +32,6 @@ #include <linux/bio.h> #include "compression.h" -/* Plan: call deflate() with avail_in == *sourcelen, - avail_out = *dstlen - 12 and flush == Z_FINISH. - If it doesn't manage to finish, call it again with - avail_in == 0 and avail_out set to the remaining 12 - bytes for it to clean up. - Q: Is 12 bytes sufficient? 
-*/ -#define STREAM_END_SPACE 12 - struct workspace { z_stream inf_strm; z_stream def_strm; @@ -48,152 +39,51 @@ struct workspace { struct list_head list; }; -static LIST_HEAD(idle_workspace); -static DEFINE_SPINLOCK(workspace_lock); -static unsigned long num_workspace; -static atomic_t alloc_workspace = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); +static void zlib_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); -/* - * this finds an available zlib workspace or allocates a new one - * NULL or an ERR_PTR is returned if things go bad. - */ -static struct workspace *find_zlib_workspace(void) + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zlib_alloc_workspace(void) { struct workspace *workspace; - int ret; - int cpus = num_online_cpus(); - -again: - spin_lock(&workspace_lock); - if (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - num_workspace--; - spin_unlock(&workspace_lock); - return workspace; - } - spin_unlock(&workspace_lock); - if (atomic_read(&alloc_workspace) > cpus) { - DEFINE_WAIT(wait); - prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(&alloc_workspace) > cpus) - schedule(); - finish_wait(&workspace_wait, &wait); - goto again; - } - atomic_inc(&alloc_workspace); workspace = kzalloc(sizeof(*workspace), GFP_NOFS); - if (!workspace) { - ret = -ENOMEM; - goto fail; - } + if (!workspace) + return ERR_PTR(-ENOMEM); workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); - if (!workspace->def_strm.workspace) { - ret = -ENOMEM; - goto fail; - } workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); - if (!workspace->inf_strm.workspace) { - ret = -ENOMEM; - goto fail_inflate; - } workspace->buf = kmalloc(PAGE_CACHE_SIZE, 
GFP_NOFS); - if (!workspace->buf) { - ret = -ENOMEM; - goto fail_kmalloc; - } - return workspace; - -fail_kmalloc: - vfree(workspace->inf_strm.workspace); -fail_inflate: - vfree(workspace->def_strm.workspace); -fail: - kfree(workspace); - atomic_dec(&alloc_workspace); - wake_up(&workspace_wait); - return ERR_PTR(ret); -} - -/* - * put a workspace struct back on the list or free it if we have enough - * idle ones sitting around - */ -static int free_workspace(struct workspace *workspace) -{ - spin_lock(&workspace_lock); - if (num_workspace < num_online_cpus()) { - list_add_tail(&workspace->list, &idle_workspace); - num_workspace++; - spin_unlock(&workspace_lock); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; - } - spin_unlock(&workspace_lock); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); + if (!workspace->def_strm.workspace || + !workspace->inf_strm.workspace || !workspace->buf) + goto fail; - atomic_dec(&alloc_workspace); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; -} + INIT_LIST_HEAD(&workspace->list); -/* - * cleanup function for module exit - */ -static void free_workspaces(void) -{ - struct workspace *workspace; - while (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); - atomic_dec(&alloc_workspace); - } + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); } -/* - * given an address space and start/len, compress the bytes. - * - * pages are allocated to hold the compressed result and stored - * in 'pages' - * - * out_pages is used to return the number of pages allocated. 
There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. It - * may be smaller then len if we had to exit early because we - * ran out of room in the pages array or because we cross the - * max_out threshold. - * - * total_out is used to return the total number of compressed bytes - * - * max_out tells us the max number of bytes that we're allowed to - * stuff into pages - */ -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - struct workspace *workspace; char *data_in; char *cpage_out; int nr_pages = 0; @@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, *total_out = 0; *total_in = 0; - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -1; - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { printk(KERN_WARNING "deflateInit failed\n"); ret = -1; @@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, data_in = kmap(in_page); out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; @@ 
-314,55 +208,26 @@ out: kunmap(in_page); page_cache_release(in_page); } - free_workspace(workspace); return ret; } -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. - */ -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen) +static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) { - int ret = 0; + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; int wbits = MAX_WBITS; - struct workspace *workspace; char *data_in; size_t total_out = 0; - unsigned long page_bytes_left; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - struct page *page_out; unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - unsigned long working_bytes; unsigned long pg_offset; - unsigned long start_byte; - unsigned long current_buf_start; - char *kaddr; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; data_in = kmap(pages_in[page_in_index]); workspace->inf_strm.next_in = data_in; @@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, workspace->inf_strm.total_out = 0; workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - page_out = bvec[page_out_index].bv_page; - 
page_bytes_left = PAGE_CACHE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + return -1; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; - /* - * buf start is the byte offset we're of the start of - * our workspace buffer - */ - buf_start = total_out; - /* total_out is the last byte of the workspace buffer */ + buf_start = total_out; total_out = workspace->inf_strm.total_out; - working_bytes = total_out - buf_start; - - /* - * start byte is the first byte of the page we're currently - * copying into relative to the start of the compressed data. - */ - start_byte = page_offset(page_out) - disk_start; - - if (working_bytes == 0) { - /* we didn't make progress in this inflate - * call, we're done - */ - if (ret != Z_STREAM_END) - ret = -1; + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) break; - } - /* we haven't yet hit data corresponding to this page */ - if (total_out <= start_byte) - goto next; - - /* - * the start of the data we care about is offset into - * the middle of our working buffer - */ - if (total_out > start_byte && buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes -= buf_offset; - } else { - buf_offset = 0; - } - current_buf_start = buf_start; - - /* copy bytes from the working buffer into the pages */ - while (working_bytes > 0) { - bytes = min(PAGE_CACHE_SIZE - pg_offset, - PAGE_CACHE_SIZE - buf_offset); - bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out, KM_USER0); - memcpy(kaddr + pg_offset, workspace->buf + buf_offset, - bytes); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page_out); - - pg_offset += 
bytes; - page_bytes_left -= bytes; - buf_offset += bytes; - working_bytes -= bytes; - current_buf_start += bytes; - - /* check if we need to pick another page */ - if (page_bytes_left == 0) { - page_out_index++; - if (page_out_index >= vcnt) { - ret = 0; - goto done; - } - - page_out = bvec[page_out_index].bv_page; - pg_offset = 0; - page_bytes_left = PAGE_CACHE_SIZE; - start_byte = page_offset(page_out) - disk_start; - - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - goto next; - - /* the next page in the biovec might not - * be adjacent to the last page, but it - * might still be found inside this working - * buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + - buf_offset; - } - } + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + total_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) { + ret = 0; + goto done; } -next: + workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; @@ -516,35 +301,21 @@ done: zlib_inflateEnd(&workspace->inf_strm); if (data_in) kunmap(pages_in[page_in_index]); -out: - free_workspace(workspace); return ret; } -/* - * a less complex decompression routine. Our compressed data fits in a - * single page, and we want to read a single page out of it. 
- * start_byte tells us the offset into the compressed data we're interested in - */ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +static int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - struct workspace *workspace; unsigned long bytes_left = destlen; unsigned long total_out = 0; char *kaddr; - if (destlen > PAGE_CACHE_SIZE) - return -ENOMEM; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; - workspace->inf_strm.next_in = data_in; workspace->inf_strm.avail_in = srclen; workspace->inf_strm.total_in = 0; @@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + return -1; } while (bytes_left > 0) { @@ -616,12 +386,13 @@ next: ret = 0; zlib_inflateEnd(&workspace->inf_strm); -out: - free_workspace(workspace); return ret; } -void btrfs_zlib_exit(void) -{ - free_workspaces(); -} +struct btrfs_compress_op btrfs_zlib_compress = { + .alloc_workspace = zlib_alloc_workspace, + .free_workspace = zlib_free_workspace, + .compress_pages = zlib_compress_pages, + .decompress_biovec = zlib_decompress_biovec, + .decompress = zlib_decompress, +}; diff --git a/fs/buffer.c b/fs/buffer.c index 5930e382959..2219a76e2ca 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1270,12 +1270,10 @@ static inline void check_irqs_on(void) static void bh_lru_install(struct buffer_head *bh) { struct buffer_head *evictee = NULL; - struct bh_lru *lru; check_irqs_on(); bh_lru_lock(); - lru = &__get_cpu_var(bh_lrus); - if (lru->bhs[0] != bh) { + if (__this_cpu_read(bh_lrus.bhs[0]) != bh) { struct buffer_head *bhs[BH_LRU_SIZE]; int 
in; int out = 0; @@ -1283,7 +1281,8 @@ static void bh_lru_install(struct buffer_head *bh) get_bh(bh); bhs[out++] = bh; for (in = 0; in < BH_LRU_SIZE; in++) { - struct buffer_head *bh2 = lru->bhs[in]; + struct buffer_head *bh2 = + __this_cpu_read(bh_lrus.bhs[in]); if (bh2 == bh) { __brelse(bh2); @@ -1298,7 +1297,7 @@ static void bh_lru_install(struct buffer_head *bh) } while (out < BH_LRU_SIZE) bhs[out++] = NULL; - memcpy(lru->bhs, bhs, sizeof(bhs)); + memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); } bh_lru_unlock(); @@ -1313,23 +1312,22 @@ static struct buffer_head * lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *ret = NULL; - struct bh_lru *lru; unsigned int i; check_irqs_on(); bh_lru_lock(); - lru = &__get_cpu_var(bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { - struct buffer_head *bh = lru->bhs[i]; + struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); if (bh && bh->b_bdev == bdev && bh->b_blocknr == block && bh->b_size == size) { if (i) { while (i) { - lru->bhs[i] = lru->bhs[i - 1]; + __this_cpu_write(bh_lrus.bhs[i], + __this_cpu_read(bh_lrus.bhs[i - 1])); i--; } - lru->bhs[0] = bh; + __this_cpu_write(bh_lrus.bhs[0], bh); } get_bh(bh); ret = bh; @@ -3203,22 +3201,23 @@ static void recalc_bh_state(void) int i; int tot = 0; - if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) + if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096) return; - __get_cpu_var(bh_accounting).ratelimit = 0; + __this_cpu_write(bh_accounting.ratelimit, 0); for_each_online_cpu(i) tot += per_cpu(bh_accounting, i).nr; buffer_heads_over_limit = (tot > max_buffer_heads); } - + struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) { struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); - get_cpu_var(bh_accounting).nr++; + preempt_disable(); + __this_cpu_inc(bh_accounting.nr); recalc_bh_state(); - put_cpu_var(bh_accounting); + preempt_enable(); } return ret; } @@ 
-3228,9 +3227,10 @@ void free_buffer_head(struct buffer_head *bh) { BUG_ON(!list_empty(&bh->b_assoc_buffers)); kmem_cache_free(bh_cachep, bh); - get_cpu_var(bh_accounting).nr--; + preempt_disable(); + __this_cpu_dec(bh_accounting.nr); recalc_bh_state(); - put_cpu_var(bh_accounting); + preempt_enable(); } EXPORT_SYMBOL(free_buffer_head); @@ -3243,9 +3243,8 @@ static void buffer_exit_cpu(int cpu) brelse(b->bhs[i]); b->bhs[i] = NULL; } - get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; + this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); per_cpu(bh_accounting, cpu).nr = 0; - put_cpu_var(bh_accounting); } static int buffer_cpu_notify(struct notifier_block *self, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 562f9884a4d..0bc68de8edd 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -42,11 +42,11 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) - dentry->d_op = &ceph_dentry_ops; + d_set_d_op(dentry, &ceph_dentry_ops); else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) - dentry->d_op = &ceph_snapdir_dentry_ops; + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); else - dentry->d_op = &ceph_snap_dentry_ops; + d_set_d_op(dentry, &ceph_snap_dentry_ops); di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) @@ -112,7 +112,7 @@ static int __dcache_readdir(struct file *filp, dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, last); - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); /* start at beginning? 
*/ if (filp->f_pos == 2 || last == NULL || @@ -136,6 +136,7 @@ more: fi->at_end = 1; goto out_unlock; } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (!d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && @@ -145,13 +146,15 @@ more: dentry->d_name.len, dentry->d_name.name, di->offset, filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", !dentry->d_inode ? " null" : ""); + spin_unlock(&dentry->d_lock); p = p->prev; dentry = list_entry(p, struct dentry, d_u.d_child); di = ceph_dentry(dentry); } - atomic_inc(&dentry->d_count); - spin_unlock(&dcache_lock); + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -177,19 +180,19 @@ more: filp->f_pos++; - /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ + /* make sure a dentry wasn't dropped while we didn't have parent lock */ if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; out_unlock: - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); out: if (last) dput(last); @@ -987,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) */ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + dir = dentry->d_parent->d_inode; dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 50001de66c6..5625463aa47 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -370,6 
+370,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb) return &ci->vfs_inode; } +static void ceph_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ceph_inode_info *ci = ceph_inode(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ceph_inode_cachep, ci); +} + void ceph_destroy_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -409,7 +418,7 @@ void ceph_destroy_inode(struct inode *inode) if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); - kmem_cache_free(ceph_inode_cachep, ci); + call_rcu(&inode->i_rcu, ceph_i_callback); } @@ -841,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn) di->offset = ceph_inode(inode)->i_max_offset++; spin_unlock(&inode->i_lock); - spin_lock(&dcache_lock); - spin_lock(&dn->d_lock); + spin_lock(&dir->d_lock); + spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); list_move(&dn->d_u.d_child, &dir->d_subdirs); dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, dn->d_u.d_child.prev, dn->d_u.d_child.next); spin_unlock(&dn->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&dir->d_lock); } /* @@ -879,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } else if (realdn) { dout("dn %p (%d) spliced with %p (%d) " "inode %p ino %llx.%llx\n", - dn, atomic_read(&dn->d_count), - realdn, atomic_read(&realdn->d_count), + dn, dn->d_count, + realdn, realdn->d_count, realdn->d_inode, ceph_vinop(realdn->d_inode)); dput(dn); dn = realdn; @@ -1231,11 +1240,11 @@ retry_lookup: goto retry_lookup; } else { /* reorder parent's d_subdirs */ - spin_lock(&dcache_lock); - spin_lock(&dn->d_lock); + spin_lock(&parent->d_lock); + spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); list_move(&dn->d_u.d_child, &parent->d_subdirs); spin_unlock(&dn->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); } di = dn->d_fsdata; @@ -1772,12 +1781,17 @@ int 
ceph_do_getattr(struct inode *inode, int mask) * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct inode *inode, int mask) +int ceph_permission(struct inode *inode, int mask, unsigned int flags) { - int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + int err; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); if (!err) - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, flags, NULL); return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a6949cc7c69..a1ee8fa3a8e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1502,7 +1502,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", - dentry, atomic_read(&dentry->d_count), *base, len, path); + dentry, dentry->d_count, *base, len, path); return path; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6e082669511..20b907d76ae 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -666,7 +666,7 @@ extern void ceph_queue_invalidate(struct inode *inode); extern void ceph_queue_writeback(struct inode *inode); extern int ceph_do_getattr(struct inode *inode, int mask); -extern int ceph_permission(struct inode *inode, int mask); +extern int ceph_permission(struct inode *inode, int mask, unsigned int flags); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/char_dev.c b/fs/char_dev.c index e5b9df993b9..dca9e5e0f73 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -59,7 +59,7 @@ static struct char_device_struct { } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ -static inline int major_to_index(int major) +static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } @@ -417,18 +417,6 @@ static 
int chrdev_open(struct inode *inode, struct file *filp) return ret; } -int cdev_index(struct inode *inode) -{ - int idx; - struct kobject *kobj; - - kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); - if (!kobj) - return -1; - kobject_put(kobj); - return idx; -} - void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); @@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); -EXPORT_SYMBOL(cdev_index); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 224d7bbd1fc..e654dfd092c 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, void *buffer, uint16_t maxbuf) { const struct TCP_Server_Info *server = cookie_netfs_data; - const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; + const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; + const struct sockaddr_in *addr = (struct sockaddr_in *) sa; + const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; struct cifs_server_key *key = buffer; uint16_t key_len = sizeof(struct cifs_server_key); @@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, */ switch (sa->sa_family) { case AF_INET: - key->family = server->addr.sockAddr.sin_family; - key->port = server->addr.sockAddr.sin_port; - key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; + key->family = sa->sa_family; + key->port = addr->sin_port; + key->addr[0].ipv4_addr = addr->sin_addr; key_len += sizeof(key->addr[0].ipv4_addr); break; case AF_INET6: - key->family = server->addr.sockAddr6.sin6_family; - key->port = server->addr.sockAddr6.sin6_port; - key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; + key->family = sa->sa_family; + key->port = addr6->sin6_port; + key->addr[0].ipv6_addr = addr6->sin6_addr; key_len += 
sizeof(key->addr[0].ipv6_addr); break; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 103ab8b605b..65829d32128 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each(tmp, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", + cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d", mid_entry->midState, (int)mid_entry->command, mid_entry->pid, - mid_entry->tsk, + mid_entry->callback_data, mid_entry->mid); #ifdef CONFIG_CIFS_STATS2 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", @@ -119,29 +119,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) "Display Internal CIFS Data Structures for Debugging\n" "---------------------------------------------------\n"); seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); - seq_printf(m, "Features: "); + seq_printf(m, "Features:"); #ifdef CONFIG_CIFS_DFS_UPCALL - seq_printf(m, "dfs"); - seq_putc(m, ' '); + seq_printf(m, " dfs"); #endif #ifdef CONFIG_CIFS_FSCACHE - seq_printf(m, "fscache"); - seq_putc(m, ' '); + seq_printf(m, " fscache"); #endif #ifdef CONFIG_CIFS_WEAK_PW_HASH - seq_printf(m, "lanman"); - seq_putc(m, ' '); + seq_printf(m, " lanman"); #endif #ifdef CONFIG_CIFS_POSIX - seq_printf(m, "posix"); - seq_putc(m, ' '); + seq_printf(m, " posix"); #endif #ifdef CONFIG_CIFS_UPCALL - seq_printf(m, "spnego"); - seq_putc(m, ' '); + seq_printf(m, " spnego"); #endif #ifdef CONFIG_CIFS_XATTR - seq_printf(m, "xattr"); + seq_printf(m, " xattr"); +#endif +#ifdef CONFIG_CIFS_ACL + seq_printf(m, " acl"); #endif seq_putc(m, '\n'); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); @@ -220,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) mid_entry = list_entry(tmp3, struct mid_q_entry, qhead); seq_printf(m, "\tState: %d com: %d pid:" - " %d tsk: %p mid 
%d\n", + " %d cbdata: %p mid %d\n", mid_entry->midState, (int)mid_entry->command, mid_entry->pid, - mid_entry->tsk, + mid_entry->callback_data, mid_entry->mid); } spin_unlock(&GlobalMid_Lock); @@ -333,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) atomic_read(&totSmBufAllocCount)); #endif /* CONFIG_CIFS_STATS2 */ - seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); + seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount)); seq_printf(m, "\n%d session %d share reconnects\n", tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index c68a056f27f..f1c68629f27 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, } -static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, - struct list_head *mntlist) -{ - /* stolen from afs code */ - int err; - - mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist); - switch (err) { - case 0: - path_put(&nd->path); - nd->path.mnt = newmnt; - nd->path.dentry = dget(newmnt->mnt_root); - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); - break; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - default: - mntput(newmnt); - break; - } - return err; -} - static void dump_referral(const struct dfs_info3_param *ref) { cFYI(1, "DFS: ref path: %s", ref->path_name); @@ -293,45 +264,42 @@ static void dump_referral(const struct dfs_info3_param *ref) ref->path_consumed); } - -static void* -cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +/* + * Create a vfsmount that we can automount + */ +static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) { struct dfs_info3_param *referrals = 
NULL; unsigned int num_referrals = 0; struct cifs_sb_info *cifs_sb; struct cifsSesInfo *ses; - char *full_path = NULL; + char *full_path; int xid, i; - int rc = 0; - struct vfsmount *mnt = ERR_PTR(-ENOENT); + int rc; + struct vfsmount *mnt; struct tcon_link *tlink; cFYI(1, "in %s", __func__); - BUG_ON(IS_ROOT(dentry)); + BUG_ON(IS_ROOT(mntpt)); xid = GetXid(); - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); - /* * The MSDFS spec states that paths in DFS referral requests and * responses must be prefixed by a single '\' character instead of * the double backslashes usually used in the UNC. This function * gives us the latter, so we must adjust the result. */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; - goto out_err; - } + mnt = ERR_PTR(-ENOMEM); + full_path = build_path_from_dentry(mntpt); + if (full_path == NULL) + goto free_xid; - cifs_sb = CIFS_SB(dentry->d_inode->i_sb); + cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - goto out_err; + mnt = ERR_CAST(tlink); + goto free_full_path; } ses = tlink_tcon(tlink)->ses; @@ -341,46 +309,63 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) cifs_put_tlink(tlink); + mnt = ERR_PTR(-ENOENT); for (i = 0; i < num_referrals; i++) { int len; - dump_referral(referrals+i); + dump_referral(referrals + i); /* connect to a node */ len = strlen(referrals[i].node_name); if (len < 2) { cERROR(1, "%s: Net Address path too short: %s", __func__, referrals[i].node_name); - rc = -EINVAL; - goto out_err; + mnt = ERR_PTR(-EINVAL); + break; } mnt = cifs_dfs_do_refmount(cifs_sb, full_path, referrals + i); cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, referrals[i].node_name, mnt); - - /* complete mount procedure if we accured submount */ if (!IS_ERR(mnt)) - break; + goto success; } - /* we need it cause for() above could exit without valid submount */ - rc = PTR_ERR(mnt); - if 
(IS_ERR(mnt)) - goto out_err; - - rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); + /* no valid submounts were found; return error from get_dfs_path() by + * preference */ + if (rc != 0) + mnt = ERR_PTR(rc); -out: - FreeXid(xid); +success: free_dfs_info_array(referrals, num_referrals); +free_full_path: kfree(full_path); +free_xid: + FreeXid(xid); cFYI(1, "leaving %s" , __func__); - return ERR_PTR(rc); -out_err: - path_put(&nd->path); - goto out; + return mnt; +} + +/* + * Attempt to automount the referral + */ +struct vfsmount *cifs_dfs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + cFYI(1, "in %s", __func__); + + newmnt = cifs_dfs_do_automount(path->dentry); + if (IS_ERR(newmnt)) { + cFYI(1, "leaving %s [automount failed]" , __func__); + return newmnt; + } + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &cifs_dfs_automount_list); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); + cFYI(1, "leaving %s [ok]" , __func__); + return newmnt; } const struct inode_operations cifs_dfs_referral_inode_operations = { - .follow_link = cifs_dfs_follow_mountpoint, }; - diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 7852cd67705..ac51cd2d33a 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -40,6 +40,7 @@ #define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ #define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ +#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 87044906cd1..4dfba828316 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -98,6 +98,8 @@ struct key * cifs_get_spnego_key(struct cifsSesInfo *sesInfo) { struct TCP_Server_Info *server = sesInfo->server; + struct sockaddr_in *sa = (struct sockaddr_in *) 
&server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; char *description, *dp; size_t desc_len; struct key *spnego_key; @@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) dp = description + strlen(description); /* add the server address */ - if (server->addr.sockAddr.sin_family == AF_INET) - sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); - else if (server->addr.sockAddr.sin_family == AF_INET6) - sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); + if (server->dstaddr.ss_family == AF_INET) + sprintf(dp, "ip4=%pI4", &sa->sin_addr); + else if (server->dstaddr.ss_family == AF_INET6) + sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); else goto out; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 430f510a172..fc0fd4fde30 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, int charlen, outlen = 0; int maxwords = maxbytes / 2; char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; - for (i = 0; i < maxwords && from[i]; i++) { - charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, - NLS_MAX_CHARSET_SIZE); + for (i = 0; i < maxwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + + charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); if (charlen > 0) outlen += charlen; else @@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, } /* - * cifs_mapchar - convert a little-endian char to proper char in codepage + * cifs_mapchar - convert a host-endian char to proper char in codepage * @target - where converted character should be copied - * @src_char - 2 byte little-endian source character + * @src_char - 2 byte host-endian source character * @cp - codepage to which character should be converted * @mapchar - should character be mapped according to mapchars mount option? 
* @@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). */ static int -cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, +cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, bool mapchar) { int len = 1; @@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, * build_path_from_dentry are modified, as they use slash as * separator. */ - switch (le16_to_cpu(src_char)) { + switch (src_char) { case UNI_COLON: *target = ':'; break; @@ -109,8 +113,7 @@ out: return len; cp_convert: - len = cp->uni2char(le16_to_cpu(src_char), target, - NLS_MAX_CHARSET_SIZE); + len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); if (len <= 0) { *target = '?'; len = 1; @@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, int nullsize = nls_nullsize(codepage); int fromwords = fromlen / 2; char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; /* * because the chars can be of varying widths, we need to take care @@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, */ safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); - for (i = 0; i < fromwords && from[i]; i++) { + for (i = 0; i < fromwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + /* * check to see if converting this character might make the * conversion bleed into the null terminator */ if (outlen >= safelen) { - charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); + charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar); if ((outlen + charlen) > (tolen - nullsize)) break; } /* put converted char into 'to' buffer */ - charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); + charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); outlen += charlen; } @@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len, { int 
charlen; int i; - wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */ + wchar_t wchar_to; /* needed to quiet sparse */ for (i = 0; len && *from; i++, from += charlen, len -= charlen) { - - /* works for 2.4.0 kernel or later */ - charlen = codepage->char2uni(from, len, &wchar_to[i]); + charlen = codepage->char2uni(from, len, &wchar_to); if (charlen < 1) { - cERROR(1, "strtoUCS: char2uni of %d returned %d", - (int)*from, charlen); + cERROR(1, "strtoUCS: char2uni of 0x%x returned %d", + *from, charlen); /* A question mark */ - to[i] = cpu_to_le16(0x003f); + wchar_to = 0x003f; charlen = 1; - } else - to[i] = cpu_to_le16(wchar_to[i]); - + } + put_unaligned_le16(wchar_to, &to[i]); } - to[i] = 0; + put_unaligned_le16(0, &to[i]); return i; } @@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, return dst; } +/* + * Convert 16 bit Unicode pathname to wire format from string in current code + * page. Conversion may involve remapping up the six characters that are + * only legal in POSIX-like OS (if they are present in the string). 
Path + * names are little endian 16 bit Unicode on the wire + */ +int +cifsConvertToUCS(__le16 *target, const char *source, int maxlen, + const struct nls_table *cp, int mapChars) +{ + int i, j, charlen; + int len_remaining = maxlen; + char src_char; + __u16 temp; + + if (!mapChars) + return cifs_strtoUCS(target, source, PATH_MAX, cp); + + for (i = 0, j = 0; i < maxlen; j++) { + src_char = source[i]; + switch (src_char) { + case 0: + put_unaligned_le16(0, &target[j]); + goto ctoUCS_out; + case ':': + temp = UNI_COLON; + break; + case '*': + temp = UNI_ASTERIK; + break; + case '?': + temp = UNI_QUESTION; + break; + case '<': + temp = UNI_LESSTHAN; + break; + case '>': + temp = UNI_GRTRTHAN; + break; + case '|': + temp = UNI_PIPE; + break; + /* + * FIXME: We can not handle remapping backslash (UNI_SLASH) + * until all the calls to build_path_from_dentry are modified, + * as they use backslash as separator. + */ + default: + charlen = cp->char2uni(source+i, len_remaining, + &temp); + /* + * if no match, use question mark, which at least in + * some cases serves as wild card + */ + if (charlen < 1) { + temp = 0x003f; + charlen = 1; + } + len_remaining -= charlen; + /* + * character may take more than one byte in the source + * string, but will take exactly two bytes in the + * target string + */ + i += charlen; + continue; + } + put_unaligned_le16(temp, &target[j]); + i++; /* move to next char in source string */ + len_remaining--; + } + +ctoUCS_out: + return i; +} + diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index a437ec391a0..1e7636b145a 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -41,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { ; -/* security id for everyone */ +/* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; +/* security id for Authenticated Users system group */ +static const struct cifs_sid sid_authusers = { + 1, 1, {0, 0, 0, 0, 0, 5}, {11} }; 
/* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; @@ -365,7 +368,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, if (num_aces > 0) { umode_t user_mask = S_IRWXU; umode_t group_mask = S_IRWXG; - umode_t other_mask = S_IRWXO; + umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), GFP_KERNEL); @@ -390,6 +393,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, ppace[i]->type, &fattr->cf_mode, &other_mask); + if (compare_sids(&(ppace[i]->sid), &sid_authusers)) + access_flags_to_mode(ppace[i]->access_req, + ppace[i]->type, + &fattr->cf_mode, + &other_mask); + /* memcpy((void *)(&(cifscred->aces[i])), (void *)ppace[i], diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index f856732161a..66f3d50d067 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -72,6 +72,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, return 0; } +/* must be called with server->srv_mutex held */ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -84,14 +85,12 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) return rc; - spin_lock(&GlobalMid_Lock); cifs_pdu->Signature.Sequence.SequenceNumber = cpu_to_le32(server->sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - spin_unlock(&GlobalMid_Lock); rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); if (rc) @@ -149,6 +148,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, return rc; } +/* must be called with server->srv_mutex held */ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -162,14 +162,12 @@ int 
cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) return rc; - spin_lock(&GlobalMid_Lock); cifs_pdu->Signature.Sequence.SequenceNumber = cpu_to_le32(server->sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - spin_unlock(&GlobalMid_Lock); rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); if (rc) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3936aa7f2c2..a8323f1dc1c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, int, 0); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 50 Range: 2 to 256"); - +unsigned short echo_retries = 5; +module_param(echo_retries, ushort, 0644); +MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " + "reconnecting server. Default: 5. 
0 means " + "never reconnect."); extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -174,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data, goto out_no_root; } + /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */ + if (cifs_sb_master_tcon(cifs_sb)->nocase) + sb->s_d_op = &cifs_ci_dentry_ops; + else + sb->s_d_op = &cifs_dentry_ops; + #ifdef CONFIG_CIFS_EXPERIMENTAL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cFYI(1, "export ops supported"); @@ -283,10 +293,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int cifs_permission(struct inode *inode, int mask) +static int cifs_permission(struct inode *inode, int mask, unsigned int flags) { struct cifs_sb_info *cifs_sb; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + cifs_sb = CIFS_SB(inode->i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { @@ -298,7 +311,7 @@ static int cifs_permission(struct inode *inode, int mask) on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } static struct kmem_cache *cifs_inode_cachep; @@ -326,6 +339,8 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->invalid_mapping = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; + cifs_inode->uniqueid = 0; + cifs_inode->createtime = 0; /* Can not set i_flags here - they get immediately overwritten to zero by the VFS */ @@ -334,10 +349,17 @@ cifs_alloc_inode(struct super_block *sb) return &cifs_inode->vfs_inode; } +static void cifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); +} + static void 
cifs_destroy_inode(struct inode *inode) { - kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); + call_rcu(&inode->i_rcu, cifs_i_callback); } static void @@ -351,18 +373,19 @@ cifs_evict_inode(struct inode *inode) static void cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) { + struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; + seq_printf(s, ",addr="); - switch (server->addr.sockAddr.sin_family) { + switch (server->dstaddr.ss_family) { case AF_INET: - seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr); + seq_printf(s, "%pI4", &sa->sin_addr.s_addr); break; case AF_INET6: - seq_printf(s, "%pI6", - &server->addr.sockAddr6.sin6_addr.s6_addr); - if (server->addr.sockAddr6.sin6_scope_id) - seq_printf(s, "%%%u", - server->addr.sockAddr6.sin6_scope_id); + seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr); + if (sa6->sin6_scope_id) + seq_printf(s, "%%%u", sa6->sin6_scope_id); break; default: seq_printf(s, "(unknown)"); @@ -710,6 +733,25 @@ const struct file_operations cifs_file_ops = { .setlease = cifs_setlease, }; +const struct file_operations cifs_file_strict_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_strict_readv, + .aio_write = cifs_file_aio_write, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + const struct file_operations cifs_file_direct_ops = { /* no aio, no readv - BB reevaluate whether they can be done with directio, no cache */ @@ -728,6 +770,7 @@ const struct file_operations cifs_file_direct_ops = { .llseek = cifs_llseek, .setlease = cifs_setlease, }; + const struct file_operations cifs_file_nobrl_ops = { .read = 
do_sync_read, .write = do_sync_write, @@ -746,6 +789,24 @@ const struct file_operations cifs_file_nobrl_ops = { .setlease = cifs_setlease, }; +const struct file_operations cifs_file_strict_nobrl_ops = { + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_strict_readv, + .aio_write = cifs_file_aio_write, + .open = cifs_open, + .release = cifs_close, + .fsync = cifs_strict_fsync, + .flush = cifs_flush, + .mmap = cifs_file_strict_mmap, + .splice_read = generic_file_splice_read, + .llseek = cifs_llseek, +#ifdef CONFIG_CIFS_POSIX + .unlocked_ioctl = cifs_ioctl, +#endif /* CONFIG_CIFS_POSIX */ + .setlease = cifs_setlease, +}; + const struct file_operations cifs_file_direct_nobrl_ops = { /* no mmap, no aio, no readv - BB reevaluate whether they can be done with directio, no cache */ diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 897b2b2b28b..f23206d4653 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -61,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); extern int cifs_revalidate_file(struct file *filp); extern int cifs_revalidate_dentry(struct dentry *); +extern void cifs_invalidate_mapping(struct inode *inode); extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int cifs_setattr(struct dentry *, struct iattr *); @@ -72,19 +73,25 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations; /* Functions related to files and directories */ extern const struct file_operations cifs_file_ops; extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ -extern const struct file_operations cifs_file_nobrl_ops; -extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */ +extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */ +extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */ +extern const struct file_operations cifs_file_direct_nobrl_ops; +extern const struct 
file_operations cifs_file_strict_nobrl_ops; extern int cifs_open(struct inode *inode, struct file *file); extern int cifs_close(struct inode *inode, struct file *file); extern int cifs_closedir(struct inode *inode, struct file *file); extern ssize_t cifs_user_read(struct file *file, char __user *read_data, - size_t read_size, loff_t *poffset); + size_t read_size, loff_t *poffset); +extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, int); +extern int cifs_strict_fsync(struct file *, int); extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); @@ -93,6 +100,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); extern const struct dentry_operations cifs_dentry_ops; extern const struct dentry_operations cifs_ci_dentry_ops; +#ifdef CONFIG_CIFS_DFS_UPCALL +extern struct vfsmount *cifs_dfs_d_automount(struct path *path); +#else +#define cifs_dfs_d_automount NULL +#endif + /* Functions related to symlinks */ extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); extern void cifs_put_link(struct dentry *direntry, @@ -112,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.68" +#define CIFS_VERSION "1.69" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h 
b/fs/cifs/cifsglob.h index 7136c0c3e2f..edd5b29b53c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -161,35 +161,27 @@ struct TCP_Server_Info { int srv_count; /* reference counter */ /* 15 character server name + 0x20 16th byte indicating type = srv */ char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; + enum statusEnum tcpStatus; /* what we think the status is */ char *hostname; /* hostname portion of UNC string */ struct socket *ssocket; - union { - struct sockaddr_in sockAddr; - struct sockaddr_in6 sockAddr6; - } addr; + struct sockaddr_storage dstaddr; struct sockaddr_storage srcaddr; /* locally bind to this IP */ +#ifdef CONFIG_NET_NS + struct net *net; +#endif wait_queue_head_t response_q; wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ struct list_head pending_mid_q; - void *Server_NlsInfo; /* BB - placeholder for future NLS info */ - unsigned short server_codepage; /* codepage for the server */ - enum protocolEnum protocolType; - char versionMajor; - char versionMinor; - bool svlocal:1; /* local server or remote */ bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ bool tcp_nodelay; atomic_t inFlight; /* number of requests on the wire to server */ -#ifdef CONFIG_CIFS_STATS2 - atomic_t inSend; /* requests trying to send */ - atomic_t num_waiters; /* blocked waiting to get in sendrecv */ -#endif - enum statusEnum tcpStatus; /* what we think the status is */ struct mutex srv_mutex; struct task_struct *tsk; char server_GUID[16]; char secMode; + bool session_estab; /* mark when very first sess is established */ + u16 dialect; /* dialect index that server chose */ enum securityEnum secType; unsigned int maxReq; /* Clients should submit no more */ /* than maxReq distinct unanswered SMBs to the server when using */ @@ -202,31 +194,62 @@ struct TCP_Server_Info { unsigned int max_vcs; /* maximum number of smb sessions, at least those that can be specified uniquely with vcnumbers */ 
- char sessid[4]; /* unique token id for this session */ - /* (returned on Negotiate */ int capabilities; /* allow selective disabling of caps by smb sess */ int timeAdj; /* Adjust for difference in server time zone in sec */ __u16 CurrentMid; /* multiplex id - rotating counter */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; - __u32 sequence_number; /* needed for CIFS PDU signature */ + __u32 sequence_number; /* for signing, protected by srv_mutex */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ - u16 dialect; /* dialect index that server chose */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ /* extended security flavors that server supports */ + bool sec_ntlmssp; /* supports NTLMSSP */ + bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ - bool sec_kerberosu2u; /* supports U2U Kerberos */ - bool sec_ntlmssp; /* supports NTLMSSP */ - bool session_estab; /* mark when very first sess is established */ + struct delayed_work echo; /* echo ping workqueue job */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif +#ifdef CONFIG_CIFS_STATS2 + atomic_t inSend; /* requests trying to send */ + atomic_t num_waiters; /* blocked waiting to get in sendrecv */ +#endif }; /* + * Macros to allow the TCP_Server_Info->net field and related code to drop out + * when CONFIG_NET_NS isn't set. 
+ */ + +#ifdef CONFIG_NET_NS + +static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv) +{ + return srv->net; +} + +static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) +{ + srv->net = net; +} + +#else + +static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv) +{ + return &init_net; +} + +static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) +{ +} + +#endif + +/* * Session structure. One of these for each uid session with a particular host */ struct cifsSesInfo { @@ -449,13 +472,14 @@ struct cifsInodeInfo { /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ - unsigned long time; /* jiffies of last update/check of inode */ - bool clientCanCacheRead:1; /* read oplock */ - bool clientCanCacheAll:1; /* read and writebehind oplock */ - bool delete_pending:1; /* DELETE_ON_CLOSE is set */ - bool invalid_mapping:1; /* pagecache is invalid */ + bool clientCanCacheRead; /* read oplock */ + bool clientCanCacheAll; /* read and writebehind oplock */ + bool delete_pending; /* DELETE_ON_CLOSE is set */ + bool invalid_mapping; /* pagecache is invalid */ + unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server */ u64 uniqueid; /* server inode number */ + u64 createtime; /* creation time on server */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; #endif @@ -510,6 +534,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon, #endif +struct mid_q_entry; + +/* + * This is the prototype for the mid callback function. When creating one, + * take special care to avoid deadlocks. 
Things to bear in mind: + * + * - it will be called by cifsd + * - the GlobalMid_Lock will be held + * - the mid will be removed from the pending_mid_q list + */ +typedef void (mid_callback_t)(struct mid_q_entry *mid); + /* one of these for every pending CIFS request to the server */ struct mid_q_entry { struct list_head qhead; /* mids waiting on reply from this server */ @@ -521,7 +557,8 @@ struct mid_q_entry { unsigned long when_sent; /* time when smb send finished */ unsigned long when_received; /* when demux complete (taken off wire) */ #endif - struct task_struct *tsk; /* task waiting for response */ + mid_callback_t *callback; /* call completion callback */ + void *callback_data; /* general purpose pointer for callback */ struct smb_hdr *resp_buf; /* response buffer */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ @@ -576,6 +613,7 @@ struct cifs_fattr { u64 cf_uniqueid; u64 cf_eof; u64 cf_bytes; + u64 cf_createtime; uid_t cf_uid; gid_t cf_gid; umode_t cf_mode; @@ -623,12 +661,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define CIFS_IOVEC 4 /* array of response buffers */ /* Type of Request to SendReceive2 */ -#define CIFS_STD_OP 0 /* normal request timeout */ -#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */ -#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */ -#define CIFS_BLOCKING_OP 4 /* operation can block */ -#define CIFS_ASYNC_OP 8 /* do not wait for response */ -#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */ +#define CIFS_BLOCKING_OP 1 /* operation can block */ +#define CIFS_ASYNC_OP 2 /* do not wait for response */ +#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */ #define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ #define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ #define CIFS_NO_RESP 0x040 /* no response buffer required */ @@ -791,6 +826,9 @@ GLOBAL_EXTERN unsigned int 
cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ +/* reconnect after this many failed echo attempts */ +GLOBAL_EXTERN unsigned short echo_retries; + void cifs_oplock_break(struct work_struct *work); void cifs_oplock_break_get(struct cifsFileInfo *cfile); void cifs_oplock_break_put(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index de36b09763a..b5c8cc5d7a7 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -23,6 +23,7 @@ #define _CIFSPDU_H #include <net/sock.h> +#include <asm/unaligned.h> #include "smbfsctl.h" #ifdef CONFIG_CIFS_WEAK_PW_HASH @@ -50,6 +51,7 @@ #define SMB_COM_SETATTR 0x09 /* trivial response */ #define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ #define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ +#define SMB_COM_ECHO 0x2B /* echo request */ #define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ #define SMB_COM_READ_ANDX 0x2E #define SMB_COM_WRITE_ANDX 0x2F @@ -425,11 +427,49 @@ struct smb_hdr { __u16 Mid; __u8 WordCount; } __attribute__((packed)); -/* given a pointer to an smb_hdr retrieve the value of byte count */ -#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) -#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) + +/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */ +#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \ + (2 * (smb_var)->WordCount)) + /* given a pointer to an smb_hdr retrieve the pointer to the byte area */ -#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) +#define pByteArea(smb_var) (BCC(smb_var) + 2) + +/* get the converted ByteCount for a SMB packet and return it */ +static 
inline __u16 +get_bcc(struct smb_hdr *hdr) +{ + __u16 *bc_ptr = (__u16 *)BCC(hdr); + + return get_unaligned(bc_ptr); +} + +/* get the unconverted ByteCount for a SMB packet and return it */ +static inline __u16 +get_bcc_le(struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + return get_unaligned_le16(bc_ptr); +} + +/* set the ByteCount for a SMB packet in host-byte order */ +static inline void +put_bcc(__u16 count, struct smb_hdr *hdr) +{ + __u16 *bc_ptr = (__u16 *)BCC(hdr); + + put_unaligned(count, bc_ptr); +} + +/* set the ByteCount for a SMB packet in little-endian */ +static inline void +put_bcc_le(__u16 count, struct smb_hdr *hdr) +{ + __le16 *bc_ptr = (__le16 *)BCC(hdr); + + put_unaligned_le16(count, bc_ptr); +} /* * Computer Name Length (since Netbios name was length 16 with last byte 0x20) @@ -760,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext { * */ +typedef struct smb_com_echo_req { + struct smb_hdr hdr; + __le16 EchoCount; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_REQ; + +typedef struct smb_com_echo_rsp { + struct smb_hdr hdr; + __le16 SequenceNumber; + __le16 ByteCount; + char Data[1]; +} __attribute__((packed)) ECHO_RSP; + typedef struct smb_com_logoff_andx_req { struct smb_hdr hdr; /* wct = 2 */ __u8 AndXCommand; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e6d1481b16c..982895fa761 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -61,6 +61,12 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, char **devname); /* extern void renew_parental_timestamps(struct dentry *direntry);*/ +extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, + struct TCP_Server_Info *server); +extern void DeleteMidQEntry(struct mid_q_entry *midEntry); +extern int cifs_call_async(struct TCP_Server_Info *server, + struct smb_hdr *in_buf, mid_callback_t *callback, + void *cbdata); extern int SendReceive(const 
unsigned int /* xid */ , struct cifsSesInfo *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , @@ -347,12 +353,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 netfid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, - const bool waitFlag); + const bool waitFlag, const __u8 oplock_level); extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, const __u64 len, struct file_lock *, const __u16 lock_type, const bool waitFlag); extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); +extern int CIFSSMBEcho(struct TCP_Server_Info *server); extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); extern struct cifsSesInfo *sesInfoAlloc(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 67acfb3acad..3106f5e5c63 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -331,37 +331,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon, static int validate_t2(struct smb_t2_rsp *pSMB) { - int rc = -EINVAL; - int total_size; - char *pBCC; + unsigned int total_size; + + /* check for plausible wct */ + if (pSMB->hdr.WordCount < 10) + goto vt2_err; - /* check for plausible wct, bcc and t2 data and parm sizes */ /* check for parm and data offset going beyond end of smb */ - if (pSMB->hdr.WordCount >= 10) { - if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) && - (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) { - /* check that bcc is at least as big as parms + data */ - /* check that bcc is less than negotiated smb buffer */ - total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount); - if (total_size < 512) { - total_size += - le16_to_cpu(pSMB->t2_rsp.DataCount); - /* BCC le converted in SendReceive */ - pBCC = (pSMB->hdr.WordCount * 2) + - sizeof(struct smb_hdr) + - (char *)pSMB; - if ((total_size <= (*(u16 *)pBCC)) && - (total_size < - 
CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) { - return 0; - } - } - } - } + if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 || + get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024) + goto vt2_err; + + /* check that bcc is at least as big as parms + data */ + /* check that bcc is less than negotiated smb buffer */ + total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount); + if (total_size >= 512) + goto vt2_err; + + total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount); + if (total_size > get_bcc(&pSMB->hdr) || + total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) + goto vt2_err; + + return 0; +vt2_err: cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, sizeof(struct smb_t2_rsp) + 16); - return rc; + return -EINVAL; } + int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) { @@ -401,15 +399,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { cFYI(1, "Kerberos only mechanism, enable extended security"); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - } -#ifdef CONFIG_CIFS_EXPERIMENTAL - else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) + } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { cFYI(1, "NTLMSSP only mechanism, enable extended security"); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; } -#endif count = 0; for (i = 0; i < CIFS_NUM_PROT; i++) { @@ -455,7 +450,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); - GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { @@ -569,7 +563,6 @@ 
CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); - GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); server->capabilities = le32_to_cpu(pSMBr->Capabilities); server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; @@ -709,6 +702,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon) return rc; } +/* + * This is a no-op for now. We're not really interested in the reply, but + * rather in the fact that the server sent one and that server->lstrp + * gets updated. + * + * FIXME: maybe we should consider checking that the reply matches request? + */ +static void +cifs_echo_callback(struct mid_q_entry *mid) +{ + struct TCP_Server_Info *server = mid->callback_data; + + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +int +CIFSSMBEcho(struct TCP_Server_Info *server) +{ + ECHO_REQ *smb; + int rc = 0; + + cFYI(1, "In echo request"); + + rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb); + if (rc) + return rc; + + /* set up echo request */ + smb->hdr.Tid = cpu_to_le16(0xffff); + smb->hdr.WordCount = 1; + put_unaligned_le16(1, &smb->EchoCount); + put_bcc_le(1, &smb->hdr); + smb->Data[0] = 'a'; + smb->hdr.smb_buf_length += 3; + + rc = cifs_call_async(server, (struct smb_hdr *)smb, + cifs_echo_callback, server); + if (rc) + cFYI(1, "Echo request failed: %d", rc); + + cifs_small_buf_release(smb); + + return rc; +} + int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) { @@ -1196,7 +1236,7 @@ OldOpenRetry: pSMB->ByteCount = cpu_to_le16(count); /* long_op set to 1 to allow for oplock break timeouts */ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); + (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->num_opens); if (rc) { cFYI(1, "Error in 
Open = %d", rc); @@ -1309,7 +1349,7 @@ openRetry: pSMB->ByteCount = cpu_to_le16(count); /* long_op set to 1 to allow for oplock break timeouts */ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); + (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->num_opens); if (rc) { cFYI(1, "Error in Open = %d", rc); @@ -1391,7 +1431,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid, iov[0].iov_base = (char *)pSMB; iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, - &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR); + &resp_buf_type, CIFS_LOG_ERROR); cifs_stats_inc(&tcon->num_reads); pSMBr = (READ_RSP *)iov[0].iov_base; if (rc) { @@ -1666,7 +1706,8 @@ int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const __u64 len, const __u64 offset, const __u32 numUnlock, - const __u32 numLock, const __u8 lockType, const bool waitFlag) + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level) { int rc = 0; LOCK_REQ *pSMB = NULL; @@ -1694,6 +1735,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, pSMB->NumberOfLocks = cpu_to_le16(numLock); pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); pSMB->LockType = lockType; + pSMB->OplockLevel = oplock_level; pSMB->AndXCommand = 0xFF; /* none */ pSMB->Fid = smb_file_id; /* netfid stays le */ @@ -3090,7 +3132,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, - CIFS_STD_OP); + 0); cifs_stats_inc(&tcon->num_acl_get); if (rc) { cFYI(1, "Send error in QuerySecDesc = %d", rc); @@ -5565,7 +5607,7 @@ QAllEAsRetry: } /* make sure list_len doesn't go past end of SMB */ - end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); + end_of_smb = (char *)pByteArea(&pSMBr->hdr) + 
get_bcc(&pSMBr->hdr); if ((char *)ea_response_data + list_len > end_of_smb) { cFYI(1, "EA list appears to go beyond SMB"); rc = -EIO; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index cc1a8604a79..0cc3b81c2e8 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -52,6 +52,9 @@ #define CIFS_PORT 445 #define RFC1001_PORT 139 +/* SMB echo "timeout" -- FIXME: tunable? */ +#define SMB_ECHO_INTERVAL (60 * HZ) + extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); @@ -64,8 +67,8 @@ struct smb_vol { char *UNC; char *UNCip; char *iocharset; /* local code page for mapping to and from Unicode */ - char source_rfc1001_name[16]; /* netbios name of client */ - char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ + char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ + char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; @@ -115,8 +118,8 @@ struct smb_vol { #define TLINK_ERROR_EXPIRE (1 * HZ) #define TLINK_IDLE_EXPIRE (600 * HZ) -static int ipv4_connect(struct TCP_Server_Info *server); -static int ipv6_connect(struct TCP_Server_Info *server); +static int ip_connect(struct TCP_Server_Info *server); +static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); @@ -152,6 +155,7 @@ cifs_reconnect(struct TCP_Server_Info *server) /* before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they are not used until reconnected */ + cFYI(1, "%s: marking sessions and tcons for reconnect", __func__); spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); @@ -163,7 +167,9 @@ cifs_reconnect(struct TCP_Server_Info *server) } } spin_unlock(&cifs_tcp_ses_lock); + /* do not want to be sending data on a 
socket we are freeing */ + cFYI(1, "%s: tearing down socket", __func__); mutex_lock(&server->srv_mutex); if (server->ssocket) { cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, @@ -180,30 +186,27 @@ cifs_reconnect(struct TCP_Server_Info *server) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; + server->lstrp = jiffies; + mutex_unlock(&server->srv_mutex); + /* mark submitted MIDs for retry and issue callback */ + cFYI(1, "%s: issuing mid callbacks", __func__); spin_lock(&GlobalMid_Lock); - list_for_each(tmp, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct - mid_q_entry, - qhead); - if (mid_entry->midState == MID_REQUEST_SUBMITTED) { - /* Mark other intransit requests as needing - retry so we do not immediately mark the - session bad again (ie after we reconnect - below) as they timeout too */ + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + if (mid_entry->midState == MID_REQUEST_SUBMITTED) mid_entry->midState = MID_RETRY_NEEDED; - } + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); } spin_unlock(&GlobalMid_Lock); - mutex_unlock(&server->srv_mutex); while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood)) { try_to_freeze(); - if (server->addr.sockAddr6.sin6_family == AF_INET6) - rc = ipv6_connect(server); - else - rc = ipv4_connect(server); + + /* we should try only the port we connected to before */ + rc = generic_ip_connect(server); if (rc) { cFYI(1, "reconnect error %d", rc); msleep(3000); @@ -213,10 +216,9 @@ cifs_reconnect(struct TCP_Server_Info *server) if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsGood; spin_unlock(&GlobalMid_Lock); - /* atomic_set(&server->inFlight,0);*/ - wake_up(&server->response_q); } } + return rc; } @@ -230,9 +232,8 @@ cifs_reconnect(struct TCP_Server_Info *server) static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) { 
struct smb_t2_rsp *pSMBt; - int total_data_size; - int data_in_this_rsp; int remaining; + __u16 total_data_size, data_in_this_rsp; if (pSMB->Command != SMB_COM_TRANSACTION2) return 0; @@ -246,8 +247,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) pSMBt = (struct smb_t2_rsp *)pSMB; - total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); - data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount); + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); remaining = total_data_size - data_in_this_rsp; @@ -273,21 +274,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) { struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; - int total_data_size; - int total_in_buf; - int remaining; - int total_in_buf2; char *data_area_of_target; char *data_area_of_buf2; - __u16 byte_count; + int remaining; + __u16 byte_count, total_data_size, total_in_buf, total_in_buf2; - total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); + total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); - if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { + if (total_data_size != + get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount)) cFYI(1, "total data size of primary and secondary t2 differ"); - } - total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); + total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); remaining = total_data_size - total_in_buf; @@ -297,28 +295,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) if (remaining == 0) /* nothing to do, ignore */ return 0; - total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); + total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount); if (remaining < total_in_buf2) { cFYI(1, "transact2 2nd response contains too much data"); } /* find end of first SMB data area */ 
data_area_of_target = (char *)&pSMBt->hdr.Protocol + - le16_to_cpu(pSMBt->t2_rsp.DataOffset); + get_unaligned_le16(&pSMBt->t2_rsp.DataOffset); /* validate target area */ - data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol + - le16_to_cpu(pSMB2->t2_rsp.DataOffset); + data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol + + get_unaligned_le16(&pSMB2->t2_rsp.DataOffset); data_area_of_target += total_in_buf; /* copy second buffer into end of first buffer */ memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); total_in_buf += total_in_buf2; - pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf); - byte_count = le16_to_cpu(BCC_LE(pTargetSMB)); + put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); + byte_count = get_bcc_le(pTargetSMB); byte_count += total_in_buf2; - BCC_LE(pTargetSMB) = cpu_to_le16(byte_count); + put_bcc_le(byte_count, pTargetSMB); byte_count = pTargetSMB->smb_buf_length; byte_count += total_in_buf2; @@ -332,7 +330,26 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) return 0; /* we are done */ } else /* more responses to go */ return 1; +} + +static void +cifs_echo_request(struct work_struct *work) +{ + int rc; + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, echo.work); + /* no need to ping if we got a response recently */ + if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + goto requeue_echo; + + rc = CIFSSMBEcho(server); + if (rc) + cFYI(1, "Unable to send echo request to server: %s", + server->hostname); + +requeue_echo: + queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); } static int @@ -346,8 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) struct msghdr smb_msg; struct kvec iov; struct socket *csocket = server->ssocket; - struct list_head *tmp; - struct cifsSesInfo *ses; + struct list_head *tmp, *tmp2; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; char temp; @@ -400,7 +416,20 @@ 
cifs_demultiplex_thread(struct TCP_Server_Info *server) smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; pdu_length = 4; /* enough to get RFC1001 header */ + incomplete_rcv: + if (echo_retries > 0 && + time_after(jiffies, server->lstrp + + (echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " + "Reconnecting...", server->hostname, + (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; + } + length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, pdu_length, 0 /* BB other flags? */); @@ -477,7 +506,7 @@ incomplete_rcv: * initialize frame) */ cifs_set_port((struct sockaddr *) - &server->addr.sockAddr, CIFS_PORT); + &server->dstaddr, CIFS_PORT); cifs_reconnect(server); csocket = server->ssocket; wake_up(&server->response_q); @@ -560,10 +589,11 @@ incomplete_rcv: continue; } + mid_entry = NULL; + server->lstrp = jiffies; - task_to_wake = NULL; spin_lock(&GlobalMid_Lock); - list_for_each(tmp, &server->pending_mid_q) { + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); if ((mid_entry->mid == smb_buffer->Mid) && @@ -604,20 +634,19 @@ incomplete_rcv: mid_entry->resp_buf = smb_buffer; mid_entry->largeBuf = isLargeBuf; multi_t2_fnd: - task_to_wake = mid_entry->tsk; mid_entry->midState = MID_RESPONSE_RECEIVED; + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); #ifdef CONFIG_CIFS_STATS2 mid_entry->when_received = jiffies; #endif - /* so we do not time out requests to server - which is still responding (since server could - be busy but not dead) */ - server->lstrp = jiffies; break; } + mid_entry = NULL; } spin_unlock(&GlobalMid_Lock); - if (task_to_wake) { + + if (mid_entry != NULL) { /* Was previous buf put in mpx struct for multi-rsp? 
*/ if (!isMultiRsp) { /* smb buffer will be freed by user thread */ @@ -626,11 +655,10 @@ multi_t2_fnd: else smallbuf = NULL; } - wake_up_process(task_to_wake); } else if (!is_valid_oplock_break(smb_buffer, server) && !isMultiRsp) { cERROR(1, "No task to wake, unknown frame received! " - "NumMids %d", midCount.counter); + "NumMids %d", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", (char *)smb_buffer, sizeof(struct smb_hdr)); #ifdef CONFIG_CIFS_DEBUG2 @@ -678,44 +706,16 @@ multi_t2_fnd: if (smallbuf) /* no sense logging a debug message if NULL */ cifs_small_buf_release(smallbuf); - /* - * BB: we shouldn't have to do any of this. It shouldn't be - * possible to exit from the thread with active SMB sessions - */ - spin_lock(&cifs_tcp_ses_lock); - if (list_empty(&server->pending_mid_q)) { - /* loop through server session structures attached to this and - mark them dead */ - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, - smb_ses_list); - ses->status = CifsExiting; - ses->server = NULL; - } - spin_unlock(&cifs_tcp_ses_lock); - } else { - /* although we can not zero the server struct pointer yet, - since there are active requests which may depnd on them, - mark the corresponding SMB sessions as exiting too */ - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, - smb_ses_list); - ses->status = CifsExiting; - } - + if (!list_empty(&server->pending_mid_q)) { spin_lock(&GlobalMid_Lock); - list_for_each(tmp, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - if (mid_entry->midState == MID_REQUEST_SUBMITTED) { - cFYI(1, "Clearing Mid 0x%x - waking up ", + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cFYI(1, "Clearing Mid 0x%x - issuing callback", mid_entry->mid); - task_to_wake = mid_entry->tsk; - if (task_to_wake) - wake_up_process(task_to_wake); - } + 
list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); } spin_unlock(&GlobalMid_Lock); - spin_unlock(&cifs_tcp_ses_lock); /* 1/8th of sec is more than enough time for them to exit */ msleep(125); } @@ -733,18 +733,6 @@ multi_t2_fnd: coming home not much else we can do but free the memory */ } - /* last chance to mark ses pointers invalid - if there are any pointing to this (e.g - if a crazy root user tried to kill cifsd - kernel thread explicitly this might happen) */ - /* BB: This shouldn't be necessary, see above */ - spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); - ses->server = NULL; - } - spin_unlock(&cifs_tcp_ses_lock); - kfree(server->hostname); task_to_wake = xchg(&server->tsk, NULL); kfree(server); @@ -817,11 +805,11 @@ cifs_parse_mount_options(char *options, const char *devname, * informational, only used for servers that do not support * port 445 and it can be overridden at mount time */ - memset(vol->source_rfc1001_name, 0x20, 15); - for (i = 0; i < strnlen(nodename, 15); i++) + memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); + for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) vol->source_rfc1001_name[i] = toupper(nodename[i]); - vol->source_rfc1001_name[15] = 0; + vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0; /* null target name indicates to use *SMBSERVR default called name if we end up sending RFC1001 session initialize */ vol->target_rfc1001_name[0] = 0; @@ -985,13 +973,11 @@ cifs_parse_mount_options(char *options, const char *devname, return 1; } else if (strnicmp(value, "krb5", 4) == 0) { vol->secFlg |= CIFSSEC_MAY_KRB5; -#ifdef CONFIG_CIFS_EXPERIMENTAL } else if (strnicmp(value, "ntlmsspi", 8) == 0) { vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN; } else if (strnicmp(value, "ntlmssp", 7) == 0) { vol->secFlg |= CIFSSEC_MAY_NTLMSSP; -#endif } else if (strnicmp(value, "ntlmv2i", 7) == 0) { vol->secFlg |= 
CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN; @@ -1116,6 +1102,8 @@ cifs_parse_mount_options(char *options, const char *devname, } else if (!strnicmp(data, "uid", 3) && value && *value) { vol->linux_uid = simple_strtoul(value, &value, 0); uid_specified = true; + } else if (!strnicmp(data, "cruid", 5) && value && *value) { + vol->cred_uid = simple_strtoul(value, &value, 0); } else if (!strnicmp(data, "forceuid", 8)) { override_uid = 1; } else if (!strnicmp(data, "noforceuid", 10)) { @@ -1168,22 +1156,22 @@ cifs_parse_mount_options(char *options, const char *devname, if (!value || !*value || (*value == ' ')) { cFYI(1, "invalid (empty) netbiosname"); } else { - memset(vol->source_rfc1001_name, 0x20, 15); - for (i = 0; i < 15; i++) { - /* BB are there cases in which a comma can be - valid in this workstation netbios name (and need - special handling)? */ - - /* We do not uppercase netbiosname for user */ + memset(vol->source_rfc1001_name, 0x20, + RFC1001_NAME_LEN); + /* + * FIXME: are there cases in which a comma can + * be valid in workstation netbios name (and + * need special handling)? 
+ */ + for (i = 0; i < RFC1001_NAME_LEN; i++) { + /* don't ucase netbiosname for user */ if (value[i] == 0) break; - else - vol->source_rfc1001_name[i] = - value[i]; + vol->source_rfc1001_name[i] = value[i]; } /* The string has 16th byte zero still from set at top of the function */ - if ((i == 15) && (value[i] != 0)) + if (i == RFC1001_NAME_LEN && value[i] != 0) printk(KERN_WARNING "CIFS: netbiosname" " longer than 15 truncated.\n"); } @@ -1193,7 +1181,8 @@ cifs_parse_mount_options(char *options, const char *devname, cFYI(1, "empty server netbiosname specified"); } else { /* last byte, type, is 0x20 for servr type */ - memset(vol->target_rfc1001_name, 0x20, 16); + memset(vol->target_rfc1001_name, 0x20, + RFC1001_NAME_LEN_WITH_NULL); for (i = 0; i < 15; i++) { /* BB are there cases in which a comma can be @@ -1210,7 +1199,7 @@ cifs_parse_mount_options(char *options, const char *devname, } /* The string has 16th byte zero still from set at top of the function */ - if ((i == 15) && (value[i] != 0)) + if (i == RFC1001_NAME_LEN && value[i] != 0) printk(KERN_WARNING "CIFS: server net" "biosname longer than 15 truncated.\n"); } @@ -1341,10 +1330,8 @@ cifs_parse_mount_options(char *options, const char *devname, vol->no_psx_acl = 0; } else if (strnicmp(data, "noacl", 5) == 0) { vol->no_psx_acl = 1; -#ifdef CONFIG_CIFS_EXPERIMENTAL } else if (strnicmp(data, "locallease", 6) == 0) { vol->local_lease = 1; -#endif } else if (strnicmp(data, "sign", 4) == 0) { vol->secFlg |= CIFSSEC_MUST_SIGN; } else if (strnicmp(data, "seal", 4) == 0) { @@ -1454,35 +1441,71 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) } } +/* + * If no port is specified in addr structure, we try to match with 445 port + * and if it fails - with 139 ports. It should be called only if address + * families of server and addr are equal. 
+ */ +static bool +match_port(struct TCP_Server_Info *server, struct sockaddr *addr) +{ + unsigned short int port, *sport; + + switch (addr->sa_family) { + case AF_INET: + sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port; + port = ((struct sockaddr_in *) addr)->sin_port; + break; + case AF_INET6: + sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port; + port = ((struct sockaddr_in6 *) addr)->sin6_port; + break; + default: + WARN_ON(1); + return false; + } + + if (!port) { + port = htons(CIFS_PORT); + if (port == *sport) + return true; + + port = htons(RFC1001_PORT); + } + + return port == *sport; +} static bool match_address(struct TCP_Server_Info *server, struct sockaddr *addr, struct sockaddr *srcaddr) { - struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; - switch (addr->sa_family) { - case AF_INET: - if (addr4->sin_addr.s_addr != - server->addr.sockAddr.sin_addr.s_addr) - return false; - if (addr4->sin_port && - addr4->sin_port != server->addr.sockAddr.sin_port) + case AF_INET: { + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + struct sockaddr_in *srv_addr4 = + (struct sockaddr_in *)&server->dstaddr; + + if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr) return false; break; - case AF_INET6: + } + case AF_INET6: { + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + struct sockaddr_in6 *srv_addr6 = + (struct sockaddr_in6 *)&server->dstaddr; + if (!ipv6_addr_equal(&addr6->sin6_addr, - &server->addr.sockAddr6.sin6_addr)) - return false; - if (addr6->sin6_scope_id != - server->addr.sockAddr6.sin6_scope_id) + &srv_addr6->sin6_addr)) return false; - if (addr6->sin6_port && - addr6->sin6_port != server->addr.sockAddr6.sin6_port) + if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id) return false; break; } + default: + WARN_ON(1); + return false; /* don't expect to be here */ + } if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr)) return 
false; @@ -1545,10 +1568,16 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) + continue; + if (!match_address(server, addr, (struct sockaddr *)&vol->srcaddr)) continue; + if (!match_port(server, addr)) + continue; + if (!match_security(server, vol)) continue; @@ -1572,9 +1601,13 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) return; } + put_net(cifs_net_ns(server)); + list_del_init(&server->tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); + cancel_delayed_work_sync(&server->echo); + spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); @@ -1644,6 +1677,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) goto out_err; } + cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); tcp_ses->hostname = extract_hostname(volume_info->UNC); if (IS_ERR(tcp_ses->hostname)) { rc = PTR_ERR(tcp_ses->hostname); @@ -1664,8 +1698,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; + tcp_ses->lstrp = jiffies; INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); + INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); /* * at this point we are the only ones with the pointer @@ -1681,14 +1717,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cFYI(1, "attempting ipv6 connect"); /* BB should we allow ipv6 on port 139? 
*/ /* other OS never observed in Wild doing 139 with v6 */ - memcpy(&tcp_ses->addr.sockAddr6, sin_server6, - sizeof(struct sockaddr_in6)); - rc = ipv6_connect(tcp_ses); - } else { - memcpy(&tcp_ses->addr.sockAddr, sin_server, - sizeof(struct sockaddr_in)); - rc = ipv4_connect(tcp_ses); - } + memcpy(&tcp_ses->dstaddr, sin_server6, + sizeof(struct sockaddr_in6)); + } else + memcpy(&tcp_ses->dstaddr, sin_server, + sizeof(struct sockaddr_in)); + + rc = ip_connect(tcp_ses); if (rc < 0) { cERROR(1, "Error connecting to socket. Aborting operation"); goto out_err_crypto_release; @@ -1715,11 +1750,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cifs_fscache_get_client_cookie(tcp_ses); + /* queue echo request delayed work */ + queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + return tcp_ses; out_err_crypto_release: cifs_crypto_shash_release(tcp_ses); + put_net(cifs_net_ns(tcp_ses)); + out_err: if (tcp_ses) { if (!IS_ERR(tcp_ses->hostname)) @@ -1793,6 +1833,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) { int rc = -ENOMEM, xid; struct cifsSesInfo *ses; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; xid = GetXid(); @@ -1836,12 +1878,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) /* new SMB session uses our server ref */ ses->server = server; - if (server->addr.sockAddr6.sin6_family == AF_INET6) - sprintf(ses->serverName, "%pI6", - &server->addr.sockAddr6.sin6_addr); + if (server->dstaddr.ss_family == AF_INET6) + sprintf(ses->serverName, "%pI6", &addr6->sin6_addr); else - sprintf(ses->serverName, "%pI4", - &server->addr.sockAddr.sin_addr.s_addr); + sprintf(ses->serverName, "%pI4", &addr->sin_addr); if (volume_info->username) strncpy(ses->userName, volume_info->username, @@ -2136,19 +2176,106 @@ bind_socket(struct TCP_Server_Info *server) } static int -ipv4_connect(struct 
TCP_Server_Info *server) +ip_rfc1001_connect(struct TCP_Server_Info *server) +{ + int rc = 0; + /* + * some servers require RFC1001 sessinit before sending + * negprot - BB check reconnection in case where second + * sessinit is sent but no second negprot + */ + struct rfc1002_session_packet *ses_init_buf; + struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), + GFP_KERNEL); + if (ses_init_buf) { + ses_init_buf->trailer.session_req.called_len = 32; + + if (server->server_RFC1001_name && + server->server_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + server->server_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.called_name, + DEFAULT_CIFS_CALLED_NAME, + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.calling_len = 32; + + /* + * calling name ends in null (byte 16) from old smb + * convention. + */ + if (server->workstation_RFC1001_name && + server->workstation_RFC1001_name[0] != 0) + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + server->workstation_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(ses_init_buf->trailer. + session_req.calling_name, + "LINUX_CIFS_CLNT", + RFC1001_NAME_LEN_WITH_NULL); + + ses_init_buf->trailer.session_req.scope1 = 0; + ses_init_buf->trailer.session_req.scope2 = 0; + smb_buf = (struct smb_hdr *)ses_init_buf; + + /* sizeof RFC1002_SESSION_REQUEST with no scope */ + smb_buf->smb_buf_length = 0x81000044; + rc = smb_send(server, smb_buf, 0x44); + kfree(ses_init_buf); + /* + * RFC1001 layer in at least one server + * requires very short break before negprot + * presumably because not expecting negprot + * to follow so fast. 
This is a simple + * solution that works without + * complicating the code and causes no + * significant slowing down on mount + * for everyone else + */ + usleep_range(1000, 2000); + } + /* + * else the negprot may still work without this + * even though malloc failed + */ + + return rc; +} + +static int +generic_ip_connect(struct TCP_Server_Info *server) { int rc = 0; - int val; - bool connected = false; - __be16 orig_port = 0; + unsigned short int sport; + int slen, sfamily; struct socket *socket = server->ssocket; + struct sockaddr *saddr; + + saddr = (struct sockaddr *) &server->dstaddr; + + if (server->dstaddr.ss_family == AF_INET6) { + sport = ((struct sockaddr_in6 *) saddr)->sin6_port; + slen = sizeof(struct sockaddr_in6); + sfamily = AF_INET6; + } else { + sport = ((struct sockaddr_in *) saddr)->sin_port; + slen = sizeof(struct sockaddr_in); + sfamily = AF_INET; + } if (socket == NULL) { - rc = sock_create_kern(PF_INET, SOCK_STREAM, - IPPROTO_TCP, &socket); + rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, + IPPROTO_TCP, &socket, 1); if (rc < 0) { cERROR(1, "Error %d creating socket", rc); + server->ssocket = NULL; return rc; } @@ -2156,63 +2283,28 @@ ipv4_connect(struct TCP_Server_Info *server) cFYI(1, "Socket created"); server->ssocket = socket; socket->sk->sk_allocation = GFP_NOFS; - cifs_reclassify_socket4(socket); + if (sfamily == AF_INET6) + cifs_reclassify_socket6(socket); + else + cifs_reclassify_socket4(socket); } rc = bind_socket(server); if (rc < 0) return rc; - /* user overrode default port */ - if (server->addr.sockAddr.sin_port) { - rc = socket->ops->connect(socket, (struct sockaddr *) - &server->addr.sockAddr, - sizeof(struct sockaddr_in), 0); - if (rc >= 0) - connected = true; - } - - if (!connected) { - /* save original port so we can retry user specified port - later if fall back ports fail this time */ - orig_port = server->addr.sockAddr.sin_port; - - /* do not retry on the same port we just failed on */ - if 
(server->addr.sockAddr.sin_port != htons(CIFS_PORT)) { - server->addr.sockAddr.sin_port = htons(CIFS_PORT); - rc = socket->ops->connect(socket, - (struct sockaddr *) - &server->addr.sockAddr, - sizeof(struct sockaddr_in), 0); - if (rc >= 0) - connected = true; - } - } - if (!connected) { - server->addr.sockAddr.sin_port = htons(RFC1001_PORT); - rc = socket->ops->connect(socket, (struct sockaddr *) - &server->addr.sockAddr, - sizeof(struct sockaddr_in), 0); - if (rc >= 0) - connected = true; - } - - /* give up here - unless we want to retry on different - protocol families some day */ - if (!connected) { - if (orig_port) - server->addr.sockAddr.sin_port = orig_port; - cFYI(1, "Error %d connecting to server via ipv4", rc); + rc = socket->ops->connect(socket, saddr, slen, 0); + if (rc < 0) { + cFYI(1, "Error %d connecting to server", rc); sock_release(socket); server->ssocket = NULL; return rc; } - /* * Eventually check for other socket options to change from - * the default. sock_setsockopt not used because it expects - * user space buffer + * the default. 
sock_setsockopt not used because it expects + * user space buffer */ socket->sk->sk_rcvtimeo = 7 * HZ; socket->sk->sk_sndtimeo = 5 * HZ; @@ -2226,7 +2318,7 @@ ipv4_connect(struct TCP_Server_Info *server) } if (server->tcp_nodelay) { - val = 1; + int val = 1; rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, (char *)&val, sizeof(val)); if (rc) @@ -2237,161 +2329,39 @@ ipv4_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - /* send RFC1001 sessinit */ - if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { - /* some servers require RFC1001 sessinit before sending - negprot - BB check reconnection in case where second - sessinit is sent but no second negprot */ - struct rfc1002_session_packet *ses_init_buf; - struct smb_hdr *smb_buf; - ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), - GFP_KERNEL); - if (ses_init_buf) { - ses_init_buf->trailer.session_req.called_len = 32; - if (server->server_RFC1001_name && - server->server_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - server->server_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - DEFAULT_CIFS_CALLED_NAME, - RFC1001_NAME_LEN_WITH_NULL); - - ses_init_buf->trailer.session_req.calling_len = 32; - - /* calling name ends in null (byte 16) from old smb - convention. */ - if (server->workstation_RFC1001_name && - server->workstation_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - server->workstation_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. 
- session_req.calling_name, - "LINUX_CIFS_CLNT", - RFC1001_NAME_LEN_WITH_NULL); - - ses_init_buf->trailer.session_req.scope1 = 0; - ses_init_buf->trailer.session_req.scope2 = 0; - smb_buf = (struct smb_hdr *)ses_init_buf; - /* sizeof RFC1002_SESSION_REQUEST with no scope */ - smb_buf->smb_buf_length = 0x81000044; - rc = smb_send(server, smb_buf, 0x44); - kfree(ses_init_buf); - msleep(1); /* RFC1001 layer in at least one server - requires very short break before negprot - presumably because not expecting negprot - to follow so fast. This is a simple - solution that works without - complicating the code and causes no - significant slowing down on mount - for everyone else */ - } - /* else the negprot may still work without this - even though malloc failed */ - - } + if (sport == htons(RFC1001_PORT)) + rc = ip_rfc1001_connect(server); return rc; } static int -ipv6_connect(struct TCP_Server_Info *server) +ip_connect(struct TCP_Server_Info *server) { - int rc = 0; - int val; - bool connected = false; - __be16 orig_port = 0; - struct socket *socket = server->ssocket; + unsigned short int *sport; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; - if (socket == NULL) { - rc = sock_create_kern(PF_INET6, SOCK_STREAM, - IPPROTO_TCP, &socket); - if (rc < 0) { - cERROR(1, "Error %d creating ipv6 socket", rc); - socket = NULL; - return rc; - } + if (server->dstaddr.ss_family == AF_INET6) + sport = &addr6->sin6_port; + else + sport = &addr->sin_port; - /* BB other socket options to set KEEPALIVE, NODELAY? 
*/ - cFYI(1, "ipv6 Socket created"); - server->ssocket = socket; - socket->sk->sk_allocation = GFP_NOFS; - cifs_reclassify_socket6(socket); - } + if (*sport == 0) { + int rc; - rc = bind_socket(server); - if (rc < 0) - return rc; + /* try with 445 port at first */ + *sport = htons(CIFS_PORT); - /* user overrode default port */ - if (server->addr.sockAddr6.sin6_port) { - rc = socket->ops->connect(socket, - (struct sockaddr *) &server->addr.sockAddr6, - sizeof(struct sockaddr_in6), 0); + rc = generic_ip_connect(server); if (rc >= 0) - connected = true; - } - - if (!connected) { - /* save original port so we can retry user specified port - later if fall back ports fail this time */ - - orig_port = server->addr.sockAddr6.sin6_port; - /* do not retry on the same port we just failed on */ - if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) { - server->addr.sockAddr6.sin6_port = htons(CIFS_PORT); - rc = socket->ops->connect(socket, (struct sockaddr *) - &server->addr.sockAddr6, - sizeof(struct sockaddr_in6), 0); - if (rc >= 0) - connected = true; - } - } - if (!connected) { - server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT); - rc = socket->ops->connect(socket, (struct sockaddr *) - &server->addr.sockAddr6, - sizeof(struct sockaddr_in6), 0); - if (rc >= 0) - connected = true; - } - - /* give up here - unless we want to retry on different - protocol families some day */ - if (!connected) { - if (orig_port) - server->addr.sockAddr6.sin6_port = orig_port; - cFYI(1, "Error %d connecting to server via ipv6", rc); - sock_release(socket); - server->ssocket = NULL; - return rc; - } - - /* - * Eventually check for other socket options to change from - * the default. 
sock_setsockopt not used because it expects - * user space buffer - */ - socket->sk->sk_rcvtimeo = 7 * HZ; - socket->sk->sk_sndtimeo = 5 * HZ; + return rc; - if (server->tcp_nodelay) { - val = 1; - rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rc) - cFYI(1, "set TCP_NODELAY socket option error %d", rc); + /* if it failed, try with 139 port */ + *sport = htons(RFC1001_PORT); } - server->ssocket = socket; - - return rc; + return generic_ip_connect(server); } void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, @@ -2970,8 +2940,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, TCONX_RSP *pSMBr; unsigned char *bcc_ptr; int rc = 0; - int length, bytes_left; - __u16 count; + int length; + __u16 bytes_left, count; if (ses == NULL) return -EIO; @@ -2999,7 +2969,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; /* skip password */ /* already aligned so no need to do it below */ } else { - pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); + pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* BB FIXME add code to fail this if NTLMv2 or Kerberos specified as required (when that support is added to the vfs in the future) as only NTLM or the much @@ -3017,7 +2987,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, #endif /* CIFS_WEAK_PW_HASH */ SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); - bcc_ptr += CIFS_SESS_KEY_SIZE; + bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { /* must align unicode strings */ *bcc_ptr = 0; /* null byte password */ @@ -3055,7 +3025,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, pSMB->ByteCount = cpu_to_le16(count); rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, - CIFS_STD_OP); + 0); /* above now done in SendReceive */ if ((rc == 0) && (tcon != NULL)) { @@ -3065,7 +3035,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, tcon->need_reconnect = false; tcon->tid = 
smb_buffer_response->Tid; bcc_ptr = pByteArea(smb_buffer_response); - bytes_left = BCC(smb_buffer_response); + bytes_left = get_bcc(smb_buffer_response); length = strnlen(bcc_ptr, bytes_left - 2); if (smb_buffer->Flags2 & SMBFLG2_UNICODE) is_unicode = true; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3840eddbfb7..dd5f22918c3 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -130,17 +130,6 @@ cifs_bp_rename_retry: return full_path; } -static void setup_cifs_dentry(struct cifsTconInfo *tcon, - struct dentry *direntry, - struct inode *newinode) -{ - if (tcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; - d_instantiate(direntry, newinode); -} - /* Inode operations in similar order to how they appear in Linux file fs.h */ int @@ -293,10 +282,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, args.uid = NO_CHANGE_64; args.gid = NO_CHANGE_64; } - CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle, + current->tgid); } else { /* BB implement mode setting via Windows security descriptors e.g. 
*/ @@ -329,7 +316,7 @@ cifs_create_get_file_info: cifs_create_set_dentry: if (rc == 0) - setup_cifs_dentry(tcon, direntry, newinode); + d_instantiate(direntry, newinode); else cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); @@ -420,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; if (rc == 0) d_instantiate(direntry, newinode); @@ -603,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, parent_dir_inode->i_sb, xid, NULL); if ((rc == 0) && (newInode != NULL)) { - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; d_add(direntry, newInode); if (posix_open) { filp = lookup_instantiate_filp(nd, direntry, @@ -633,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } else if (rc == -ENOENT) { rc = 0; direntry->d_time = jiffies; - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; d_add(direntry, NULL); /* if it was once a directory (but how can we tell?) we could do shrink_dcache_parent(direntry); */ @@ -656,22 +631,37 @@ lookup_out: static int cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { - int isValid = 1; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; if (direntry->d_inode) { if (cifs_revalidate_dentry(direntry)) return 0; - } else { - cFYI(1, "neg dentry 0x%p name = %s", - direntry, direntry->d_name.name); - if (time_after(jiffies, direntry->d_time + HZ) || - !lookupCacheEnabled) { - d_drop(direntry); - isValid = 0; - } + else + return 1; } - return isValid; + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. 
+ */ + if (!nd) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + } + + if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) + return 0; + + return 1; } /* static int cifs_d_delete(struct dentry *direntry) @@ -685,12 +675,14 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) const struct dentry_operations cifs_dentry_ops = { .d_revalidate = cifs_d_revalidate, + .d_automount = cifs_dfs_d_automount, /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; -static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) +static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *q) { - struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; + struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; unsigned long hash; int i; @@ -703,21 +695,16 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) return 0; } -static int cifs_ci_compare(struct dentry *dentry, struct qstr *a, - struct qstr *b) +static int cifs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; - - if ((a->len == b->len) && - (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) { - /* - * To preserve case, don't let an existing negative dentry's - * case take precedence. 
If a is not a negative dentry, this - * should have no side effects - */ - memcpy((void *)a->name, b->name, a->len); + struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls; + + if ((name->len == len) && + (nls_strnicmp(codepage, name->name, str, len) == 0)) return 0; - } return 1; } @@ -725,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = { .d_revalidate = cifs_d_revalidate, .d_hash = cifs_ci_hash, .d_compare = cifs_ci_compare, + .d_automount = cifs_dfs_d_automount, }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5a28660ca2b..d7d65a70678 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -104,53 +104,6 @@ static inline int cifs_get_disposition(unsigned int flags) return FILE_OPEN; } -static inline int cifs_open_inode_helper(struct inode *inode, - struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf, - char *full_path, int xid) -{ - struct cifsInodeInfo *pCifsInode = CIFS_I(inode); - struct timespec temp; - int rc; - - if (pCifsInode->clientCanCacheRead) { - /* we have the inode open somewhere else - no need to discard cache data */ - goto client_can_cache; - } - - /* BB need same check in cifs_create too? */ - /* if not oplocked, invalidate inode pages if mtime or file - size changed */ - temp = cifs_NTtimeToUnix(buf->LastWriteTime); - if (timespec_equal(&inode->i_mtime, &temp) && - (inode->i_size == - (loff_t)le64_to_cpu(buf->EndOfFile))) { - cFYI(1, "inode unchanged on server"); - } else { - if (inode->i_mapping) { - /* BB no need to lock inode until after invalidate - since namei code should already have it locked? 
*/ - rc = filemap_write_and_wait(inode->i_mapping); - mapping_set_error(inode->i_mapping, rc); - } - cFYI(1, "invalidating remote inode since open detected it " - "changed"); - invalidate_remote_inode(inode); - } - -client_can_cache: - if (pTcon->unix_ext) - rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, - xid); - else - rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, - xid, NULL); - - cifs_set_oplock_level(pCifsInode, oplock); - - return rc; -} - int cifs_posix_open(char *full_path, struct inode **pinode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *poplock, __u16 *pnetfid, int xid) @@ -213,6 +166,76 @@ posix_open_ret: return rc; } +static int +cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock, + __u16 *pnetfid, int xid) +{ + int rc; + int desiredAccess; + int disposition; + FILE_ALL_INFO *buf; + + desiredAccess = cifs_convert_flags(f_flags); + +/********************************************************************* + * open flag mapping table: + * + * POSIX Flag CIFS Disposition + * ---------- ---------------- + * O_CREAT FILE_OPEN_IF + * O_CREAT | O_EXCL FILE_CREATE + * O_CREAT | O_TRUNC FILE_OVERWRITE_IF + * O_TRUNC FILE_OVERWRITE + * none of the above FILE_OPEN + * + * Note that there is not a direct match between disposition + * FILE_SUPERSEDE (ie create whether or not file exists although + * O_CREAT | O_TRUNC is similar but truncates the existing + * file rather than creating a new file as FILE_SUPERSEDE does + * (which uses the attributes / metadata passed in on open call) + *? + *? O_SYNC is a reasonable match to CIFS writethrough flag + *? and the read write flags match reasonably. O_LARGEFILE + *? is irrelevant because largefile support is always used + *? by this client. 
Flags O_APPEND, O_DIRECT, O_DIRECTORY, + * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation + *********************************************************************/ + + disposition = cifs_get_disposition(f_flags); + + /* BB pass O_SYNC flag through on file attributes .. BB */ + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + else + rc = SMBLegacyOpen(xid, tcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags + & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc) + goto out; + + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, + xid); + else + rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, + xid, pnetfid); + +out: + kfree(buf); + return rc; +} + struct cifsFileInfo * cifs_new_fileinfo(__u16 fileHandle, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -264,6 +287,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct inode *inode = cifs_file->dentry->d_inode; struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsLockInfo *li, *tmp; spin_lock(&cifs_file_list_lock); @@ -279,6 +303,13 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); + + /* in strict cache mode we need invalidate mapping on the last + close because it may cause a error when we open this file + again and get at least level II oplock */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + CIFS_I(inode)->invalid_mapping = true; + 
cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -317,10 +348,8 @@ int cifs_open(struct inode *inode, struct file *file) struct cifsFileInfo *pCifsFile = NULL; struct cifsInodeInfo *pCifsInode; char *full_path = NULL; - int desiredAccess; - int disposition; + bool posix_open_ok = false; __u16 netfid; - FILE_ALL_INFO *buf = NULL; xid = GetXid(); @@ -358,17 +387,7 @@ int cifs_open(struct inode *inode, struct file *file) file->f_flags, &oplock, &netfid, xid); if (rc == 0) { cFYI(1, "posix open succeeded"); - - pCifsFile = cifs_new_fileinfo(netfid, file, tlink, - oplock); - if (pCifsFile == NULL) { - CIFSSMBClose(xid, tcon, netfid); - rc = -ENOMEM; - } - - cifs_fscache_set_inode_cookie(inode, file); - - goto out; + posix_open_ok = true; } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) cERROR(1, "server %s of type %s returned" @@ -385,103 +404,39 @@ int cifs_open(struct inode *inode, struct file *file) or DFS errors */ } - desiredAccess = cifs_convert_flags(file->f_flags); - -/********************************************************************* - * open flag mapping table: - * - * POSIX Flag CIFS Disposition - * ---------- ---------------- - * O_CREAT FILE_OPEN_IF - * O_CREAT | O_EXCL FILE_CREATE - * O_CREAT | O_TRUNC FILE_OVERWRITE_IF - * O_TRUNC FILE_OVERWRITE - * none of the above FILE_OPEN - * - * Note that there is not a direct match between disposition - * FILE_SUPERSEDE (ie create whether or not file exists although - * O_CREAT | O_TRUNC is similar but truncates the existing - * file rather than creating a new file as FILE_SUPERSEDE does - * (which uses the attributes / metadata passed in on open call) - *? - *? O_SYNC is a reasonable match to CIFS writethrough flag - *? and the read write flags match reasonably. O_LARGEFILE - *? is irrelevant because largefile support is always used - *? by this client. 
Flags O_APPEND, O_DIRECT, O_DIRECTORY, - * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation - *********************************************************************/ - - disposition = cifs_get_disposition(file->f_flags); - - /* BB pass O_SYNC flag through on file attributes .. BB */ - - /* Also refresh inode by passing in file_info buf returned by SMBOpen - and calling get_inode_info with returned buf (at least helps - non-Unix server case) */ - - /* BB we can not do this if this is the second open of a file - and the first handle has writebehind data, we might be - able to simply do a filemap_fdatawrite/filemap_fdatawait first */ - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } - - if (tcon->ses->capabilities & CAP_NT_SMBS) - rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); - else - rc = -EIO; /* no NT SMB support fall into legacy open below */ - - if (rc == -EIO) { - /* Old server, try legacy style OpenX */ - rc = SMBLegacyOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); - } - if (rc) { - cFYI(1, "cifs_open returned 0x%x", rc); - goto out; + if (!posix_open_ok) { + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, + file->f_flags, &oplock, &netfid, xid); + if (rc) + goto out; } - rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid); - if (rc != 0) - goto out; - pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock); if (pCifsFile == NULL) { + CIFSSMBClose(xid, tcon, netfid); rc = -ENOMEM; goto out; } cifs_fscache_set_inode_cookie(inode, file); - if (oplock & CIFS_CREATE_ACTION) { + if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { /* time to set mode which we can not set earlier due to problems 
creating new read-only files */ - if (tcon->unix_ext) { - struct cifs_unix_set_info_args args = { - .mode = inode->i_mode, - .uid = NO_CHANGE_64, - .gid = NO_CHANGE_64, - .ctime = NO_CHANGE_64, - .atime = NO_CHANGE_64, - .mtime = NO_CHANGE_64, - .device = 0, - }; - CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - } + struct cifs_unix_set_info_args args = { + .mode = inode->i_mode, + .uid = NO_CHANGE_64, + .gid = NO_CHANGE_64, + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = 0, + }; + CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid, + pCifsFile->pid); } out: - kfree(buf); kfree(full_path); FreeXid(xid); cifs_put_tlink(tlink); @@ -779,12 +734,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) /* BB we could chain these into one lock request BB */ rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, - 0, 1, lockType, 0 /* wait flag */ ); + 0, 1, lockType, 0 /* wait flag */, 0); if (rc == 0) { rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 1 /* numUnlock */ , 0 /* numLock */ , lockType, - 0 /* wait flag */ ); + 0 /* wait flag */, 0); pfLock->fl_type = F_UNLCK; if (rc != 0) cERROR(1, "Error unlocking previously locked " @@ -801,13 +756,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 0, 1, lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */); + 0 /* wait flag */, 0); if (rc == 0) { rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 1, 0, lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */); + 0 /* wait flag */, 0); pfLock->fl_type = F_RDLCK; if (rc != 0) cERROR(1, "Error unlocking " @@ -850,8 +805,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) if (numLock) { rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, - 0, numLock, lockType, wait_flag); + 
pfLock->fl_start, 0, numLock, lockType, + wait_flag, 0); if (rc == 0) { /* For Windows locks we must store them. */ @@ -871,9 +826,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) (pfLock->fl_start + length) >= (li->offset + li->length)) { stored_rc = CIFSSMBLock(xid, tcon, - netfid, - li->length, li->offset, - 1, 0, li->type, false); + netfid, li->length, + li->offset, 1, 0, + li->type, false, 0); if (stored_rc) rc = stored_rc; else { @@ -892,29 +847,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) return rc; } -/* - * Set the timeout on write requests past EOF. For some servers (Windows) - * these calls can be very long. - * - * If we're writing >10M past the EOF we give a 180s timeout. Anything less - * than that gets a 45s timeout. Writes not past EOF get 15s timeouts. - * The 10M cutoff is totally arbitrary. A better scheme for this would be - * welcome if someone wants to suggest one. - * - * We may be able to do a better job with this if there were some way to - * declare that a file should be sparse. 
- */ -static int -cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset) -{ - if (offset <= cifsi->server_eof) - return CIFS_STD_OP; - else if (offset > (cifsi->server_eof + (10 * 1024 * 1024))) - return CIFS_VLONG_OP; - else - return CIFS_LONG_OP; -} - /* update the file size (if needed) after a write */ static void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, @@ -935,7 +867,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, unsigned int total_written; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - int xid, long_op; + int xid; struct cifsFileInfo *open_file; struct cifsInodeInfo *cifsi = CIFS_I(inode); @@ -956,7 +888,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, xid = GetXid(); - long_op = cifs_write_timeout(cifsi, *poffset); for (total_written = 0; write_size > total_written; total_written += bytes_written) { rc = -EAGAIN; @@ -984,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, min_t(const int, cifs_sb->wsize, write_size - total_written), *poffset, &bytes_written, - NULL, write_data + total_written, long_op); + NULL, write_data + total_written, 0); } if (rc || (bytes_written == 0)) { if (total_written) @@ -997,8 +928,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, cifs_update_eof(cifsi, *poffset, bytes_written); *poffset += bytes_written; } - long_op = CIFS_STD_OP; /* subsequent writes fast - - 15 seconds is plenty */ } cifs_stats_bytes_written(pTcon, total_written); @@ -1027,7 +956,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, unsigned int total_written; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - int xid, long_op; + int xid; struct dentry *dentry = open_file->dentry; struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); @@ -1040,7 +969,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, xid = GetXid(); - long_op = cifs_write_timeout(cifsi, *poffset); for 
(total_written = 0; write_size > total_written; total_written += bytes_written) { rc = -EAGAIN; @@ -1070,7 +998,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, len, *poffset, &bytes_written, - iov, 1, long_op); + iov, 1, 0); } else rc = CIFSSMBWrite(xid, pTcon, open_file->netfid, @@ -1078,7 +1006,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, write_size - total_written), *poffset, &bytes_written, write_data + total_written, - NULL, long_op); + NULL, 0); } if (rc || (bytes_written == 0)) { if (total_written) @@ -1091,8 +1019,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, cifs_update_eof(cifsi, *poffset, bytes_written); *poffset += bytes_written; } - long_op = CIFS_STD_OP; /* subsequent writes fast - - 15 seconds is plenty */ } cifs_stats_bytes_written(pTcon, total_written); @@ -1292,7 +1218,7 @@ static int cifs_writepages(struct address_space *mapping, struct pagevec pvec; int rc = 0; int scanned = 0; - int xid, long_op; + int xid; cifs_sb = CIFS_SB(mapping->host->i_sb); @@ -1430,43 +1356,67 @@ retry: break; } if (n_iov) { +retry_write: open_file = find_writable_file(CIFS_I(mapping->host), false); if (!open_file) { cERROR(1, "No writable handles for inode"); rc = -EBADF; } else { - long_op = cifs_write_timeout(cifsi, offset); rc = CIFSSMBWrite2(xid, tcon, open_file->netfid, bytes_to_write, offset, &bytes_written, iov, n_iov, - long_op); + 0); cifsFileInfo_put(open_file); - cifs_update_eof(cifsi, offset, bytes_written); } - if (rc || bytes_written < bytes_to_write) { - cERROR(1, "Write2 ret %d, wrote %d", - rc, bytes_written); - mapping_set_error(mapping, rc); - } else { + cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written); + + /* + * For now, treat a short write as if nothing got + * written. A zero length write however indicates + * ENOSPC or EFBIG. We have no way to know which + * though, so call it ENOSPC for now. EFBIG would + * get translated to AS_EIO anyway. 
+ * + * FIXME: make it take into account the data that did + * get written + */ + if (rc == 0) { + if (bytes_written == 0) + rc = -ENOSPC; + else if (bytes_written < bytes_to_write) + rc = -EAGAIN; + } + + /* retry on data-integrity flush */ + if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) + goto retry_write; + + /* fix the stats and EOF */ + if (bytes_written > 0) { cifs_stats_bytes_written(tcon, bytes_written); + cifs_update_eof(cifsi, offset, bytes_written); } for (i = 0; i < n_iov; i++) { page = pvec.pages[first + i]; - /* Should we also set page error on - success rc but too little data written? */ - /* BB investigate retry logic on temporary - server crash cases and how recovery works - when page marked as error */ - if (rc) + /* on retryable write error, redirty page */ + if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, page); + else if (rc != 0) SetPageError(page); kunmap(page); unlock_page(page); end_page_writeback(page); page_cache_release(page); } + + if (rc != -EAGAIN) + mapping_set_error(mapping, rc); + else + rc = 0; + if ((wbc->nr_to_write -= n_iov) <= 0) done = 1; index = next; @@ -1578,27 +1528,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, return rc; } -int cifs_fsync(struct file *file, int datasync) +int cifs_strict_fsync(struct file *file, int datasync) { int xid; int rc = 0; struct cifsTconInfo *tcon; struct cifsFileInfo *smbfile = file->private_data; struct inode *inode = file->f_path.dentry->d_inode; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); xid = GetXid(); cFYI(1, "Sync file - name: %s datasync: 0x%x", file->f_path.dentry->d_name.name, datasync); - rc = filemap_write_and_wait(inode->i_mapping); - if (rc == 0) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + if (!CIFS_I(inode)->clientCanCacheRead) + cifs_invalidate_mapping(inode); - tcon = tlink_tcon(smbfile->tlink); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) - rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); - } + 
tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); + + FreeXid(xid); + return rc; +} + +int cifs_fsync(struct file *file, int datasync) +{ + int xid; + int rc = 0; + struct cifsTconInfo *tcon; + struct cifsFileInfo *smbfile = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + + xid = GetXid(); + + cFYI(1, "Sync file - name: %s datasync: 0x%x", + file->f_path.dentry->d_name.name, datasync); + + tcon = tlink_tcon(smbfile->tlink); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); FreeXid(xid); return rc; @@ -1649,42 +1619,42 @@ int cifs_flush(struct file *file, fl_owner_t id) return rc; } -ssize_t cifs_user_read(struct file *file, char __user *read_data, - size_t read_size, loff_t *poffset) +static ssize_t +cifs_iovec_read(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *poffset) { - int rc = -EACCES; - unsigned int bytes_read = 0; - unsigned int total_read = 0; - unsigned int current_read_size; + int rc; + int xid; + unsigned int total_read, bytes_read = 0; + size_t len, cur_len; + int iov_offset = 0; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - int xid; struct cifsFileInfo *open_file; - char *smb_read_data; - char __user *current_offset; struct smb_com_read_rsp *pSMBr; + char *read_data; + + if (!nr_segs) + return 0; + + len = iov_length(iov, nr_segs); + if (!len) + return 0; xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } open_file = file->private_data; pTcon = tlink_tcon(open_file->tlink); if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); - for (total_read = 0, current_offset = read_data; - read_size > total_read; - total_read += bytes_read, current_offset += bytes_read) { - current_read_size = 
min_t(const int, read_size - total_read, - cifs_sb->rsize); + for (total_read = 0; total_read < len; total_read += bytes_read) { + cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); rc = -EAGAIN; - smb_read_data = NULL; + read_data = NULL; + while (rc == -EAGAIN) { int buf_type = CIFS_NO_BUFFER; if (open_file->invalidHandle) { @@ -1692,27 +1662,25 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, if (rc != 0) break; } - rc = CIFSSMBRead(xid, pTcon, - open_file->netfid, - current_read_size, *poffset, - &bytes_read, &smb_read_data, - &buf_type); - pSMBr = (struct smb_com_read_rsp *)smb_read_data; - if (smb_read_data) { - if (copy_to_user(current_offset, - smb_read_data + - 4 /* RFC1001 length field */ + - le16_to_cpu(pSMBr->DataOffset), - bytes_read)) + rc = CIFSSMBRead(xid, pTcon, open_file->netfid, + cur_len, *poffset, &bytes_read, + &read_data, &buf_type); + pSMBr = (struct smb_com_read_rsp *)read_data; + if (read_data) { + char *data_offset = read_data + 4 + + le16_to_cpu(pSMBr->DataOffset); + if (memcpy_toiovecend(iov, data_offset, + iov_offset, bytes_read)) rc = -EFAULT; - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); + cifs_small_buf_release(read_data); else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; + cifs_buf_release(read_data); + read_data = NULL; + iov_offset += bytes_read; } } + if (rc || (bytes_read == 0)) { if (total_read) { break; @@ -1725,13 +1693,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, *poffset += bytes_read; } } + FreeXid(xid); return total_read; } +ssize_t cifs_user_read(struct file *file, char __user *read_data, + size_t read_size, loff_t *poffset) +{ + struct iovec iov; + iov.iov_base = read_data; + iov.iov_len = read_size; + + return cifs_iovec_read(file, &iov, 1, poffset); +} + +static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + 
ssize_t read; + + read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); + if (read > 0) + iocb->ki_pos = pos; + + return read; +} + +ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode; + + inode = iocb->ki_filp->f_path.dentry->d_inode; + + if (CIFS_I(inode)->clientCanCacheRead) + return generic_file_aio_read(iocb, iov, nr_segs, pos); + + /* + * In strict cache mode we need to read from the server all the time + * if we don't have level II oplock because the server can delay mtime + * change - so we can't make a decision about inode invalidating. + * And we can also fail with pagereading if there are mandatory locks + * on pages affected by this read but not on the region from pos to + * pos+len-1. + */ + + return cifs_user_readv(iocb, iov, nr_segs, pos); +} static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, - loff_t *poffset) + loff_t *poffset) { int rc = -EACCES; unsigned int bytes_read = 0; @@ -1799,6 +1811,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, return total_read; } +int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc, xid; + struct inode *inode = file->f_path.dentry->d_inode; + + xid = GetXid(); + + if (!CIFS_I(inode)->clientCanCacheRead) + cifs_invalidate_mapping(inode); + + rc = generic_file_mmap(file, vma); + FreeXid(xid); + return rc; +} + int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) { int rc, xid; @@ -2245,7 +2272,8 @@ void cifs_oplock_break(struct work_struct *work) */ if (!cfile->oplock_break_cancelled) { rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, - 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false); + 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + cinode->clientCanCacheRead ? 
1 : 0); cFYI(1, "Oplock release rc = %d", rc); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 589f3e3f6e0..8852470b4fb 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -32,7 +32,7 @@ #include "fscache.h" -static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) +static void cifs_set_ops(struct inode *inode) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -44,13 +44,17 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) inode->i_fop = &cifs_file_direct_nobrl_ops; else inode->i_fop = &cifs_file_direct_ops; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + inode->i_fop = &cifs_file_strict_nobrl_ops; + else + inode->i_fop = &cifs_file_strict_ops; } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) inode->i_fop = &cifs_file_nobrl_ops; else { /* not direct, send byte range locks */ inode->i_fop = &cifs_file_ops; } - /* check if server can support readpages */ if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf < PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) @@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) break; case S_IFDIR: #ifdef CONFIG_CIFS_DFS_UPCALL - if (is_dfs_referral) { + if (IS_AUTOMOUNT(inode)) { inode->i_op = &cifs_dfs_referral_inode_operations; } else { #else /* NO DFS support, treat as a directory */ @@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) } spin_unlock(&inode->i_lock); - cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); + if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) + inode->i_flags |= S_AUTOMOUNT; + cifs_set_ops(inode); } void @@ -518,6 +524,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, fattr->cf_eof = le64_to_cpu(info->EndOfFile); fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { 
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; @@ -779,6 +786,10 @@ cifs_find_inode(struct inode *inode, void *opaque) if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) return 0; + /* use createtime like an i_generation field */ + if (CIFS_I(inode)->createtime != fattr->cf_createtime) + return 0; + /* don't match inode of different type */ if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) return 0; @@ -796,6 +807,7 @@ cifs_init_inode(struct inode *inode, void *opaque) struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; + CIFS_I(inode)->createtime = fattr->cf_createtime; return 0; } @@ -809,14 +821,14 @@ inode_has_hashed_dentries(struct inode *inode) { struct dentry *dentry; - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); list_for_each_entry(dentry, &inode->i_dentry, d_alias) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); return true; } } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); return false; } @@ -1318,10 +1330,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need to set uid/gid */ inc_nlink(inode); - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); cifs_fill_uniqueid(inode->i_sb, &fattr); @@ -1362,10 +1370,6 @@ mkdir_get_info: rc = cifs_get_inode_info(&newinode, full_path, NULL, inode->i_sb, xid, NULL); - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; d_instantiate(direntry, newinode); /* setting nlink not necessary except in cases where we * failed to get it from the server or was set bogus */ @@ -1679,7 +1683,7 @@ cifs_inode_needs_reval(struct inode *inode) /* * Zap the cache. Called when invalid_mapping flag is set. 
*/ -static void +void cifs_invalidate_mapping(struct inode *inode) { int rc; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 85cdbf831e7..306769de2fb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -524,10 +524,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", rc); } else { - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; d_instantiate(direntry, newinode); } } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 43f10281bc1..a09e077ba92 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -571,7 +571,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) pCifsInode = CIFS_I(netfile->dentry->d_inode); cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel); + pSMB->OplockLevel ? OPLOCK_READ : 0); /* * cifs_oplock_break_put() can't be called * from here. Get reference after queueing @@ -637,77 +637,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length) return; } -/* Convert 16 bit Unicode pathname to wire format from string in current code - page. Conversion may involve remapping up the seven characters that are - only legal in POSIX-like OS (if they are present in the string). 
Path - names are little endian 16 bit Unicode on the wire */ -int -cifsConvertToUCS(__le16 *target, const char *source, int maxlen, - const struct nls_table *cp, int mapChars) -{ - int i, j, charlen; - int len_remaining = maxlen; - char src_char; - __u16 temp; - - if (!mapChars) - return cifs_strtoUCS(target, source, PATH_MAX, cp); - - for (i = 0, j = 0; i < maxlen; j++) { - src_char = source[i]; - switch (src_char) { - case 0: - target[j] = 0; - goto ctoUCS_out; - case ':': - target[j] = cpu_to_le16(UNI_COLON); - break; - case '*': - target[j] = cpu_to_le16(UNI_ASTERIK); - break; - case '?': - target[j] = cpu_to_le16(UNI_QUESTION); - break; - case '<': - target[j] = cpu_to_le16(UNI_LESSTHAN); - break; - case '>': - target[j] = cpu_to_le16(UNI_GRTRTHAN); - break; - case '|': - target[j] = cpu_to_le16(UNI_PIPE); - break; - /* BB We can not handle remapping slash until - all the calls to build_path_from_dentry - are modified, as they use slash as separator BB */ - /* case '\\': - target[j] = cpu_to_le16(UNI_SLASH); - break;*/ - default: - charlen = cp->char2uni(source+i, - len_remaining, &temp); - /* if no match, use question mark, which - at least in some cases servers as wild card */ - if (charlen < 1) { - target[j] = cpu_to_le16(0x003f); - charlen = 1; - } else - target[j] = cpu_to_le16(temp); - len_remaining -= charlen; - /* character may take more than one byte in the - the source string, but will take exactly two - bytes in the target string */ - i += charlen; - continue; - } - i++; /* move to next char in source string */ - len_remaining--; - } - -ctoUCS_out: - return i; -} - void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 9aad47a2d62..8d9189f6447 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr) } /* else ERRHRD class errors or junk - return EIO */ - cFYI(1, "Mapping smb error code %d to POSIX err %d", - 
smberrcode, rc); + cFYI(1, "Mapping smb error code 0x%x to POSIX err %d", + le32_to_cpu(smb->Status.CifsError), rc); /* generic corrective action e.g. reconnect SMB session on * ERRbaduid could be added */ @@ -916,14 +916,14 @@ unsigned int smbCalcSize(struct smb_hdr *ptr) { return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + - 2 /* size of the bcc field */ + BCC(ptr)); + 2 /* size of the bcc field */ + get_bcc(ptr)); } unsigned int smbCalcSize_LE(struct smb_hdr *ptr) { return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + - 2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr))); + 2 /* size of the bcc field */ + get_bcc_le(ptr)); } /* The following are taken from fs/ntfs/util.c */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index a73eb9f4bda..7f25cc3d225 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, cFYI(1, "For %s", name->name); if (parent->d_op && parent->d_op->d_hash) - parent->d_op->d_hash(parent, name); + parent->d_op->d_hash(parent, parent->d_inode, name); else name->hash = full_name_hash(name->name, name->len); @@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, return NULL; } - if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase) - dentry->d_op = &cifs_ci_dentry_ops; - else - dentry->d_op = &cifs_dentry_ops; - alias = d_materialise_unique(dentry, inode); if (alias != NULL) { dput(dentry); @@ -160,6 +155,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); fattr->cf_eof = le64_to_cpu(info->EndOfFile); fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 
7b01d3f6eed..1adc9625a34 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -277,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, } static void -decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, +decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses, const struct nls_table *nls_cp) { int len; @@ -323,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, return; } -static int decode_ascii_ssetup(char **pbcc_area, int bleft, +static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses, const struct nls_table *nls_cp) { @@ -420,7 +420,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, return 0; } -#ifdef CONFIG_CIFS_EXPERIMENTAL /* BB Move to ntlmssp.c eventually */ /* We do not malloc the blob, it is passed in pbuffer, because @@ -431,13 +430,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; __u32 flags; + memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE)); memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmNegotiate; /* BB is NTLMV2 session security format easier to use here? 
*/ flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NTLM; + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { flags |= NTLMSSP_NEGOTIATE_SIGN; @@ -446,7 +446,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_EXTENDED_SEC; } - sec_blob->NegotiateFlags |= cpu_to_le32(flags); + sec_blob->NegotiateFlags = cpu_to_le32(flags); sec_blob->WorkstationName.BufferOffset = 0; sec_blob->WorkstationName.Length = 0; @@ -477,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NTLM; + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) flags |= NTLMSSP_NEGOTIATE_SIGN; @@ -485,7 +485,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); - sec_blob->NegotiateFlags |= cpu_to_le32(flags); + sec_blob->NegotiateFlags = cpu_to_le32(flags); sec_blob->LmChallengeResponse.BufferOffset = cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); @@ -544,8 +544,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; - if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && - !calc_seckey(ses)) { + if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) || + (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) + && !calc_seckey(ses)) { memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); @@ -563,17 +564,6 @@ setup_ntlmv2_ret: return rc; } - -static void 
setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB, - struct cifsSesInfo *ses) -{ - build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses); - pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); - - return; -} -#endif - int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const struct nls_table *nls_cp) @@ -585,12 +575,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, char *str_area; SESSION_SETUP_ANDX *pSMB; __u32 capabilities; - int count; + __u16 count; int resp_buf_type; struct kvec iov[3]; enum securityEnum type; - __u16 action; - int bytes_remaining; + __u16 action, bytes_remaining; struct key *spnego_key = NULL; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ u16 blob_len; @@ -814,71 +803,70 @@ ssetup_ntlmssp_authenticate: rc = -ENOSYS; goto ssetup_exit; #endif /* CONFIG_CIFS_UPCALL */ - } else { -#ifdef CONFIG_CIFS_EXPERIMENTAL - if (type == RawNTLMSSP) { - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cERROR(1, "NTLMSSP requires Unicode support"); - rc = -ENOSYS; + } else if (type == RawNTLMSSP) { + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cERROR(1, "NTLMSSP requires Unicode support"); + rc = -ENOSYS; + goto ssetup_exit; + } + + cFYI(1, "ntlmssp session setup phase %d", phase); + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + switch(phase) { + case NtLmNegotiate: + build_ntlmssp_negotiate_blob( + pSMB->req.SecurityBlob, ses); + iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = + cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + break; + case NtLmAuthenticate: + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. 
+ */ + ntlmsspblob = kzalloc( + 5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + cERROR(1, "Can't allocate NTLMSSP blob"); + rc = -ENOMEM; goto ssetup_exit; } - cFYI(1, "ntlmssp session setup phase %d", phase); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities |= cpu_to_le32(capabilities); - if (phase == NtLmNegotiate) { - setup_ntlmssp_neg_req(pSMB, ses); - iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); - iov[1].iov_base = &pSMB->req.SecurityBlob[0]; - } else if (phase == NtLmAuthenticate) { - /* 5 is an empirical value, large enought to - * hold authenticate message, max 10 of - * av paris, doamin,user,workstation mames, - * flags etc.. - */ - ntlmsspblob = kmalloc( - 5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - cERROR(1, "Can't allocate NTLMSSP"); - rc = -ENOMEM; - goto ssetup_exit; - } - - rc = build_ntlmssp_auth_blob(ntlmsspblob, - &blob_len, ses, nls_cp); - if (rc) - goto ssetup_exit; - iov[1].iov_len = blob_len; - iov[1].iov_base = ntlmsspblob; - pSMB->req.SecurityBlobLength = - cpu_to_le16(blob_len); - /* Make sure that we tell the server that we - are using the uid that it just gave us back - on the response (challenge) */ - smb_buf->Uid = ses->Suid; - } else { - cERROR(1, "invalid phase %d", phase); - rc = -ENOSYS; + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, nls_cp); + if (rc) goto ssetup_exit; - } - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_oslm_strings(&bcc_ptr, nls_cp); - } else { - cERROR(1, "secType %d not supported!", type); + iov[1].iov_len = blob_len; + iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + break; + default: + 
cERROR(1, "invalid phase %d", phase); rc = -ENOSYS; goto ssetup_exit; } -#else + /* unicode strings must be word aligned */ + if ((iov[0].iov_len + iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, nls_cp); + } else { cERROR(1, "secType %d not supported!", type); rc = -ENOSYS; goto ssetup_exit; -#endif } iov[2].iov_base = str_area; @@ -887,10 +875,10 @@ ssetup_ntlmssp_authenticate: count = iov[1].iov_len + iov[2].iov_len; smb_buf->smb_buf_length += count; - BCC_LE(smb_buf) = cpu_to_le16(count); + put_bcc_le(count, smb_buf); rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, - CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); + CIFS_LOG_ERROR); /* SMB request buf freed in SendReceive2 */ pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; @@ -921,7 +909,7 @@ ssetup_ntlmssp_authenticate: cFYI(1, "UID = %d ", ses->Suid); /* response can have either 3 or 4 word count - Samba sends 3 */ /* and lanman response is 3 */ - bytes_remaining = BCC(smb_buf); + bytes_remaining = get_bcc(smb_buf); bcc_ptr = pByteArea(smb_buf); if (smb_buf->WordCount == 4) { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e0588cdf4cc..c1ccca1a933 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -36,7 +36,13 @@ extern mempool_t *cifs_mid_poolp; -static struct mid_q_entry * +static void +wake_up_task(struct mid_q_entry *mid) +{ + wake_up_process(mid->callback_data); +} + +struct mid_q_entry * AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) { struct mid_q_entry *temp; @@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ /* when mid allocated can be before when sent */ temp->when_alloc = jiffies; - temp->tsk = current; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. 
+ */ + temp->callback = wake_up_task; + temp->callback_data = current; } - spin_lock(&GlobalMid_Lock); - list_add_tail(&temp->qhead, &server->pending_mid_q); atomic_inc(&midCount); temp->midState = MID_REQUEST_ALLOCATED; - spin_unlock(&GlobalMid_Lock); return temp; } -static void +void DeleteMidQEntry(struct mid_q_entry *midEntry) { #ifdef CONFIG_CIFS_STATS2 unsigned long now; #endif - spin_lock(&GlobalMid_Lock); midEntry->midState = MID_FREE; - list_del(&midEntry->qhead); atomic_dec(&midCount); - spin_unlock(&GlobalMid_Lock); if (midEntry->largeBuf) cifs_buf_release(midEntry->resp_buf); else @@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) mempool_free(midEntry, cifs_mid_poolp); } +static void +delete_mid(struct mid_q_entry *mid) +{ + spin_lock(&GlobalMid_Lock); + list_del(&mid->qhead); + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); +} + static int smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) { @@ -119,7 +135,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) if (ssocket == NULL) return -ENOTSOCK; /* BB eventually add reconnect code here */ - smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; + smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; smb_msg.msg_namelen = sizeof(struct sockaddr); smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; @@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, return smb_sendv(server, &iov, 1); } -static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) +static int wait_for_free_request(struct TCP_Server_Info *server, + const int long_op) { if (long_op == CIFS_ASYNC_OP) { /* oplock breaks must not be held up */ - atomic_inc(&ses->server->inFlight); + atomic_inc(&server->inFlight); return 0; } spin_lock(&GlobalMid_Lock); while (1) { - if (atomic_read(&ses->server->inFlight) >= - cifs_max_pending){ + if (atomic_read(&server->inFlight) >= cifs_max_pending) { 
spin_unlock(&GlobalMid_Lock); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->num_waiters); + atomic_inc(&server->num_waiters); #endif - wait_event(ses->server->request_q, - atomic_read(&ses->server->inFlight) + wait_event(server->request_q, + atomic_read(&server->inFlight) < cifs_max_pending); #ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->num_waiters); + atomic_dec(&server->num_waiters); #endif spin_lock(&GlobalMid_Lock); } else { - if (ses->server->tcpStatus == CifsExiting) { + if (server->tcpStatus == CifsExiting) { spin_unlock(&GlobalMid_Lock); return -ENOENT; } @@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) /* update # of requests on the wire to server */ if (long_op != CIFS_BLOCKING_OP) - atomic_inc(&ses->server->inFlight); + atomic_inc(&server->inFlight); spin_unlock(&GlobalMid_Lock); break; } @@ -308,53 +324,81 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf, *ppmidQ = AllocMidQEntry(in_buf, ses->server); if (*ppmidQ == NULL) return -ENOMEM; + spin_lock(&GlobalMid_Lock); + list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); return 0; } -static int wait_for_response(struct cifsSesInfo *ses, - struct mid_q_entry *midQ, - unsigned long timeout, - unsigned long time_to_wait) +static int +wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { - unsigned long curr_timeout; + int error; - for (;;) { - curr_timeout = timeout + jiffies; - wait_event_timeout(ses->server->response_q, - midQ->midState != MID_REQUEST_SUBMITTED, timeout); + error = wait_event_killable(server->response_q, + midQ->midState != MID_REQUEST_SUBMITTED); + if (error < 0) + return -ERESTARTSYS; - if (time_after(jiffies, curr_timeout) && - (midQ->midState == MID_REQUEST_SUBMITTED) && - ((ses->server->tcpStatus == CifsGood) || - (ses->server->tcpStatus == CifsNew))) { + return 0; +} - unsigned long lrt; - /* We timed out. 
Is the server still - sending replies ? */ - spin_lock(&GlobalMid_Lock); - lrt = ses->server->lstrp; - spin_unlock(&GlobalMid_Lock); +/* + * Send a SMB request and set the callback function in the mid to handle + * the result. Caller is responsible for dealing with timeouts. + */ +int +cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf, + mid_callback_t *callback, void *cbdata) +{ + int rc; + struct mid_q_entry *mid; - /* Calculate time_to_wait past last receive time. - Although we prefer not to time out if the - server is still responding - we will time - out if the server takes more than 15 (or 45 - or 180) seconds to respond to this request - and has not responded to any request from - other threads on the client within 10 seconds */ - lrt += time_to_wait; - if (time_after(jiffies, lrt)) { - /* No replies for time_to_wait. */ - cERROR(1, "server not responding"); - return -1; - } - } else { - return 0; - } + rc = wait_for_free_request(server, CIFS_ASYNC_OP); + if (rc) + return rc; + + mutex_lock(&server->srv_mutex); + mid = AllocMidQEntry(in_buf, server); + if (mid == NULL) { + mutex_unlock(&server->srv_mutex); + return -ENOMEM; } -} + /* put it on the pending_mid_q */ + spin_lock(&GlobalMid_Lock); + list_add_tail(&mid->qhead, &server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + + rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + goto out_err; + } + + mid->callback = callback; + mid->callback_data = cbdata; + mid->midState = MID_REQUEST_SUBMITTED; +#ifdef CONFIG_CIFS_STATS2 + atomic_inc(&server->inSend); +#endif + rc = smb_send(server, in_buf, in_buf->smb_buf_length); +#ifdef CONFIG_CIFS_STATS2 + atomic_dec(&server->inSend); + mid->when_sent = jiffies; +#endif + mutex_unlock(&server->srv_mutex); + if (rc) + goto out_err; + + return rc; +out_err: + delete_mid(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); + return rc; +} /* * @@ -382,6 +426,81 @@ 
SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses, return rc; } +static int +sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) +{ + int rc = 0; + + cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command, + mid->mid, mid->midState); + + spin_lock(&GlobalMid_Lock); + /* ensure that it's no longer on the pending_mid_q */ + list_del_init(&mid->qhead); + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + spin_unlock(&GlobalMid_Lock); + return rc; + case MID_REQUEST_SUBMITTED: + /* socket is going down, reject all calls */ + if (server->tcpStatus == CifsExiting) { + cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d", + __func__, mid->mid, mid->command, mid->midState); + rc = -EHOSTDOWN; + break; + } + case MID_RETRY_NEEDED: + rc = -EAGAIN; + break; + default: + cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, + mid->mid, mid->midState); + rc = -EIO; + } + spin_unlock(&GlobalMid_Lock); + + DeleteMidQEntry(mid); + return rc; +} + +/* + * An NT cancel request header looks just like the original request except: + * + * The Command is SMB_COM_NT_CANCEL + * The WordCount is zeroed out + * The ByteCount is zeroed out + * + * This function mangles an existing request buffer into a + * SMB_COM_NT_CANCEL request and then sends it. 
+ */ +static int +send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf, + struct mid_q_entry *mid) +{ + int rc = 0; + + /* -4 for RFC1001 length and +2 for BCC field */ + in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2; + in_buf->Command = SMB_COM_NT_CANCEL; + in_buf->WordCount = 0; + put_bcc_le(0, in_buf); + + mutex_lock(&server->srv_mutex); + rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); + if (rc) { + mutex_unlock(&server->srv_mutex); + return rc; + } + rc = smb_send(server, in_buf, in_buf->smb_buf_length); + mutex_unlock(&server->srv_mutex); + + cFYI(1, "issued NT_CANCEL for mid %u, rc = %d", + in_buf->Mid, rc); + + return rc; +} + int SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, struct kvec *iov, int n_vec, int *pRespBufType /* ret */, @@ -390,7 +509,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, int rc = 0; int long_op; unsigned int receive_len; - unsigned long timeout; struct mid_q_entry *midQ; struct smb_hdr *in_buf = iov[0].iov_base; @@ -413,7 +531,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, to the same server. We may make this configurable later or use ses->maxReq */ - rc = wait_for_free_request(ses, long_op); + rc = wait_for_free_request(ses->server, long_op); if (rc) { cifs_small_buf_release(in_buf); return rc; @@ -457,65 +575,20 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, if (rc < 0) goto out; - if (long_op == CIFS_STD_OP) - timeout = 15 * HZ; - else if (long_op == CIFS_VLONG_OP) /* e.g. 
slow writes past EOF */ - timeout = 180 * HZ; - else if (long_op == CIFS_LONG_OP) - timeout = 45 * HZ; /* should be greater than - servers oplock break timeout (about 43 seconds) */ - else if (long_op == CIFS_ASYNC_OP) + if (long_op == CIFS_ASYNC_OP) goto out; - else if (long_op == CIFS_BLOCKING_OP) - timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */ - else { - cERROR(1, "unknown timeout flag %d", long_op); - rc = -EIO; - goto out; - } - - /* wait for 15 seconds or until woken up due to response arriving or - due to last connection to this server being unmounted */ - if (signal_pending(current)) { - /* if signal pending do not hold up user for full smb timeout - but we still give response a chance to complete */ - timeout = 2 * HZ; - } - - /* No user interrupts in wait - wreaks havoc with performance */ - wait_for_response(ses, midQ, timeout, 10 * HZ); - - spin_lock(&GlobalMid_Lock); - if (midQ->resp_buf == NULL) { - cERROR(1, "No response to cmd %d mid %d", - midQ->command, midQ->mid); - if (midQ->midState == MID_REQUEST_SUBMITTED) { - if (ses->server->tcpStatus == CifsExiting) - rc = -EHOSTDOWN; - else { - ses->server->tcpStatus = CifsNeedReconnect; - midQ->midState = MID_RETRY_NEEDED; - } - } + rc = wait_for_response(ses->server, midQ); + if (rc != 0) + goto out; - if (rc != -EHOSTDOWN) { - if (midQ->midState == MID_RETRY_NEEDED) { - rc = -EAGAIN; - cFYI(1, "marking request for retry"); - } else { - rc = -EIO; - } - } - spin_unlock(&GlobalMid_Lock); - DeleteMidQEntry(midQ); - /* Update # of requests on wire to server */ + rc = sync_mid_result(midQ, ses->server); + if (rc != 0) { atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); return rc; } - spin_unlock(&GlobalMid_Lock); receive_len = midQ->resp_buf->smb_buf_length; if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { @@ -559,19 +632,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, if (receive_len >= sizeof(struct smb_hdr) - 4 /* do not count RFC1001 header 
*/ + (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ ) - BCC(midQ->resp_buf) = - le16_to_cpu(BCC_LE(midQ->resp_buf)); + put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf); if ((flags & CIFS_NO_RESP) == 0) midQ->resp_buf = NULL; /* mark it so buf will not be freed by - DeleteMidQEntry */ + delete_mid */ } else { rc = -EIO; cFYI(1, "Bad MID state?"); } out: - DeleteMidQEntry(midQ); + delete_mid(midQ); atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); @@ -585,7 +657,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, { int rc = 0; unsigned int receive_len; - unsigned long timeout; struct mid_q_entry *midQ; if (ses == NULL) { @@ -610,7 +681,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, return -EIO; } - rc = wait_for_free_request(ses, long_op); + rc = wait_for_free_request(ses->server, long_op); if (rc) return rc; @@ -649,64 +720,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, if (rc < 0) goto out; - if (long_op == CIFS_STD_OP) - timeout = 15 * HZ; - /* wait for 15 seconds or until woken up due to response arriving or - due to last connection to this server being unmounted */ - else if (long_op == CIFS_ASYNC_OP) + if (long_op == CIFS_ASYNC_OP) goto out; - else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */ - timeout = 180 * HZ; - else if (long_op == CIFS_LONG_OP) - timeout = 45 * HZ; /* should be greater than - servers oplock break timeout (about 43 seconds) */ - else if (long_op == CIFS_BLOCKING_OP) - timeout = 0x7FFFFFFF; /* large but no so large as to wrap */ - else { - cERROR(1, "unknown timeout flag %d", long_op); - rc = -EIO; - goto out; - } - if (signal_pending(current)) { - /* if signal pending do not hold up user for full smb timeout - but we still give response a chance to complete */ - timeout = 2 * HZ; - } - - /* No user interrupts in wait - wreaks havoc with performance */ - wait_for_response(ses, midQ, timeout, 10 * HZ); - - spin_lock(&GlobalMid_Lock); - if 
(midQ->resp_buf == NULL) { - cERROR(1, "No response for cmd %d mid %d", - midQ->command, midQ->mid); - if (midQ->midState == MID_REQUEST_SUBMITTED) { - if (ses->server->tcpStatus == CifsExiting) - rc = -EHOSTDOWN; - else { - ses->server->tcpStatus = CifsNeedReconnect; - midQ->midState = MID_RETRY_NEEDED; - } - } + rc = wait_for_response(ses->server, midQ); + if (rc != 0) + goto out; - if (rc != -EHOSTDOWN) { - if (midQ->midState == MID_RETRY_NEEDED) { - rc = -EAGAIN; - cFYI(1, "marking request for retry"); - } else { - rc = -EIO; - } - } - spin_unlock(&GlobalMid_Lock); - DeleteMidQEntry(midQ); - /* Update # of requests on wire to server */ + rc = sync_mid_result(midQ, ses->server); + if (rc != 0) { atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); return rc; } - spin_unlock(&GlobalMid_Lock); receive_len = midQ->resp_buf->smb_buf_length; if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { @@ -748,43 +775,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, if (receive_len >= sizeof(struct smb_hdr) - 4 /* do not count RFC1001 header */ + (2 * out_buf->WordCount) + 2 /* bcc */ ) - BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); + put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf); } else { rc = -EIO; cERROR(1, "Bad MID state?"); } out: - DeleteMidQEntry(midQ); + delete_mid(midQ); atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); return rc; } -/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. 
*/ - -static int -send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf, - struct mid_q_entry *midQ) -{ - int rc = 0; - struct cifsSesInfo *ses = tcon->ses; - __u16 mid = in_buf->Mid; - - header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0); - in_buf->Mid = mid; - mutex_lock(&ses->server->srv_mutex); - rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); - if (rc) { - mutex_unlock(&ses->server->srv_mutex); - return rc; - } - rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); - mutex_unlock(&ses->server->srv_mutex); - return rc; -} - /* We send a LOCKINGX_CANCEL_LOCK to cause the Windows blocking lock to return. */ @@ -807,7 +811,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon, pSMB->hdr.Mid = GetNextMid(ses->server); return SendReceive(xid, ses, in_buf, out_buf, - &bytes_returned, CIFS_STD_OP); + &bytes_returned, 0); } int @@ -845,7 +849,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, return -EIO; } - rc = wait_for_free_request(ses, CIFS_BLOCKING_OP); + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP); if (rc) return rc; @@ -863,7 +867,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); if (rc) { - DeleteMidQEntry(midQ); + delete_mid(midQ); mutex_unlock(&ses->server->srv_mutex); return rc; } @@ -880,7 +884,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { - DeleteMidQEntry(midQ); + delete_mid(midQ); return rc; } @@ -899,10 +903,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, if (in_buf->Command == SMB_COM_TRANSACTION2) { /* POSIX lock. We send a NT_CANCEL SMB to cause the blocking lock to return. 
*/ - - rc = send_nt_cancel(tcon, in_buf, midQ); + rc = send_nt_cancel(ses->server, in_buf, midQ); if (rc) { - DeleteMidQEntry(midQ); + delete_mid(midQ); return rc; } } else { @@ -914,47 +917,22 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, /* If we get -ENOLCK back the lock may have already been removed. Don't exit in this case. */ if (rc && rc != -ENOLCK) { - DeleteMidQEntry(midQ); + delete_mid(midQ); return rc; } } - /* Wait 5 seconds for the response. */ - if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) { + if (wait_for_response(ses->server, midQ) == 0) { /* We got the response - restart system call. */ rstart = 1; } } - spin_lock(&GlobalMid_Lock); - if (midQ->resp_buf) { - spin_unlock(&GlobalMid_Lock); - receive_len = midQ->resp_buf->smb_buf_length; - } else { - cERROR(1, "No response for cmd %d mid %d", - midQ->command, midQ->mid); - if (midQ->midState == MID_REQUEST_SUBMITTED) { - if (ses->server->tcpStatus == CifsExiting) - rc = -EHOSTDOWN; - else { - ses->server->tcpStatus = CifsNeedReconnect; - midQ->midState = MID_RETRY_NEEDED; - } - } - - if (rc != -EHOSTDOWN) { - if (midQ->midState == MID_RETRY_NEEDED) { - rc = -EAGAIN; - cFYI(1, "marking request for retry"); - } else { - rc = -EIO; - } - } - spin_unlock(&GlobalMid_Lock); - DeleteMidQEntry(midQ); + rc = sync_mid_result(midQ, ses->server); + if (rc != 0) return rc; - } + receive_len = midQ->resp_buf->smb_buf_length; if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { cERROR(1, "Frame too large received. 
Length: %d Xid: %d", receive_len, xid); @@ -998,10 +976,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, if (receive_len >= sizeof(struct smb_hdr) - 4 /* do not count RFC1001 header */ + (2 * out_buf->WordCount) + 2 /* bcc */ ) - BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); + put_bcc(get_bcc_le(out_buf), out_buf); out: - DeleteMidQEntry(midQ); + delete_mid(midQ); if (rstart && rc == -EACCES) return -ERESTARTSYS; return rc; diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 9060f08e70c..69015787618 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -20,10 +20,9 @@ #include <linux/spinlock.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> -#include <linux/coda_cache.h> +#include "coda_linux.h" +#include "coda_cache.h" static atomic_t permission_epoch = ATOMIC_INIT(0); @@ -93,7 +92,7 @@ static void coda_flag_children(struct dentry *parent, int flag) struct list_head *child; struct dentry *de; - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); list_for_each(child, &parent->d_subdirs) { de = list_entry(child, struct dentry, d_u.d_child); @@ -102,7 +101,7 @@ static void coda_flag_children(struct dentry *parent, int flag) continue; coda_flag_inode(de->d_inode, flag); } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); return; } diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 602240569c8..6475877b076 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -7,9 +7,8 @@ #include <linux/time.h> #include <linux/coda.h> -#include <linux/coda_linux.h> -#include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include "coda_linux.h" static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) { diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h new file mode 100644 index 00000000000..c910b5eb1ce --- /dev/null +++ b/fs/coda/coda_cache.h @@ -0,0 +1,22 @@ +/* Coda filesystem -- Linux Minicache + * + * Copyright (C) 1989 - 1997 Carnegie 
Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. Contact Peter Braam + * <coda@cs.cmu.edu> + */ + +#ifndef _CFSNC_HEADER_ +#define _CFSNC_HEADER_ + +/* credential cache */ +void coda_cache_enter(struct inode *inode, int mask); +void coda_cache_clear_inode(struct inode *); +void coda_cache_clear_all(struct super_block *sb); +int coda_cache_check(struct inode *inode, int mask); + +/* for downcalls and attributes and lookups */ +void coda_flag_inode_children(struct inode *inode, int flag); + +#endif /* _CFSNC_HEADER_ */ diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h new file mode 100644 index 00000000000..e35071b1de0 --- /dev/null +++ b/fs/coda/coda_fs_i.h @@ -0,0 +1,58 @@ +/* + * coda_fs_i.h + * + * Copyright (C) 1998 Carnegie Mellon University + * + */ + +#ifndef _LINUX_CODA_FS_I +#define _LINUX_CODA_FS_I + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/coda.h> + +/* + * coda fs inode data + * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and + * c_cached_perm. + * vfs_inode is set only when the inode is created and never changes. + * c_fid is set when the inode is created and should be considered immutable. 
+ */ +struct coda_inode_info { + struct CodaFid c_fid; /* Coda identifier */ + u_short c_flags; /* flags (see below) */ + unsigned int c_mapcount; /* nr of times this inode is mapped */ + unsigned int c_cached_epoch; /* epoch for cached permissions */ + vuid_t c_uid; /* fsuid for cached permissions */ + unsigned int c_cached_perm; /* cached access permissions */ + spinlock_t c_lock; + struct inode vfs_inode; +}; + +/* + * coda fs file private data + */ +#define CODA_MAGIC 0xC0DAC0DA +struct coda_file_info { + int cfi_magic; /* magic number */ + struct file *cfi_container; /* container file for this cnode */ + unsigned int cfi_mapcount; /* nr of times this file is mapped */ +}; + +#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data)) + +/* flags */ +#define C_VATTR 0x1 /* Validity of vattr in inode */ +#define C_FLUSH 0x2 /* used after a flush */ +#define C_DYING 0x4 /* from venus (which died) */ +#define C_PURGE 0x8 + +int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); +struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); +int coda_cnode_makectl(struct inode **inode, struct super_block *sb); +struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); +void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); + +#endif diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index bf4a3fd3c8e..2bdbcc11b37 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -17,9 +17,8 @@ #include <linux/string.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> +#include "coda_linux.h" /* initialize the debugging variables */ int coda_fake_statfs; diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h new file mode 100644 index 00000000000..9b0c5323890 --- /dev/null +++ b/fs/coda/coda_linux.h @@ -0,0 +1,101 @@ +/* + * Coda File System, Linux Kernel module + * + * Original version, 
adapted from cfs_mach.c, (C) Carnegie Mellon University + * Linux modifications (C) 1996, Peter J. Braam + * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University + * + * Carnegie Mellon University encourages users of this software to + * contribute improvements to the Coda project. + */ + +#ifndef _LINUX_CODA_FS +#define _LINUX_CODA_FS + +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/types.h> +#include <linux/fs.h> +#include "coda_fs_i.h" + +/* operations */ +extern const struct inode_operations coda_dir_inode_operations; +extern const struct inode_operations coda_file_inode_operations; +extern const struct inode_operations coda_ioctl_inode_operations; + +extern const struct dentry_operations coda_dentry_operations; + +extern const struct address_space_operations coda_file_aops; +extern const struct address_space_operations coda_symlink_aops; + +extern const struct file_operations coda_dir_operations; +extern const struct file_operations coda_file_operations; +extern const struct file_operations coda_ioctl_operations; + +/* operations shared over more than one file */ +int coda_open(struct inode *i, struct file *f); +int coda_release(struct inode *i, struct file *f); +int coda_permission(struct inode *inode, int mask, unsigned int flags); +int coda_revalidate_inode(struct dentry *); +int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int coda_setattr(struct dentry *, struct iattr *); + +/* this file: heloers */ +char *coda_f2s(struct CodaFid *f); +int coda_isroot(struct inode *i); +int coda_iscontrol(const char *name, size_t length); + +void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); +void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); +unsigned short coda_flags_to_cflags(unsigned short); + +/* sysctl.h */ +void coda_sysctl_init(void); +void coda_sysctl_clean(void); + +#define 
CODA_ALLOC(ptr, cast, size) do { \ + if (size < PAGE_SIZE) \ + ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + else \ + ptr = (cast)vmalloc((unsigned long) size); \ + if (!ptr) \ + printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ + else memset( ptr, 0, size ); \ +} while (0) + + +#define CODA_FREE(ptr,size) \ + do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) + +/* inode to cnode access functions */ + +static inline struct coda_inode_info *ITOC(struct inode *inode) +{ + return list_entry(inode, struct coda_inode_info, vfs_inode); +} + +static __inline__ struct CodaFid *coda_i2f(struct inode *inode) +{ + return &(ITOC(inode)->c_fid); +} + +static __inline__ char *coda_i2s(struct inode *inode) +{ + return coda_f2s(&(ITOC(inode)->c_fid)); +} + +/* this will not zap the inode away */ +static __inline__ void coda_flag_inode(struct inode *inode, int flag) +{ + struct coda_inode_info *cii = ITOC(inode); + + spin_lock(&cii->c_lock); + cii->c_flags |= flag; + spin_unlock(&cii->c_lock); +} + +#endif diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 5d8b3553960..2b8dae4d121 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -18,14 +18,14 @@ #include <linux/errno.h> #include <linux/string.h> #include <linux/spinlock.h> +#include <linux/namei.h> #include <asm/uaccess.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> -#include <linux/coda_cache.h> +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" @@ -47,7 +47,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir); /* dentry ops */ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); -static int coda_dentry_delete(struct dentry *); +static int coda_dentry_delete(const struct dentry *); /* support routines */ static int coda_venus_readdir(struct file *coda_file, void *buf, @@ -60,7 +60,7 @@ static int coda_return_EIO(void) } #define CODA_EIO_ERROR 
((void *) (coda_return_EIO)) -static const struct dentry_operations coda_dentry_operations = +const struct dentry_operations coda_dentry_operations = { .d_revalidate = coda_dentry_revalidate, .d_delete = coda_dentry_delete, @@ -125,8 +125,6 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc return ERR_PTR(error); exit: - entry->d_op = &coda_dentry_operations; - if (inode && (type & CODA_NOCACHE)) coda_flag_inode(inode, C_VATTR | C_PURGE); @@ -134,10 +132,13 @@ exit: } -int coda_permission(struct inode *inode, int mask) +int coda_permission(struct inode *inode, int mask, unsigned int flags) { int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (!mask) @@ -541,9 +542,13 @@ out: /* called when a cache lookup succeeds */ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) { - struct inode *inode = de->d_inode; + struct inode *inode; struct coda_inode_info *cii; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = de->d_inode; if (!inode || coda_isroot(inode)) goto out; if (is_bad_inode(inode)) @@ -559,7 +564,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) if (cii->c_flags & C_FLUSH) coda_flag_inode_children(inode, C_FLUSH); - if (atomic_read(&de->d_count) > 1) + if (de->d_count > 1) /* pretend it's valid, but don't change the flags */ goto out; @@ -577,7 +582,7 @@ out: * This is the callback from dput() when d_count is going to 0. * We use this to unhash dentries with bad inodes. 
*/ -static int coda_dentry_delete(struct dentry * dentry) +static int coda_dentry_delete(const struct dentry * dentry) { int flags; diff --git a/fs/coda/file.c b/fs/coda/file.c index c8b50ba4366..0433057be33 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -21,10 +21,9 @@ #include <asm/uaccess.h> #include <linux/coda.h> -#include <linux/coda_linux.h> -#include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include "coda_linux.h" #include "coda_int.h" static ssize_t diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 5ea57c8c7f9..871b2771546 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -28,10 +28,9 @@ #include <linux/vmalloc.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> -#include <linux/coda_cache.h> +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" @@ -45,7 +44,7 @@ static struct kmem_cache * coda_inode_cachep; static struct inode *coda_alloc_inode(struct super_block *sb) { struct coda_inode_info *ei; - ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); if (!ei) return NULL; memset(&ei->c_fid, 0, sizeof(struct CodaFid)); @@ -56,11 +55,18 @@ static struct inode *coda_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void coda_destroy_inode(struct inode *inode) +static void coda_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(coda_inode_cachep, ITOC(inode)); } +static void coda_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, coda_i_callback); +} + static void init_once(void *foo) { struct coda_inode_info *ei = (struct coda_inode_info *) foo; @@ -186,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; + 
sb->s_d_op = &coda_dentry_operations; sb->s_bdi = &vc->bdi; /* get root fid from Venus: this needs the root inode */ diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 2fd89b5c5c7..6cbb3afb36d 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -19,12 +19,12 @@ #include <asm/uaccess.h> #include <linux/coda.h> -#include <linux/coda_linux.h> -#include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include "coda_linux.h" + /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask); +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -41,8 +41,10 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask) +static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; return (mask & MAY_EXEC) ? 
-EACCES : 0; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 62647a8595e..8f616e0e252 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -43,10 +43,10 @@ #include <asm/uaccess.h> #include <linux/coda.h> -#include <linux/coda_linux.h> -#include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include "coda_linux.h" + #include "coda_int.h" /* statistics */ diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index af78f007a2b..ab94ef63cae 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -16,9 +16,9 @@ #include <linux/pagemap.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> + +#include "coda_linux.h" static int coda_symlink_filler(struct file *file, struct page *page) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index c3563cab975..9727e0c5257 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -33,10 +33,9 @@ #include <linux/vfs.h> #include <linux/coda.h> -#include <linux/coda_linux.h> #include <linux/coda_psdev.h> -#include <linux/coda_fs_i.h> -#include <linux/coda_cache.h> +#include "coda_linux.h" +#include "coda_cache.h" #include "coda_int.h" diff --git a/fs/compat.c b/fs/compat.c index eb1740ac8c0..f6fd0a00e6c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * } /* - * The following statfs calls are copies of code from fs/open.c and + * The following statfs calls are copies of code from fs/statfs.c and * should be checked against those from time to time */ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) @@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat __put_user(kbuf->f_namelen, &ubuf->f_namelen) || __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize)) + 
__put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) return -EFAULT; return 0; } @@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type, if (nr_segs > fast_segs) { ret = -ENOMEM; iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) { - *ret_pointer = fast_pointer; + if (iov == NULL) goto out; - } } *ret_pointer = iov; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index a60579b007b..61abb638b4b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -42,7 +42,7 @@ #include <linux/tty.h> #include <linux/vt_kern.h> #include <linux/fb.h> -#include <linux/videodev.h> +#include <linux/videodev2.h> #include <linux/netdevice.h> #include <linux/raw.h> #include <linux/blkdev.h> @@ -836,6 +836,7 @@ COMPATIBLE_IOCTL(TCSETSW) COMPATIBLE_IOCTL(TCSETSF) COMPATIBLE_IOCTL(TIOCLINUX) COMPATIBLE_IOCTL(TIOCSBRK) +COMPATIBLE_IOCTL(TIOCGDEV) COMPATIBLE_IOCTL(TIOCCBRK) COMPATIBLE_IOCTL(TIOCGSID) COMPATIBLE_IOCTL(TIOCGICOUNT) diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig index 13587cc97a0..9febcdefdfd 100644 --- a/fs/configfs/Kconfig +++ b/fs/configfs/Kconfig @@ -1,8 +1,8 @@ config CONFIGFS_FS tristate "Userspace-driven configuration filesystem" - depends on SYSFS + select SYSFS help - configfs is a ram-based filesystem that provides the converse + configfs is a RAM-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based view of kernel objects, configfs is a filesystem-based manager of kernel objects, or config_items. 
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index da6061a6df4..82bda8fdfc1 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations; extern const struct file_operations bin_fops; extern const struct inode_operations configfs_dir_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; +extern const struct dentry_operations configfs_dentry_ops; extern int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); @@ -120,7 +121,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry { struct config_item * item = NULL; - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { struct configfs_dirent * sd = dentry->d_fsdata; if (sd->s_type & CONFIGFS_ITEM_LINK) { @@ -129,7 +130,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry } else item = config_item_get(sd->s_element); } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); return item; } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 0b502f80c69..90ff3cb10de 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -67,12 +67,12 @@ static void configfs_d_iput(struct dentry * dentry, * We _must_ delete our dentries on last dput, as the chain-to-parent * behavior is required to clear the parents of default_groups. 
*/ -static int configfs_d_delete(struct dentry *dentry) +static int configfs_d_delete(const struct dentry *dentry) { return 1; } -static const struct dentry_operations configfs_dentry_ops = { +const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, /* simple_delete_dentry() isn't exported */ .d_delete = configfs_d_delete, @@ -232,10 +232,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd, sd->s_mode = mode; sd->s_dentry = dentry; - if (dentry) { + if (dentry) dentry->d_fsdata = configfs_get(sd); - dentry->d_op = &configfs_dentry_ops; - } return 0; } @@ -278,7 +276,6 @@ static int create_dir(struct config_item * k, struct dentry * p, error = configfs_create(d, mode, init_dir); if (!error) { inc_nlink(p->d_inode); - (d)->d_op = &configfs_dentry_ops; } else { struct configfs_dirent *sd = d->d_fsdata; if (sd) { @@ -371,9 +368,7 @@ int configfs_create_link(struct configfs_symlink *sl, CONFIGFS_ITEM_LINK); if (!err) { err = configfs_create(dentry, mode, init_symlink); - if (!err) - dentry->d_op = &configfs_dentry_ops; - else { + if (err) { struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { spin_lock(&configfs_dirent_lock); @@ -399,8 +394,7 @@ static void remove_dir(struct dentry * d) if (d->d_inode) simple_rmdir(parent->d_inode,d); - pr_debug(" o %s removing done (%d)\n",d->d_name.name, - atomic_read(&d->d_count)); + pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); dput(parent); } @@ -448,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den return error; } - dentry->d_op = &configfs_dentry_ops; d_rehash(dentry); return 0; @@ -493,7 +486,10 @@ static struct dentry * configfs_lookup(struct inode *dir, * If it doesn't exist and it isn't a NOT_PINNED item, * it must be negative. 
*/ - return simple_lookup(dir, dentry, nd); + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; } out: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 253476d78ed..c83f4768eea 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -250,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) struct dentry * dentry = sd->s_dentry; if (dentry) { - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { - dget_locked(dentry); + dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else { + } else spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - } } } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 7d3607febe1..ecc62178bed 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -101,6 +101,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) configfs_root_group.cg_item.ci_dentry = root; root->d_fsdata = &configfs_root; sb->s_root = root; + sb->s_d_op = &configfs_dentry_ops; /* the rest get that */ return 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 32fd5fe9ca0..e141939080f 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops; static DEFINE_MUTEX(read_mutex); -/* These two macros may change in future, to provide better st_ino - semantics. */ -#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1) +/* These macros may change in future, to provide better st_ino semantics. 
*/ #define OFFSET(x) ((x)->i_ino) -static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode) +static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset) { + if (!cino->offset) + return offset + 1; + if (!cino->size) + return offset + 1; + + /* + * The file mode test fixes buggy mkcramfs implementations where + * cramfs_inode->offset is set to a non zero value for entries + * which did not contain data, like devices node and fifos. + */ + switch (cino->mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + return cino->offset << 2; + default: + break; + } + return offset + 1; +} + +static struct inode *get_cramfs_inode(struct super_block *sb, + struct cramfs_inode *cramfs_inode, unsigned int offset) +{ + struct inode *inode; static struct timespec zerotime; + + inode = iget_locked(sb, cramino(cramfs_inode, offset)); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + switch (cramfs_inode->mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + break; + case S_IFDIR: + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + break; + default: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); + } + inode->i_mode = cramfs_inode->mode; inode->i_uid = cramfs_inode->uid; - inode->i_size = cramfs_inode->size; - inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; inode->i_gid = cramfs_inode->gid; + + /* if the lower 2 bits are zero, the inode contains data */ + if (!(inode->i_ino & 3)) { + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + } + /* Struct copy intentional */ inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; /* inode->i_nlink is left 1 - arguably wrong for directories, but it's 
the best we can do without reading the directory contents. 1 yields the right result in GNU find, even without -noleaf option. */ - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &generic_ro_fops; - inode->i_data.a_ops = &cramfs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &cramfs_dir_inode_operations; - inode->i_fop = &cramfs_directory_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_data.a_ops = &cramfs_aops; - } else { - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } -} -static struct inode *get_cramfs_inode(struct super_block *sb, - struct cramfs_inode * cramfs_inode) -{ - struct inode *inode; - if (CRAMINO(cramfs_inode) == 1) { - inode = new_inode(sb); - if (inode) { - inode->i_ino = 1; - setup_inode(inode, cramfs_inode); - } - } else { - inode = iget_locked(sb, CRAMINO(cramfs_inode)); - if (inode && (inode->i_state & I_NEW)) { - setup_inode(inode, cramfs_inode); - unlock_new_inode(inode); - } - } + unlock_new_inode(inode); + return inode; } @@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "cramfs: root is not a directory\n"); goto out; } + /* correct strange, hard-coded permissions of mkcramfs */ + super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + root_offset = super.root.offset << 2; if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { sbi->size=super.size; @@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* Set it all up.. 
*/ sb->s_op = &cramfs_ops; - root = get_cramfs_inode(sb, &super.root); + root = get_cramfs_inode(sb, &super.root, 0); if (!root) goto out; sb->s_root = d_alloc_root(root); @@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) */ namelen = de->namelen << 2; memcpy(buf, name, namelen); - ino = CRAMINO(de); + ino = cramino(de, OFFSET(inode) + offset); mode = de->mode; mutex_unlock(&read_mutex); nextoffset = offset + sizeof(*de) + namelen; @@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s struct cramfs_inode *de; char *name; int namelen, retval; + int dir_off = OFFSET(dir) + offset; - de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); + de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN); name = (char *)(de+1); /* Try to take advantage of sorted directories */ @@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s if (!retval) { struct cramfs_inode entry = *de; mutex_unlock(&read_mutex); - d_add(dentry, get_cramfs_inode(dir->i_sb, &entry)); + d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off)); return NULL; } /* else (retval < 0) */ diff --git a/fs/dcache.c b/fs/dcache.c index 23702a9d4e6..2a6bd9a4ae9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -33,20 +33,58 @@ #include <linux/bootmem.h> #include <linux/fs_struct.h> #include <linux/hardirq.h> +#include <linux/bit_spinlock.h> +#include <linux/rculist_bl.h> #include "internal.h" +/* + * Usage: + * dcache->d_inode->i_lock protects: + * - i_dentry, d_alias, d_inode of aliases + * dcache_hash_bucket lock protects: + * - the dcache hash table + * s_anon bl list spinlock protects: + * - the s_anon list (see __d_drop) + * dcache_lru_lock protects: + * - the dcache lru lists and counters + * d_lock protects: + * - d_flags + * - d_name + * - d_lru + * - d_count + * - d_unhashed() + * - d_parent and d_subdirs + * - childrens' d_child 
and d_parent + * - d_alias, d_inode + * + * Ordering: + * dentry->d_inode->i_lock + * dentry->d_lock + * dcache_lru_lock + * dcache_hash_bucket lock + * s_anon lock + * + * If there is an ancestor relationship: + * dentry->d_parent->...->d_parent->d_lock + * ... + * dentry->d_parent->d_lock + * dentry->d_lock + * + * If no ancestor relationship: + * if (dentry1 < dentry2) + * dentry1->d_lock + * dentry2->d_lock + */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); - __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); -EXPORT_SYMBOL(dcache_lock); +EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; -#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) - /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try @@ -60,22 +98,51 @@ static struct kmem_cache *dentry_cache __read_mostly; static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; -static struct hlist_head *dentry_hashtable __read_mostly; + +struct dcache_hash_bucket { + struct hlist_bl_head head; +}; +static struct dcache_hash_bucket *dentry_hashtable __read_mostly; + +static inline struct dcache_hash_bucket *d_hash(struct dentry *parent, + unsigned long hash) +{ + hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; + hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); + return dentry_hashtable + (hash & D_HASHMASK); +} + +static inline void spin_lock_bucket(struct dcache_hash_bucket *b) +{ + bit_spin_lock(0, (unsigned long *)&b->head.first); +} + +static inline void spin_unlock_bucket(struct dcache_hash_bucket *b) +{ + __bit_spin_unlock(0, (unsigned long *)&b->head.first); +} /* Statistics gathering. 
*/ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static struct percpu_counter nr_dentry __cacheline_aligned_in_smp; -static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp; +static DEFINE_PER_CPU(unsigned int, nr_dentry); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +static int get_nr_dentry(void) +{ + int i; + int sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry, i); + return sum < 0 ? 0 : sum; +} + int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry); - dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused); + dentry_stat.nr_dentry = get_nr_dentry(); return proc_dointvec(table, write, buffer, lenp, ppos); } #endif @@ -91,35 +158,51 @@ static void __d_free(struct rcu_head *head) } /* - * no dcache_lock, please. + * no locks, please. */ static void d_free(struct dentry *dentry) { - percpu_counter_dec(&nr_dentry); + BUG_ON(dentry->d_count); + this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); /* if dentry was never inserted into hash, immediate free is OK */ - if (hlist_unhashed(&dentry->d_hash)) + if (hlist_bl_unhashed(&dentry->d_hash)) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } +/** + * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * @dentry: the target dentry + * After this call, in-progress rcu-walk path lookup will fail. This + * should be called after unhashing, and after changing d_inode (if + * the dentry has not already been unhashed). + */ +static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +{ + assert_spin_locked(&dentry->d_lock); + /* Go through a barrier */ + write_seqcount_barrier(&dentry->d_seq); +} + /* * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. + * d_iput() operation if defined. 
Dentry has no refcount + * and is unhashed. */ static void dentry_iput(struct dentry * dentry) __releases(dentry->d_lock) - __releases(dcache_lock) + __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; if (inode) { dentry->d_inode = NULL; list_del_init(&dentry->d_alias); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) @@ -128,65 +211,191 @@ static void dentry_iput(struct dentry * dentry) iput(inode); } else { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } } /* - * dentry_lru_(add|del|move_tail) must be called with dcache_lock held. + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. dentry remains in-use. + */ +static void dentry_unlink_inode(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dentry->d_inode->i_lock) +{ + struct inode *inode = dentry->d_inode; + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); +} + +/* + * dentry_lru_(add|del|move_tail) must be called with d_lock held. 
*/ static void dentry_lru_add(struct dentry *dentry) { if (list_empty(&dentry->d_lru)) { + spin_lock(&dcache_lru_lock); list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); dentry->d_sb->s_nr_dentry_unused++; - percpu_counter_inc(&nr_dentry_unused); + dentry_stat.nr_unused++; + spin_unlock(&dcache_lru_lock); } } +static void __dentry_lru_del(struct dentry *dentry) +{ + list_del_init(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; +} + static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { - list_del_init(&dentry->d_lru); - dentry->d_sb->s_nr_dentry_unused--; - percpu_counter_dec(&nr_dentry_unused); + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); } } static void dentry_lru_move_tail(struct dentry *dentry) { + spin_lock(&dcache_lru_lock); if (list_empty(&dentry->d_lru)) { list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); dentry->d_sb->s_nr_dentry_unused++; - percpu_counter_inc(&nr_dentry_unused); + dentry_stat.nr_unused++; } else { list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); } + spin_unlock(&dcache_lru_lock); } /** * d_kill - kill dentry and return parent * @dentry: dentry to kill + * @parent: parent dentry * * The dentry must already be unhashed and removed from the LRU. * * If this is the root of the dentry tree, return NULL. + * + * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by + * d_kill. 
*/ -static struct dentry *d_kill(struct dentry *dentry) +static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) __releases(dentry->d_lock) - __releases(dcache_lock) + __releases(parent->d_lock) + __releases(dentry->d_inode->i_lock) { - struct dentry *parent; - + dentry->d_parent = NULL; list_del(&dentry->d_u.d_child); - /*drops the locks, at that point nobody can reach this dentry */ + if (parent) + spin_unlock(&parent->d_lock); dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + d_free(dentry); + return parent; +} + +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent dentry hashes, so that it won't + * be found through a VFS lookup any more. Note that this is different from + * deleting the dentry - d_delete will try to mark the dentry negative if + * possible, giving a successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants to invalidate a dentry for some + * reason (NFS timeouts or autofs deletes). + * + * __d_drop requires dentry->d_lock. + */ +void __d_drop(struct dentry *dentry) +{ + if (!(dentry->d_flags & DCACHE_UNHASHED)) { + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) { + bit_spin_lock(0, + (unsigned long *)&dentry->d_sb->s_anon.first); + dentry->d_flags |= DCACHE_UNHASHED; + hlist_bl_del_init(&dentry->d_hash); + __bit_spin_unlock(0, + (unsigned long *)&dentry->d_sb->s_anon.first); + } else { + struct dcache_hash_bucket *b; + b = d_hash(dentry->d_parent, dentry->d_name.hash); + spin_lock_bucket(b); + /* + * We may not actually need to put DCACHE_UNHASHED + * manipulations under the hash lock, but follow + * the principle of least surprise. 
+ */ + dentry->d_flags |= DCACHE_UNHASHED; + hlist_bl_del_rcu(&dentry->d_hash); + spin_unlock_bucket(b); + dentry_rcuwalk_barrier(dentry); + } + } +} +EXPORT_SYMBOL(__d_drop); + +void d_drop(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_drop); + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) + __releases(dentry->d_lock) +{ + struct inode *inode; + struct dentry *parent; + + inode = dentry->d_inode; + if (inode && !spin_trylock(&inode->i_lock)) { +relock: + spin_unlock(&dentry->d_lock); + cpu_relax(); + return dentry; /* try again with same dentry */ + } if (IS_ROOT(dentry)) parent = NULL; else parent = dentry->d_parent; - d_free(dentry); - return parent; + if (parent && !spin_trylock(&parent->d_lock)) { + if (inode) + spin_unlock(&inode->i_lock); + goto relock; + } + + if (ref) + dentry->d_count--; + /* if dentry was on the d_lru list delete it from there */ + dentry_lru_del(dentry); + /* if it was on the hash then remove it */ + __d_drop(dentry); + return d_kill(dentry, parent); } /* @@ -214,34 +423,26 @@ static struct dentry *d_kill(struct dentry *dentry) * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. - * - * no dcache lock, please. 
*/ - void dput(struct dentry *dentry) { if (!dentry) return; repeat: - if (atomic_read(&dentry->d_count) == 1) + if (dentry->d_count == 1) might_sleep(); - if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) - return; - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { + BUG_ON(!dentry->d_count); + if (dentry->d_count > 1) { + dentry->d_count--; spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return; } - /* - * AV: ->d_delete() is _NOT_ allowed to block now. - */ - if (dentry->d_op && dentry->d_op->d_delete) { + if (dentry->d_flags & DCACHE_OP_DELETE) { if (dentry->d_op->d_delete(dentry)) - goto unhash_it; + goto kill_it; } /* Unreachable? Get rid of it */ @@ -252,16 +453,12 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + dentry->d_count--; + spin_unlock(&dentry->d_lock); return; -unhash_it: - __d_drop(dentry); kill_it: - /* if dentry was on the d_lru list delete it from there */ - dentry_lru_del(dentry); - dentry = d_kill(dentry); + dentry = dentry_kill(dentry, 1); if (dentry) goto repeat; } @@ -284,9 +481,9 @@ int d_invalidate(struct dentry * dentry) /* * If it's already been dropped, return OK. */ - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); return 0; } /* @@ -294,9 +491,9 @@ int d_invalidate(struct dentry * dentry) * to get rid of unused child entries. */ if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); } /* @@ -309,35 +506,61 @@ int d_invalidate(struct dentry * dentry) * we might still populate it if it was a * working directory or similar). 
*/ - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) > 1) { + if (dentry->d_count > 1) { if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return -EBUSY; } } __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 0; } EXPORT_SYMBOL(d_invalidate); -/* This should be called _only_ with dcache_lock held */ -static inline struct dentry * __dget_locked(struct dentry *dentry) +/* This must be called with d_lock held */ +static inline void __dget_dlock(struct dentry *dentry) { - atomic_inc(&dentry->d_count); - dentry_lru_del(dentry); - return dentry; + dentry->d_count++; } -struct dentry * dget_locked(struct dentry *dentry) +static inline void __dget(struct dentry *dentry) { - return __dget_locked(dentry); + spin_lock(&dentry->d_lock); + __dget_dlock(dentry); + spin_unlock(&dentry->d_lock); +} + +struct dentry *dget_parent(struct dentry *dentry) +{ + struct dentry *ret; + +repeat: + /* + * Don't need rcu_dereference because we re-check it was correct under + * the lock. + */ + rcu_read_lock(); + ret = dentry->d_parent; + if (!ret) { + rcu_read_unlock(); + goto out; + } + spin_lock(&ret->d_lock); + if (unlikely(ret != dentry->d_parent)) { + spin_unlock(&ret->d_lock); + rcu_read_unlock(); + goto repeat; + } + rcu_read_unlock(); + BUG_ON(!ret->d_count); + ret->d_count++; + spin_unlock(&ret->d_lock); +out: + return ret; } -EXPORT_SYMBOL(dget_locked); +EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode @@ -355,42 +578,51 @@ EXPORT_SYMBOL(dget_locked); * any other hashed alias over that one unless @want_discon is set, * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. 
*/ - -static struct dentry * __d_find_alias(struct inode *inode, int want_discon) +static struct dentry *__d_find_alias(struct inode *inode, int want_discon) { - struct list_head *head, *next, *tmp; - struct dentry *alias, *discon_alias=NULL; + struct dentry *alias, *discon_alias; - head = &inode->i_dentry; - next = inode->i_dentry.next; - while (next != head) { - tmp = next; - next = tmp->next; - prefetch(next); - alias = list_entry(tmp, struct dentry, d_alias); +again: + discon_alias = NULL; + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { if (IS_ROOT(alias) && - (alias->d_flags & DCACHE_DISCONNECTED)) + (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; - else if (!want_discon) { - __dget_locked(alias); + } else if (!want_discon) { + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; + } + } + spin_unlock(&alias->d_lock); + } + if (discon_alias) { + alias = discon_alias; + spin_lock(&alias->d_lock); + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + if (IS_ROOT(alias) && + (alias->d_flags & DCACHE_DISCONNECTED)) { + __dget_dlock(alias); + spin_unlock(&alias->d_lock); return alias; } } + spin_unlock(&alias->d_lock); + goto again; } - if (discon_alias) - __dget_locked(discon_alias); - return discon_alias; + return NULL; } -struct dentry * d_find_alias(struct inode *inode) +struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!list_empty(&inode->i_dentry)) { - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); de = __d_find_alias(inode, 0); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } return de; } @@ -404,54 +636,61 @@ void d_prune_aliases(struct inode *inode) { struct dentry *dentry; restart: - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); list_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); - if (!atomic_read(&dentry->d_count)) { - __dget_locked(dentry); 
+ if (!dentry->d_count) { + __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); dput(dentry); goto restart; } spin_unlock(&dentry->d_lock); } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_prune_aliases); /* - * Throw away a dentry - free the inode, dput the parent. This requires that - * the LRU list has already been removed. + * Try to throw away a dentry - free the inode, dput the parent. + * Requires dentry->d_lock is held, and dentry->d_count == 0. + * Releases dentry->d_lock. * - * Try to prune ancestors as well. This is necessary to prevent - * quadratic behavior of shrink_dcache_parent(), but is also expected - * to be beneficial in reducing dentry cache fragmentation. + * This may fail if locks cannot be acquired no problem, just try again. */ -static void prune_one_dentry(struct dentry * dentry) +static void try_prune_one_dentry(struct dentry *dentry) __releases(dentry->d_lock) - __releases(dcache_lock) - __acquires(dcache_lock) { - __d_drop(dentry); - dentry = d_kill(dentry); + struct dentry *parent; + parent = dentry_kill(dentry, 0); /* - * Prune ancestors. Locking is simpler than in dput(), - * because dcache_lock needs to be taken anyway. + * If dentry_kill returns NULL, we have nothing more to do. + * if it returns the same dentry, trylocks failed. In either + * case, just loop again. + * + * Otherwise, we need to prune ancestors too. This is necessary + * to prevent quadratic behavior of shrink_dcache_parent(), but + * is also expected to be beneficial in reducing dentry cache + * fragmentation. */ - spin_lock(&dcache_lock); + if (!parent) + return; + if (parent == dentry) + return; + + /* Prune ancestors. 
*/ + dentry = parent; while (dentry) { - if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) + spin_lock(&dentry->d_lock); + if (dentry->d_count > 1) { + dentry->d_count--; + spin_unlock(&dentry->d_lock); return; - - if (dentry->d_op && dentry->d_op->d_delete) - dentry->d_op->d_delete(dentry); - dentry_lru_del(dentry); - __d_drop(dentry); - dentry = d_kill(dentry); - spin_lock(&dcache_lock); + } + dentry = dentry_kill(dentry, 1); } } @@ -459,24 +698,35 @@ static void shrink_dentry_list(struct list_head *list) { struct dentry *dentry; - while (!list_empty(list)) { - dentry = list_entry(list->prev, struct dentry, d_lru); - dentry_lru_del(dentry); + rcu_read_lock(); + for (;;) { + dentry = list_entry_rcu(list->prev, struct dentry, d_lru); + if (&dentry->d_lru == list) + break; /* empty */ + spin_lock(&dentry->d_lock); + if (dentry != list_entry(list->prev, struct dentry, d_lru)) { + spin_unlock(&dentry->d_lock); + continue; + } /* * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. 
*/ - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { + if (dentry->d_count) { + dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; } - prune_one_dentry(dentry); - /* dentry->d_lock was dropped in prune_one_dentry() */ - cond_resched_lock(&dcache_lock); + + rcu_read_unlock(); + + try_prune_one_dentry(dentry); + + rcu_read_lock(); } + rcu_read_unlock(); } /** @@ -495,42 +745,44 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) LIST_HEAD(tmp); int cnt = *count; - spin_lock(&dcache_lock); +relock: + spin_lock(&dcache_lru_lock); while (!list_empty(&sb->s_dentry_lru)) { dentry = list_entry(sb->s_dentry_lru.prev, struct dentry, d_lru); BUG_ON(dentry->d_sb != sb); + if (!spin_trylock(&dentry->d_lock)) { + spin_unlock(&dcache_lru_lock); + cpu_relax(); + goto relock; + } + /* * If we are honouring the DCACHE_REFERENCED flag and the * dentry has this flag set, don't free it. Clear the flag * and put it back on the LRU. */ - if (flags & DCACHE_REFERENCED) { - spin_lock(&dentry->d_lock); - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_move(&dentry->d_lru, &referenced); - spin_unlock(&dentry->d_lock); - cond_resched_lock(&dcache_lock); - continue; - } + if (flags & DCACHE_REFERENCED && + dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_move(&dentry->d_lru, &referenced); spin_unlock(&dentry->d_lock); + } else { + list_move_tail(&dentry->d_lru, &tmp); + spin_unlock(&dentry->d_lock); + if (!--cnt) + break; } - - list_move_tail(&dentry->d_lru, &tmp); - if (!--cnt) - break; - cond_resched_lock(&dcache_lock); + cond_resched_lock(&dcache_lru_lock); } - - *count = cnt; - shrink_dentry_list(&tmp); - if (!list_empty(&referenced)) list_splice(&referenced, &sb->s_dentry_lru); - spin_unlock(&dcache_lock); + spin_unlock(&dcache_lru_lock); + shrink_dentry_list(&tmp); + + *count = cnt; } /** @@ -546,13 +798,12 @@ static void prune_dcache(int count) { 
struct super_block *sb, *p = NULL; int w_count; - int unused = percpu_counter_sum_positive(&nr_dentry_unused); + int unused = dentry_stat.nr_unused; int prune_ratio; int pruned; if (unused == 0 || count == 0) return; - spin_lock(&dcache_lock); if (count >= unused) prune_ratio = 1; else @@ -589,11 +840,9 @@ static void prune_dcache(int count) if (down_read_trylock(&sb->s_umount)) { if ((sb->s_root != NULL) && (!list_empty(&sb->s_dentry_lru))) { - spin_unlock(&dcache_lock); __shrink_dcache_sb(sb, &w_count, DCACHE_REFERENCED); pruned -= w_count; - spin_lock(&dcache_lock); } up_read(&sb->s_umount); } @@ -609,7 +858,6 @@ static void prune_dcache(int count) if (p) __put_super(p); spin_unlock(&sb_lock); - spin_unlock(&dcache_lock); } /** @@ -623,12 +871,14 @@ void shrink_dcache_sb(struct super_block *sb) { LIST_HEAD(tmp); - spin_lock(&dcache_lock); + spin_lock(&dcache_lru_lock); while (!list_empty(&sb->s_dentry_lru)) { list_splice_init(&sb->s_dentry_lru, &tmp); + spin_unlock(&dcache_lru_lock); shrink_dentry_list(&tmp); + spin_lock(&dcache_lru_lock); } - spin_unlock(&dcache_lock); + spin_unlock(&dcache_lru_lock); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -645,10 +895,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) BUG_ON(!IS_ROOT(dentry)); /* detach this root from the system */ - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); dentry_lru_del(dentry); __d_drop(dentry); - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); for (;;) { /* descend to the first leaf in the current subtree */ @@ -657,14 +907,16 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* this is a branch with children - detach all of them * from the system in one go */ - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { + spin_lock_nested(&loop->d_lock, + DENTRY_D_LOCK_NESTED); dentry_lru_del(loop); __d_drop(loop); - cond_resched_lock(&dcache_lock); + spin_unlock(&loop->d_lock); } - 
spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); /* move to the first child */ dentry = list_entry(dentry->d_subdirs.next, @@ -676,7 +928,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) do { struct inode *inode; - if (atomic_read(&dentry->d_count) != 0) { + if (dentry->d_count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" " still in use (%d)" @@ -685,20 +937,23 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry->d_name.name, - atomic_read(&dentry->d_count), + dentry->d_count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); BUG(); } - if (IS_ROOT(dentry)) + if (IS_ROOT(dentry)) { parent = NULL; - else { + list_del(&dentry->d_u.d_child); + } else { parent = dentry->d_parent; - atomic_dec(&parent->d_count); + spin_lock(&parent->d_lock); + parent->d_count--; + list_del(&dentry->d_u.d_child); + spin_unlock(&parent->d_lock); } - list_del(&dentry->d_u.d_child); detached++; inode = dentry->d_inode; @@ -728,8 +983,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* * destroy the dentries attached to a superblock on unmounting - * - we don't need to use dentry->d_lock, and only need dcache_lock when - * removing the dentry from the system lists and hashes because: + * - we don't need to use dentry->d_lock because: * - the superblock is detached from all mountings and open files, so the * dentry trees will not be rearranged by the VFS * - s_umount is write-locked, so the memory pressure shrinker will ignore @@ -746,11 +1000,13 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - atomic_dec(&dentry->d_count); + spin_lock(&dentry->d_lock); + dentry->d_count--; + spin_unlock(&dentry->d_lock); shrink_dcache_for_umount_subtree(dentry); - while (!hlist_empty(&sb->s_anon)) { - dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash); + while (!hlist_bl_empty(&sb->s_anon)) { + dentry = 
hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); shrink_dcache_for_umount_subtree(dentry); } } @@ -768,15 +1024,20 @@ void shrink_dcache_for_umount(struct super_block *sb) * Return true if the parent or its subdirectories contain * a mount point */ - int have_submounts(struct dentry *parent) { - struct dentry *this_parent = parent; + struct dentry *this_parent; struct list_head *next; + unsigned seq; + int locked = 0; + + seq = read_seqbegin(&rename_lock); +again: + this_parent = parent; - spin_lock(&dcache_lock); if (d_mountpoint(parent)) goto positive; + spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; resume: @@ -784,27 +1045,65 @@ resume: struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); /* Have we found a mount point ? */ - if (d_mountpoint(dentry)) + if (d_mountpoint(dentry)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&this_parent->d_lock); goto positive; + } if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } + spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. 
*/ if (this_parent != parent) { - next = this_parent->d_u.d_child.next; - this_parent = this_parent->d_parent; + struct dentry *tmp; + struct dentry *child; + + tmp = this_parent->d_parent; + rcu_read_lock(); + spin_unlock(&this_parent->d_lock); + child = this_parent; + this_parent = tmp; + spin_lock(&this_parent->d_lock); + /* might go back up the wrong parent if we have had a rename + * or deletion */ + if (this_parent != child->d_parent || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + rcu_read_unlock(); + next = child->d_u.d_child.next; goto resume; } - spin_unlock(&dcache_lock); + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); return 0; /* No mount points found in tree */ positive: - spin_unlock(&dcache_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); return 1; + +rename_retry: + locked = 1; + write_seqlock(&rename_lock); + goto again; } EXPORT_SYMBOL(have_submounts); @@ -824,11 +1123,16 @@ EXPORT_SYMBOL(have_submounts); */ static int select_parent(struct dentry * parent) { - struct dentry *this_parent = parent; + struct dentry *this_parent; struct list_head *next; + unsigned seq; int found = 0; + int locked = 0; - spin_lock(&dcache_lock); + seq = read_seqbegin(&rename_lock); +again: + this_parent = parent; + spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; resume: @@ -837,11 +1141,13 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ - if (!atomic_read(&dentry->d_count)) { + if (!dentry->d_count) { dentry_lru_move_tail(dentry); found++; } else { @@ -853,28 +1159,63 @@ 
resume: * ensures forward progress). We'll be coming back to find * the rest. */ - if (found && need_resched()) + if (found && need_resched()) { + spin_unlock(&dentry->d_lock); goto out; + } /* * Descend a level if the d_subdirs list is non-empty. */ if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } + + spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_u.d_child.next; - this_parent = this_parent->d_parent; + struct dentry *tmp; + struct dentry *child; + + tmp = this_parent->d_parent; + rcu_read_lock(); + spin_unlock(&this_parent->d_lock); + child = this_parent; + this_parent = tmp; + spin_lock(&this_parent->d_lock); + /* might go back up the wrong parent if we have had a rename + * or deletion */ + if (this_parent != child->d_parent || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + rcu_read_unlock(); + next = child->d_u.d_child.next; goto resume; } out: - spin_unlock(&dcache_lock); + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); return found; + +rename_retry: + if (found) + return found; + locked = 1; + write_seqlock(&rename_lock); + goto again; } /** @@ -908,16 +1249,13 @@ EXPORT_SYMBOL(shrink_dcache_parent); */ static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { - int nr_unused; - if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; prune_dcache(nr); } - nr_unused = percpu_counter_sum_positive(&nr_dentry_unused); - return (nr_unused / 100) * sysctl_vfs_cache_pressure; + return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } static struct shrinker 
dcache_shrinker = { @@ -960,38 +1298,54 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) memcpy(dname, name->name, name->len); dname[name->len] = 0; - atomic_set(&dentry->d_count, 1); + dentry->d_count = 1; dentry->d_flags = DCACHE_UNHASHED; spin_lock_init(&dentry->d_lock); + seqcount_init(&dentry->d_seq); dentry->d_inode = NULL; dentry->d_parent = NULL; dentry->d_sb = NULL; dentry->d_op = NULL; dentry->d_fsdata = NULL; - dentry->d_mounted = 0; - INIT_HLIST_NODE(&dentry->d_hash); + INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); INIT_LIST_HEAD(&dentry->d_alias); + INIT_LIST_HEAD(&dentry->d_u.d_child); if (parent) { - dentry->d_parent = dget(parent); + spin_lock(&parent->d_lock); + /* + * don't need child lock because it is not subject + * to concurrency here + */ + __dget_dlock(parent); + dentry->d_parent = parent; dentry->d_sb = parent->d_sb; - } else { - INIT_LIST_HEAD(&dentry->d_u.d_child); - } - - spin_lock(&dcache_lock); - if (parent) + d_set_d_op(dentry, dentry->d_sb->s_d_op); list_add(&dentry->d_u.d_child, &parent->d_subdirs); - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); + } - percpu_counter_inc(&nr_dentry); + this_cpu_inc(nr_dentry); return dentry; } EXPORT_SYMBOL(d_alloc); +struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) +{ + struct dentry *dentry = d_alloc(NULL, name); + if (dentry) { + dentry->d_sb = sb; + d_set_d_op(dentry, dentry->d_sb->s_d_op); + dentry->d_parent = dentry; + dentry->d_flags |= DCACHE_DISCONNECTED; + } + return dentry; +} +EXPORT_SYMBOL(d_alloc_pseudo); + struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; @@ -1003,12 +1357,39 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) } EXPORT_SYMBOL(d_alloc_name); -/* the caller must hold dcache_lock */ +void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +{ + WARN_ON_ONCE(dentry->d_op); 
+ WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | + DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | + DCACHE_OP_DELETE )); + dentry->d_op = op; + if (!op) + return; + if (op->d_hash) + dentry->d_flags |= DCACHE_OP_HASH; + if (op->d_compare) + dentry->d_flags |= DCACHE_OP_COMPARE; + if (op->d_revalidate) + dentry->d_flags |= DCACHE_OP_REVALIDATE; + if (op->d_delete) + dentry->d_flags |= DCACHE_OP_DELETE; + +} +EXPORT_SYMBOL(d_set_d_op); + static void __d_instantiate(struct dentry *dentry, struct inode *inode) { - if (inode) + spin_lock(&dentry->d_lock); + if (inode) { + if (unlikely(IS_AUTOMOUNT(inode))) + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; list_add(&dentry->d_alias, &inode->i_dentry); + } dentry->d_inode = inode; + dentry_rcuwalk_barrier(dentry); + spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } @@ -1030,9 +1411,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); + if (inode) + spin_lock(&inode->i_lock); __d_instantiate(entry, inode); - spin_unlock(&dcache_lock); + if (inode) + spin_unlock(&inode->i_lock); security_d_instantiate(entry, inode); } EXPORT_SYMBOL(d_instantiate); @@ -1069,15 +1452,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, list_for_each_entry(alias, &inode->i_dentry, d_alias) { struct qstr *qstr = &alias->d_name; + /* + * Don't need alias->d_lock here, because aliases with + * d_parent == entry->d_parent are not subject to name or + * parent changes, because the parent inode i_mutex is held. 
+ */ if (qstr->hash != hash) continue; if (alias->d_parent != entry->d_parent) continue; - if (qstr->len != len) + if (dentry_cmp(qstr->name, qstr->len, name, len)) continue; - if (memcmp(qstr->name, name, len)) - continue; - dget_locked(alias); + __dget(alias); return alias; } @@ -1091,9 +1477,11 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); + if (inode) + spin_lock(&inode->i_lock); result = __d_instantiate_unique(entry, inode); - spin_unlock(&dcache_lock); + if (inode) + spin_unlock(&inode->i_lock); if (!result) { security_d_instantiate(entry, inode); @@ -1126,6 +1514,7 @@ struct dentry * d_alloc_root(struct inode * root_inode) res = d_alloc(NULL, &name); if (res) { res->d_sb = root_inode->i_sb; + d_set_d_op(res, res->d_sb->s_d_op); res->d_parent = res; d_instantiate(res, root_inode); } @@ -1134,14 +1523,6 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); -static inline struct hlist_head *d_hash(struct dentry *parent, - unsigned long hash) -{ - hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; - hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); - return dentry_hashtable + (hash & D_HASHMASK); -} - /** * d_obtain_alias - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for @@ -1182,10 +1563,11 @@ struct dentry *d_obtain_alias(struct inode *inode) } tmp->d_parent = tmp; /* make sure dput doesn't croak */ - spin_lock(&dcache_lock); + + spin_lock(&inode->i_lock); res = __d_find_alias(inode, 0); if (res) { - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); dput(tmp); goto out_iput; } @@ -1193,14 +1575,17 @@ struct dentry *d_obtain_alias(struct inode *inode) /* attach a disconnected dentry */ spin_lock(&tmp->d_lock); tmp->d_sb = inode->i_sb; + d_set_d_op(tmp, tmp->d_sb->s_d_op); tmp->d_inode = inode; tmp->d_flags |= DCACHE_DISCONNECTED; - tmp->d_flags &= 
~DCACHE_UNHASHED; list_add(&tmp->d_alias, &inode->i_dentry); - hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon); + bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first); + tmp->d_flags &= ~DCACHE_UNHASHED; + hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); + __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first); spin_unlock(&tmp->d_lock); + spin_unlock(&inode->i_lock); - spin_unlock(&dcache_lock); return tmp; out_iput: @@ -1230,18 +1615,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) struct dentry *new = NULL; if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); new = __d_find_alias(inode, 1); if (new) { BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); d_move(new, dentry); iput(inode); } else { - /* already taking dcache_lock, so d_add() by hand */ + /* already taking inode->i_lock, so d_add() by hand */ __d_instantiate(dentry, inode); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); security_d_instantiate(dentry, inode); d_rehash(dentry); } @@ -1314,10 +1699,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, * Negative dentry: instantiate it unless the inode is a directory and * already has a dentry. */ - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { __d_instantiate(found, inode); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); security_d_instantiate(found, inode); return found; } @@ -1327,8 +1712,8 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, * reference to it, move it in place and use it. 
*/ new = list_entry(inode->i_dentry.next, struct dentry, d_alias); - dget_locked(new); - spin_unlock(&dcache_lock); + __dget(new); + spin_unlock(&inode->i_lock); security_d_instantiate(found, inode); d_move(new, found); iput(inode); @@ -1342,6 +1727,112 @@ err_out: EXPORT_SYMBOL(d_add_ci); /** + * __d_lookup_rcu - search for a dentry (racy, store-free) + * @parent: parent dentry + * @name: qstr of name we wish to find + * @seq: returns d_seq value at the point where the dentry was found + * @inode: returns dentry->d_inode when the inode was found valid. + * Returns: dentry, or NULL + * + * __d_lookup_rcu is the dcache lookup function for rcu-walk name + * resolution (store-free path walking) design described in + * Documentation/filesystems/path-lookup.txt. + * + * This is not to be used outside core vfs. + * + * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock + * held, and rcu_read_lock held. The returned dentry must not be stored into + * without taking d_lock and checking d_seq sequence count against @seq + * returned here. + * + * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * function. + * + * Alternatively, __d_lookup_rcu may be called again to look up the child of + * the returned dentry, so long as its parent's seqlock is checked after the + * child is looked up. Thus, an interlocking stepping of sequence lock checks + * is formed, giving integrity down the path walk. 
+ */ +struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct dcache_hash_bucket *b = d_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Carefully use d_seq when comparing a candidate dentry, to avoid + * races with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/vfs/dcache-locking.txt for more details. + */ + hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { + struct inode *i; + const char *tname; + int tlen; + + if (dentry->d_name.hash != hash) + continue; + +seqretry: + *seq = read_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + i = dentry->d_inode; + prefetch(tname); + if (i) + prefetch(i); + /* + * This seqcount check is required to ensure name and + * len are loaded atomically, so as not to walk off the + * edge of memory when walking. If we could load this + * atomically some other way, we could drop this check. 
+ */ + if (read_seqcount_retry(&dentry->d_seq, *seq)) + goto seqretry; + if (parent->d_flags & DCACHE_OP_COMPARE) { + if (parent->d_op->d_compare(parent, *inode, + dentry, i, + tlen, tname, name)) + continue; + } else { + if (dentry_cmp(tname, tlen, str, len)) + continue; + } + /* + * No extra seqcount check is required after the name + * compare. The caller must perform a seqcount check in + * order to do anything useful with the returned dentry + * anyway. + */ + *inode = i; + return dentry; + } + return NULL; +} + +/** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find @@ -1352,10 +1843,10 @@ EXPORT_SYMBOL(d_add_ci); * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ -struct dentry * d_lookup(struct dentry * parent, struct qstr * name) +struct dentry *d_lookup(struct dentry *parent, struct qstr *name) { - struct dentry * dentry = NULL; - unsigned long seq; + struct dentry *dentry; + unsigned seq; do { seq = read_seqbegin(&rename_lock); @@ -1367,7 +1858,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name) } EXPORT_SYMBOL(d_lookup); -/* +/** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find @@ -1382,17 +1873,24 @@ EXPORT_SYMBOL(d_lookup); * * __d_lookup callers must be commented. 
*/ -struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) +struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) { unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; - struct hlist_head *head = d_hash(parent,hash); + struct dcache_hash_bucket *b = d_hash(parent, hash); + struct hlist_bl_node *node; struct dentry *found = NULL; - struct hlist_node *node; struct dentry *dentry; /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races @@ -1407,25 +1905,16 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) */ rcu_read_lock(); - hlist_for_each_entry_rcu(dentry, node, head, d_hash) { - struct qstr *qstr; + hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { + const char *tname; + int tlen; if (dentry->d_name.hash != hash) continue; - if (dentry->d_parent != parent) - continue; spin_lock(&dentry->d_lock); - - /* - * Recheck the dentry after taking the lock - d_move may have - * changed things. Don't bother checking the hash because - * we're about to compare the whole name anyway. - */ if (dentry->d_parent != parent) goto next; - - /* non-existing due to RCU? */ if (d_unhashed(dentry)) goto next; @@ -1433,18 +1922,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). 
*/ - qstr = &dentry->d_name; - if (parent->d_op && parent->d_op->d_compare) { - if (parent->d_op->d_compare(parent, qstr, name)) + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + if (parent->d_flags & DCACHE_OP_COMPARE) { + if (parent->d_op->d_compare(parent, parent->d_inode, + dentry, dentry->d_inode, + tlen, tname, name)) goto next; } else { - if (qstr->len != len) - goto next; - if (memcmp(qstr->name, str, len)) + if (dentry_cmp(tname, tlen, str, len)) goto next; } - atomic_inc(&dentry->d_count); + dentry->d_count++; found = dentry; spin_unlock(&dentry->d_lock); break; @@ -1473,8 +1963,8 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(name->name, name->len); - if (dir->d_op && dir->d_op->d_hash) { - if (dir->d_op->d_hash(dir, name) < 0) + if (dir->d_flags & DCACHE_OP_HASH) { + if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0) goto out; } dentry = d_lookup(dir, name); @@ -1483,34 +1973,32 @@ out: } /** - * d_validate - verify dentry provided from insecure source + * d_validate - verify dentry provided from insecure source (deprecated) * @dentry: The dentry alleged to be valid child of @dparent * @dparent: The parent dentry (known to be valid) * * An insecure source has sent us a dentry, here we verify it and dget() it. * This is used by ncpfs in its readdir implementation. * Zero is returned in the dentry is invalid. + * + * This function is slow for big directories, and deprecated, do not use it. */ -int d_validate(struct dentry *dentry, struct dentry *parent) +int d_validate(struct dentry *dentry, struct dentry *dparent) { - struct hlist_head *head = d_hash(parent, dentry->d_name.hash); - struct hlist_node *node; - struct dentry *d; + struct dentry *child; - /* Check whether the ptr might be valid at all.. 
*/ - if (!kmem_ptr_validate(dentry_cache, dentry)) - return 0; - if (dentry->d_parent != parent) - return 0; - - rcu_read_lock(); - hlist_for_each_entry_rcu(d, node, head, d_hash) { - if (d == dentry) { - dget(dentry); + spin_lock(&dparent->d_lock); + list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) { + if (dentry == child) { + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + __dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&dparent->d_lock); return 1; } } - rcu_read_unlock(); + spin_unlock(&dparent->d_lock); + return 0; } EXPORT_SYMBOL(d_validate); @@ -1538,16 +2026,23 @@ EXPORT_SYMBOL(d_validate); void d_delete(struct dentry * dentry) { + struct inode *inode; int isdir = 0; /* * Are we the only user? */ - spin_lock(&dcache_lock); +again: spin_lock(&dentry->d_lock); - isdir = S_ISDIR(dentry->d_inode->i_mode); - if (atomic_read(&dentry->d_count) == 1) { + inode = dentry->d_inode; + isdir = S_ISDIR(inode->i_mode); + if (dentry->d_count == 1) { + if (inode && !spin_trylock(&inode->i_lock)) { + spin_unlock(&dentry->d_lock); + cpu_relax(); + goto again; + } dentry->d_flags &= ~DCACHE_CANT_MOUNT; - dentry_iput(dentry); + dentry_unlink_inode(dentry); fsnotify_nameremove(dentry, isdir); return; } @@ -1556,17 +2051,18 @@ void d_delete(struct dentry * dentry) __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); fsnotify_nameremove(dentry, isdir); } EXPORT_SYMBOL(d_delete); -static void __d_rehash(struct dentry * entry, struct hlist_head *list) +static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b) { - + BUG_ON(!d_unhashed(entry)); + spin_lock_bucket(b); entry->d_flags &= ~DCACHE_UNHASHED; - hlist_add_head_rcu(&entry->d_hash, list); + hlist_bl_add_head_rcu(&entry->d_hash, &b->head); + spin_unlock_bucket(b); } static void _d_rehash(struct dentry * entry) @@ -1583,25 +2079,39 @@ static void _d_rehash(struct dentry * entry) void d_rehash(struct dentry * entry) { - spin_lock(&dcache_lock); 
spin_lock(&entry->d_lock); _d_rehash(entry); spin_unlock(&entry->d_lock); - spin_unlock(&dcache_lock); } EXPORT_SYMBOL(d_rehash); -/* - * When switching names, the actual string doesn't strictly have to - * be preserved in the target - because we're dropping the target - * anyway. As such, we can just do a simple memcpy() to copy over - * the new name before we switch. +/** + * dentry_update_name_case - update case insensitive dentry with a new name + * @dentry: dentry to be updated + * @name: new name * - * Note that we have to be a lot more careful about getting the hash - * switched - we have to switch the hash value properly even if it - * then no longer matches the actual (corrupted) string of the target. - * The hash value has to match the hash queue that the dentry is on.. + * Update a case insensitive dentry with new case of name. + * + * dentry must have been returned by d_lookup with name @name. Old and new + * name lengths must match (ie. no d_compare which allows mismatched name + * lengths). + * + * Parent inode i_mutex must be held over d_lookup and into this call (to + * keep renames and concurrent inserts, and readdir(2) away). 
*/ +void dentry_update_name_case(struct dentry *dentry, struct qstr *name) +{ + BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ + + spin_lock(&dentry->d_lock); + write_seqcount_begin(&dentry->d_seq); + memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); + write_seqcount_end(&dentry->d_seq); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(dentry_update_name_case); + static void switch_names(struct dentry *dentry, struct dentry *target) { if (dname_external(target)) { @@ -1643,54 +2153,84 @@ static void switch_names(struct dentry *dentry, struct dentry *target) swap(dentry->d_name.len, target->d_name.len); } +static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) +{ + /* + * XXXX: do we really need to take target->d_lock? + */ + if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent) + spin_lock(&target->d_parent->d_lock); + else { + if (d_ancestor(dentry->d_parent, target->d_parent)) { + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&target->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } else { + spin_lock(&target->d_parent->d_lock); + spin_lock_nested(&dentry->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } + } + if (target < dentry) { + spin_lock_nested(&target->d_lock, 2); + spin_lock_nested(&dentry->d_lock, 3); + } else { + spin_lock_nested(&dentry->d_lock, 2); + spin_lock_nested(&target->d_lock, 3); + } +} + +static void dentry_unlock_parents_for_move(struct dentry *dentry, + struct dentry *target) +{ + if (target->d_parent != dentry->d_parent) + spin_unlock(&dentry->d_parent->d_lock); + if (target->d_parent != target) + spin_unlock(&target->d_parent->d_lock); +} + /* - * We cannibalize "target" when moving dentry on top of it, - * because it's going to be thrown away anyway. We could be more - * polite about it, though. 
- * - * This forceful removal will result in ugly /proc output if - * somebody holds a file open that got deleted due to a rename. - * We could be nicer about the deleted file, and let it show - * up under the name it had before it was deleted rather than - * under the original name of the file that was moved on top of it. + * When switching names, the actual string doesn't strictly have to + * be preserved in the target - because we're dropping the target + * anyway. As such, we can just do a simple memcpy() to copy over + * the new name before we switch. + * + * Note that we have to be a lot more careful about getting the hash + * switched - we have to switch the hash value properly even if it + * then no longer matches the actual (corrupted) string of the target. + * The hash value has to match the hash queue that the dentry is on.. */ - /* - * d_move_locked - move a dentry + * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. */ -static void d_move_locked(struct dentry * dentry, struct dentry * target) +void d_move(struct dentry * dentry, struct dentry * target) { - struct hlist_head *list; - if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + BUG_ON(d_ancestor(dentry, target)); + BUG_ON(d_ancestor(target, dentry)); + write_seqlock(&rename_lock); - /* - * XXXX: do we really need to take target->d_lock? 
- */ - if (target < dentry) { - spin_lock(&target->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - } else { - spin_lock(&dentry->d_lock); - spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED); - } - /* Move the dentry to the target hash queue, if on different bucket */ - if (d_unhashed(dentry)) - goto already_unhashed; + dentry_lock_for_move(dentry, target); + + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&target->d_seq); - hlist_del_rcu(&dentry->d_hash); + /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ -already_unhashed: - list = d_hash(target->d_parent, target->d_name.hash); - __d_rehash(dentry, list); + /* + * Move the dentry to the target hash queue. Don't bother checking + * for the same hash queue because of how unlikely it is. + */ + __d_drop(dentry); + __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); /* Unhash the target: dput() will then get rid of it */ __d_drop(target); @@ -1715,27 +2255,16 @@ already_unhashed: } list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + + write_seqcount_end(&target->d_seq); + write_seqcount_end(&dentry->d_seq); + + dentry_unlock_parents_for_move(dentry, target); spin_unlock(&target->d_lock); fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); } - -/** - * d_move - move a dentry - * @dentry: entry to move - * @target: new dentry - * - * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. 
- */ - -void d_move(struct dentry * dentry, struct dentry * target) -{ - spin_lock(&dcache_lock); - d_move_locked(dentry, target); - spin_unlock(&dcache_lock); -} EXPORT_SYMBOL(d_move); /** @@ -1761,13 +2290,13 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex and the dcache_lock + * dentry->d_parent->d_inode->i_mutex and the inode->i_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ -static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) - __releases(dcache_lock) +static struct dentry *__d_unalias(struct inode *inode, + struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL, *m2 = NULL; struct dentry *ret; @@ -1790,10 +2319,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: - d_move_locked(alias, dentry); + d_move(alias, dentry); ret = alias; out_err: - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); if (m2) mutex_unlock(m2); if (m1) @@ -1804,17 +2333,23 @@ out_err: /* * Prepare an anonymous dentry for life in the superblock's dentry tree as a * named dentry in place of the dentry to be replaced. + * returns with anon->d_lock held! */ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) { struct dentry *dparent, *aparent; - switch_names(dentry, anon); - swap(dentry->d_name.hash, anon->d_name.hash); + dentry_lock_for_move(anon, dentry); + + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin(&anon->d_seq); dparent = dentry->d_parent; aparent = anon->d_parent; + switch_names(dentry, anon); + swap(dentry->d_name.hash, anon->d_name.hash); + dentry->d_parent = (aparent == anon) ? 
dentry : aparent; list_del(&dentry->d_u.d_child); if (!IS_ROOT(dentry)) @@ -1829,6 +2364,13 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) else INIT_LIST_HEAD(&anon->d_u.d_child); + write_seqcount_end(&dentry->d_seq); + write_seqcount_end(&anon->d_seq); + + dentry_unlock_parents_for_move(anon, dentry); + spin_unlock(&dentry->d_lock); + + /* anon->d_lock still locked, returns locked */ anon->d_flags &= ~DCACHE_DISCONNECTED; } @@ -1846,14 +2388,15 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) BUG_ON(!d_unhashed(dentry)); - spin_lock(&dcache_lock); - if (!inode) { actual = dentry; __d_instantiate(dentry, NULL); - goto found_lock; + d_rehash(actual); + goto out_nolock; } + spin_lock(&inode->i_lock); + if (S_ISDIR(inode->i_mode)) { struct dentry *alias; @@ -1864,13 +2407,12 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) /* Is this an anonymous mountpoint that we could splice * into our tree? */ if (IS_ROOT(alias)) { - spin_lock(&alias->d_lock); __d_materialise_dentry(dentry, alias); __d_drop(alias); goto found; } /* Nope, but we must(!) 
avoid directory aliasing */ - actual = __d_unalias(dentry, alias); + actual = __d_unalias(inode, dentry, alias); if (IS_ERR(actual)) dput(alias); goto out_nolock; @@ -1881,15 +2423,14 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) actual = __d_instantiate_unique(dentry, inode); if (!actual) actual = dentry; - else if (unlikely(!d_unhashed(actual))) - goto shouldnt_be_hashed; + else + BUG_ON(!d_unhashed(actual)); -found_lock: spin_lock(&actual->d_lock); found: _d_rehash(actual); spin_unlock(&actual->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); out_nolock: if (actual == dentry) { security_d_instantiate(dentry, inode); @@ -1898,10 +2439,6 @@ out_nolock: iput(inode); return actual; - -shouldnt_be_hashed: - spin_unlock(&dcache_lock); - BUG(); } EXPORT_SYMBOL_GPL(d_materialise_unique); @@ -1921,14 +2458,13 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) } /** - * Prepend path string to a buffer - * + * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry (may be modified by this function) * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * - * Caller holds the dcache_lock. + * Caller holds the rename_lock. * * If path is not reachable from the supplied root, then the value of * root is changed (without modifying refcounts). 
@@ -1956,7 +2492,9 @@ static int prepend_path(const struct path *path, struct path *root, } parent = dentry->d_parent; prefetch(parent); + spin_lock(&dentry->d_lock); error = prepend_name(buffer, buflen, &dentry->d_name); + spin_unlock(&dentry->d_lock); if (!error) error = prepend(buffer, buflen, "/", 1); if (error) @@ -2012,9 +2550,9 @@ char *__d_path(const struct path *path, struct path *root, int error; prepend(&res, &buflen, "\0", 1); - spin_lock(&dcache_lock); + write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); - spin_unlock(&dcache_lock); + write_sequnlock(&rename_lock); if (error) return ERR_PTR(error); @@ -2076,12 +2614,12 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); - spin_lock(&dcache_lock); + write_seqlock(&rename_lock); tmp = root; error = path_with_deleted(path, &tmp, &res, &buflen); if (error) res = ERR_PTR(error); - spin_unlock(&dcache_lock); + write_sequnlock(&rename_lock); path_put(&root); return res; } @@ -2107,12 +2645,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); - spin_lock(&dcache_lock); + write_seqlock(&rename_lock); tmp = root; error = path_with_deleted(path, &tmp, &res, &buflen); if (!error && !path_equal(&tmp, &root)) error = prepend_unreachable(&res, &buflen); - spin_unlock(&dcache_lock); + write_sequnlock(&rename_lock); path_put(&root); if (error) res = ERR_PTR(error); @@ -2144,7 +2682,7 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, /* * Write full pathname from the root of the filesystem into the buffer. 
*/ -char *__dentry_path(struct dentry *dentry, char *buf, int buflen) +static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) { char *end = buf + buflen; char *retval; @@ -2158,10 +2696,13 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen) while (!IS_ROOT(dentry)) { struct dentry *parent = dentry->d_parent; + int error; prefetch(parent); - if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || - (prepend(&end, &buflen, "/", 1) != 0)) + spin_lock(&dentry->d_lock); + error = prepend_name(&end, &buflen, &dentry->d_name); + spin_unlock(&dentry->d_lock); + if (error != 0 || prepend(&end, &buflen, "/", 1) != 0) goto Elong; retval = end; @@ -2171,14 +2712,25 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen) Elong: return ERR_PTR(-ENAMETOOLONG); } -EXPORT_SYMBOL(__dentry_path); + +char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) +{ + char *retval; + + write_seqlock(&rename_lock); + retval = __dentry_path(dentry, buf, buflen); + write_sequnlock(&rename_lock); + + return retval; +} +EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(struct dentry *dentry, char *buf, int buflen) { char *p = NULL; char *retval; - spin_lock(&dcache_lock); + write_seqlock(&rename_lock); if (d_unlinked(dentry)) { p = buf + buflen; if (prepend(&p, &buflen, "//deleted", 10) != 0) @@ -2186,12 +2738,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) buflen++; } retval = __dentry_path(dentry, buf, buflen); - spin_unlock(&dcache_lock); + write_sequnlock(&rename_lock); if (!IS_ERR(retval) && p) *p = '/'; /* restore '/' overriden with '\0' */ return retval; Elong: - spin_unlock(&dcache_lock); return ERR_PTR(-ENAMETOOLONG); } @@ -2225,7 +2776,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; - spin_lock(&dcache_lock); + write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; struct path tmp = root; @@ 
-2234,7 +2785,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &tmp, &cwd, &buflen); - spin_unlock(&dcache_lock); + write_sequnlock(&rename_lock); if (error) goto out; @@ -2253,8 +2804,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) if (copy_to_user(buf, cwd, len)) error = -EFAULT; } - } else - spin_unlock(&dcache_lock); + } else { + write_sequnlock(&rename_lock); + } out: path_put(&pwd); @@ -2282,25 +2834,25 @@ out: int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { int result; - unsigned long seq; + unsigned seq; if (new_dentry == old_dentry) return 1; - /* - * Need rcu_readlock to protect against the d_parent trashing - * due to d_move - */ - rcu_read_lock(); do { /* for restarting inner loop in case of seq retry */ seq = read_seqbegin(&rename_lock); + /* + * Need rcu_readlock to protect against the d_parent trashing + * due to d_move + */ + rcu_read_lock(); if (d_ancestor(old_dentry, new_dentry)) result = 1; else result = 0; + rcu_read_unlock(); } while (read_seqretry(&rename_lock, seq)); - rcu_read_unlock(); return result; } @@ -2332,10 +2884,15 @@ EXPORT_SYMBOL(path_is_under); void d_genocide(struct dentry *root) { - struct dentry *this_parent = root; + struct dentry *this_parent; struct list_head *next; + unsigned seq; + int locked = 0; - spin_lock(&dcache_lock); + seq = read_seqbegin(&rename_lock); +again: + this_parent = root; + spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; resume: @@ -2343,21 +2900,62 @@ resume: struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - if (d_unhashed(dentry)||!dentry->d_inode) + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (d_unhashed(dentry) || !dentry->d_inode) { + spin_unlock(&dentry->d_lock); continue; + } if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + 
spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } - atomic_dec(&dentry->d_count); + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_count--; + } + spin_unlock(&dentry->d_lock); } if (this_parent != root) { - next = this_parent->d_u.d_child.next; - atomic_dec(&this_parent->d_count); - this_parent = this_parent->d_parent; + struct dentry *tmp; + struct dentry *child; + + tmp = this_parent->d_parent; + if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { + this_parent->d_flags |= DCACHE_GENOCIDE; + this_parent->d_count--; + } + rcu_read_lock(); + spin_unlock(&this_parent->d_lock); + child = this_parent; + this_parent = tmp; + spin_lock(&this_parent->d_lock); + /* might go back up the wrong parent if we have had a rename + * or deletion */ + if (this_parent != child->d_parent || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + rcu_read_unlock(); + next = child->d_u.d_child.next; goto resume; } - spin_unlock(&dcache_lock); + spin_unlock(&this_parent->d_lock); + if (!locked && read_seqretry(&rename_lock, seq)) + goto rename_retry; + if (locked) + write_sequnlock(&rename_lock); + return; + +rename_retry: + locked = 1; + write_seqlock(&rename_lock); + goto again; } /** @@ -2411,7 +3009,7 @@ static void __init dcache_init_early(void) dentry_hashtable = alloc_large_system_hash("Dentry cache", - sizeof(struct hlist_head), + sizeof(struct dcache_hash_bucket), dhash_entries, 13, HASH_EARLY, @@ -2420,16 +3018,13 @@ static void __init dcache_init_early(void) 0); for (loop = 0; loop < (1 << d_hash_shift); loop++) - INIT_HLIST_HEAD(&dentry_hashtable[loop]); + INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head); } static void __init dcache_init(void) { int loop; - percpu_counter_init(&nr_dentry, 0); - percpu_counter_init(&nr_dentry_unused, 0); - /* * A 
constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature @@ -2446,7 +3041,7 @@ static void __init dcache_init(void) dentry_hashtable = alloc_large_system_hash("Dentry cache", - sizeof(struct hlist_head), + sizeof(struct dcache_hash_bucket), dhash_entries, 13, 0, @@ -2455,7 +3050,7 @@ static void __init dcache_init(void) 0); for (loop = 0; loop < (1 << d_hash_shift); loop++) - INIT_HLIST_HEAD(&dentry_hashtable[loop]); + INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head); } /* SLAB cache for __getname() consumers */ diff --git a/fs/direct-io.c b/fs/direct-io.c index 85882f6ba5f..b044705eedd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error) } EXPORT_SYMBOL_GPL(dio_end_io); -static int +static void dio_bio_alloc(struct dio *dio, struct block_device *bdev, sector_t first_sector, int nr_vecs) { struct bio *bio; + /* + * bio_alloc() is guaranteed to return a bio when called with + * __GFP_WAIT and we request a valid number of vectors. 
+ */ bio = bio_alloc(GFP_KERNEL, nr_vecs); bio->bi_bdev = bdev; @@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, dio->bio = bio; dio->logical_offset_in_bio = dio->cur_page_fs_offset; - return 0; } /* @@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector) goto out; sector = start_sector << (dio->blkbits - 9); nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); + dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); dio->boundary = 0; out: return ret; diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 2dbb422e811..1897eb1b4b6 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -1,8 +1,7 @@ menuconfig DLM tristate "Distributed Lock Manager (DLM)" depends on EXPERIMENTAL && INET - depends on SYSFS && (IPV6 || IPV6=n) - select CONFIGFS_FS + depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP help A general purpose distributed lock manager for kernel or userspace diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 37a34c2c622..9c64ae9e4c1 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -63,6 +63,9 @@ #define NEEDED_RMEM (4*1024*1024) #define CONN_HASH_SIZE 32 +/* Number of messages to send before rescheduling */ +#define MAX_SEND_MSG_COUNT 25 + struct cbuf { unsigned int base; unsigned int len; @@ -108,6 +111,7 @@ struct connection { #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 +#define CF_APP_LIMITED 7 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ @@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk) { struct connection *con = sock2con(sk); - if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + if (!con) + return; + + clear_bit(SOCK_NOSPACE, &con->sock->flags); 
+ + if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { + con->sock->sk->sk_write_pending--; + clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags); + } + + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) queue_work(send_workqueue, &con->swork); } @@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con) struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; + int one = 1; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con) make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); log_print("connecting to %d", con->nodeid); + + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, O_NONBLOCK); @@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con, goto create_out; } + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); + result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&one, sizeof(one)); @@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con) const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset; + int count = 0; mutex_lock(&con->sock_mutex); if (con->sock == NULL) @@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con) ret = kernel_sendpage(con->sock, e->page, offset, len, msg_flags); if (ret == -EAGAIN || ret == 0) { + if (ret == -EAGAIN && + test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. 
+ */ + set_bit(SOCK_NOSPACE, &con->sock->flags); + con->sock->sk->sk_write_pending++; + } cond_resched(); goto out; } if (ret <= 0) goto send_error; } - /* Don't starve people filling buffers */ + + /* Don't starve people filling buffers */ + if (++count >= MAX_SEND_MSG_COUNT) { cond_resched(); + count = 0; + } spin_lock(&con->writequeue_lock); e->offset += ret; @@ -1430,20 +1468,19 @@ static void work_stop(void) static int work_start(void) { - int error; - recv_workqueue = create_workqueue("dlm_recv"); - error = IS_ERR(recv_workqueue); - if (error) { - log_print("can't start dlm_recv %d", error); - return error; + recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZEABLE, 0); + if (!recv_workqueue) { + log_print("can't start dlm_recv"); + return -ENOMEM; } - send_workqueue = create_singlethread_workqueue("dlm_send"); - error = IS_ERR(send_workqueue); - if (error) { - log_print("can't start dlm_send %d", error); + send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZEABLE, 0); + if (!send_workqueue) { + log_print("can't start dlm_send"); destroy_workqueue(recv_workqueue); - return error; + return -ENOMEM; } return 0; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index cbadc1bee6e..bfd8b680e64 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, BUG_ON(!crypt_stat || !crypt_stat->tfm || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n", + ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", crypt_stat->key_size); ecryptfs_dump_hex(crypt_stat->key, crypt_stat->key_size); @@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, rc = ecryptfs_derive_iv(extent_iv, crypt_stat, (extent_base + extent_offset)); if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to " - 
"derive IV for extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); goto out; } if (unlikely(ecryptfs_verbosity > 0)) { @@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, } rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; " + "rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " "encryption:\n"); ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); @@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page, rc = ecryptfs_derive_iv(extent_iv, crypt_stat, (extent_base + extent_offset)); if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to " - "derive IV for extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); goto out; } if (unlikely(ecryptfs_verbosity > 0)) { @@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page, } rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; " + "rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " "decryption:\n"); ecryptfs_dump_hex((char *)(page_address(page) @@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) } ecryptfs_printk(KERN_DEBUG, "Initializing cipher [%s]; strlen = [%d]; " - 
"key_size_bits = [%d]\n", + "key_size_bits = [%zd]\n", crypt_stat->cipher, (int)strlen(crypt_stat->cipher), crypt_stat->key_size << 3); if (crypt_stat->tfm) { diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 906e803f7f7..6fc4f319b55 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -44,12 +44,17 @@ */ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct dentry *dentry_save; struct vfsmount *vfsmount_save; int rc = 1; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) goto out; dentry_save = nd->path.dentry; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 413a3c48f0b..dbc84ed9633 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key) (((struct user_key_payload*)key->payload.data)->data); } -#define ECRYPTFS_SUPER_MAGIC 0xf15f #define ECRYPTFS_MAX_KEYSET_SIZE 1024 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 @@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) #define ecryptfs_printk(type, fmt, arg...) 
\ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); +__attribute__ ((format(printf, 1, 2))) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 91da02987bf..81e10e6a944 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - int rc; + ssize_t rc; struct dentry *lower_dentry; struct vfsmount *lower_vfsmount; struct file *file = iocb->ki_filp; @@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - if (!ecryptfs_inode_to_private(inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out_free; - } + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out_free; } - if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) - && !(file->f_flags & O_RDONLY)) { + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) + == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) { rc = -EPERM; printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " "file must hence be opened RO\n", __func__); @@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } } mutex_unlock(&crypt_stat->cs_mutex); - ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] " - "size: [0x%.16x]\n", inode, inode->i_ino, - i_size_read(inode)); + ecryptfs_printk(KERN_DEBUG, 
"inode w/ addr = [0x%p], i_ino = " + "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + (unsigned long long)i_size_read(inode)); goto out; out_free: kmem_cache_free(ecryptfs_file_info_cache, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 9d1a22d6276..bd33f87a190 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context; rc = [%d]\n", rc); goto out; } - if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out; - } + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; } rc = ecryptfs_write_metadata(ecryptfs_dentry); if (rc) { @@ -260,7 +258,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, ecryptfs_dentry->d_parent)); lower_inode = lower_dentry->d_inode; fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); - BUG_ON(!atomic_read(&lower_dentry->d_count)); + BUG_ON(!lower_dentry->d_count); ecryptfs_set_dentry_private(ecryptfs_dentry, kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL)); @@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, rc = -ENOMEM; goto out; } - if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out_free_kmem; - } 
+ rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out_free_kmem; } crypt_stat = &ecryptfs_inode_to_private( ecryptfs_dentry->d_inode)->crypt_stat; @@ -441,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, struct qstr lower_name; int rc = 0; - ecryptfs_dentry->d_op = &ecryptfs_dops; if ((ecryptfs_dentry->d_name.len == 1 && !strcmp(ecryptfs_dentry->d_name.name, ".")) || (ecryptfs_dentry->d_name.len == 2 @@ -454,7 +449,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, lower_name.hash = ecryptfs_dentry->d_name.hash; if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, - &lower_name); + lower_dir_dentry->d_inode, &lower_name); if (rc < 0) goto out_d_drop; } @@ -489,7 +484,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, lower_name.hash = full_name_hash(lower_name.name, lower_name.len); if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, - &lower_name); + lower_dir_dentry->d_inode, &lower_name); if (rc < 0) goto out_d_drop; } @@ -980,8 +975,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) } static int -ecryptfs_permission(struct inode *inode, int mask) +ecryptfs_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; return inode_permission(ecryptfs_inode_to_lower(inode), mask); } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index b1f6858a522..c1436cff6f2 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -59,7 +59,7 @@ static int process_request_key_err(long err_code) break; default: ecryptfs_printk(KERN_WARNING, "Unknown error code: " - 
"[0x%.16x]\n", err_code); + "[0x%.16lx]\n", err_code); rc = -EINVAL; } return rc; @@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size, } else { rc = -EINVAL; ecryptfs_printk(KERN_WARNING, - "Unsupported packet size: [%d]\n", size); + "Unsupported packet size: [%zd]\n", size); } return rc; } @@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, auth_tok->session_key.decrypted_key_size); crypt_stat->flags |= ECRYPTFS_KEY_VALID; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n", + ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n", crypt_stat->key_size); ecryptfs_dump_hex(crypt_stat->key, crypt_stat->key_size); @@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { ecryptfs_printk(KERN_ERR, "Expected " "signature of size [%d]; " - "read size [%d]\n", + "read size [%zd]\n", ECRYPTFS_SIG_SIZE, tag_11_contents_size); rc = -EIO; @@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, goto out_wipe_list; break; default: - ecryptfs_printk(KERN_DEBUG, "No packet at offset " - "[%d] of the file header; hex value of " + ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " + "of the file header; hex value of " "character is [0x%.2x]\n", i, src[i]); next_packet_is_auth_tok_packet = 0; } @@ -1864,8 +1864,8 @@ found_matching_auth_tok: "session key for authentication token with sig " "[%.*s]; rc = [%d]. 
Removing auth tok " "candidate from the list and searching for " - "the next match.\n", candidate_auth_tok_sig, - ECRYPTFS_SIG_SIZE_HEX, rc); + "the next match.\n", ECRYPTFS_SIG_SIZE_HEX, + candidate_auth_tok_sig, rc); list_for_each_entry_safe(auth_tok_list_item, auth_tok_list_item_tmp, &auth_tok_list, list) { @@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, if (encrypted_session_key_valid) { ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " "using auth_tok->session_key.encrypted_key, " - "where key_rec->enc_key_size = [%d]\n", + "where key_rec->enc_key_size = [%zd]\n", key_rec->enc_key_size); memcpy(key_rec->enc_key, auth_tok->session_key.encrypted_key, @@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, if (rc < 1 || rc > 2) { ecryptfs_printk(KERN_ERR, "Error generating scatterlist " "for crypt_stat session key; expected rc = 1; " - "got rc = [%d]. key_rec->enc_key_size = [%d]\n", + "got rc = [%d]. key_rec->enc_key_size = [%zd]\n", rc, key_rec->enc_key_size); rc = -ENOMEM; goto out; @@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, ecryptfs_printk(KERN_ERR, "Error generating scatterlist " "for crypt_stat encrypted session key; " "expected rc = 1; got rc = [%d]. 
" - "key_rec->enc_key_size = [%d]\n", rc, + "key_rec->enc_key_size = [%zd]\n", rc, key_rec->enc_key_size); rc = -ENOMEM; goto out; @@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, goto out; } rc = 0; - ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n", + ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", crypt_stat->key_size); rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, (*key_rec).enc_key_size); @@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, } ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); if (ecryptfs_verbosity > 0) { - ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n", + ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n", key_rec->enc_key_size); ecryptfs_dump_hex(key_rec->enc_key, key_rec->enc_key_size); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index a9dbd62518e..758323a0f09 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -36,6 +36,7 @@ #include <linux/parser.h> #include <linux/fs_stack.h> #include <linux/slab.h> +#include <linux/magic.h> #include "ecryptfs_kernel.h" /** @@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) return rc; } -/** - * ecryptfs_interpose - * @lower_dentry: Existing dentry in the lower filesystem - * @dentry: ecryptfs' dentry - * @sb: ecryptfs's super_block - * @flags: flags to govern behavior of interpose procedure - * - * Interposes upper and lower dentries. 
- * - * Returns zero on success; non-zero otherwise - */ -int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, - struct super_block *sb, u32 flags) +static struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) { - struct inode *lower_inode; struct inode *inode; int rc = 0; - lower_inode = lower_dentry->d_inode; if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { rc = -EXDEV; goto out; @@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, if (special_file(lower_inode->i_mode)) init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); - dentry->d_op = &ecryptfs_dops; fsstack_copy_attr_all(inode, lower_inode); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); + return inode; +out: + return ERR_PTR(rc); +} + +/** + * ecryptfs_interpose + * @lower_dentry: Existing dentry in the lower filesystem + * @dentry: ecryptfs' dentry + * @sb: ecryptfs's super_block + * @flags: flags to govern behavior of interpose procedure + * + * Interposes upper and lower dentries. 
+ * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, + struct super_block *sb, u32 flags) +{ + struct inode *lower_inode = lower_dentry->d_inode; + struct inode *inode = ecryptfs_get_inode(lower_inode, sb); + if (IS_ERR(inode)) + return PTR_ERR(inode); if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) d_add(dentry, inode); else d_instantiate(dentry, inode); -out: - return rc; + return 0; } enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, @@ -492,59 +501,11 @@ struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; /** - * ecryptfs_read_super - * @sb: The ecryptfs super block - * @dev_name: The path to mount over - * - * Read the super block of the lower filesystem, and use - * ecryptfs_interpose to create our initial inode and super block - * struct. - */ -static int ecryptfs_read_super(struct super_block *sb, const char *dev_name) -{ - struct path path; - int rc; - - rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); - if (rc) { - ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); - goto out; - } - if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { - rc = -EINVAL; - printk(KERN_ERR "Mount on filesystem of type " - "eCryptfs explicitly disallowed due to " - "known incompatibilities\n"); - goto out_free; - } - ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); - sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; - sb->s_blocksize = path.dentry->d_sb->s_blocksize; - ecryptfs_set_dentry_lower(sb->s_root, path.dentry); - ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt); - rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0); - if (rc) - goto out_free; - rc = 0; - goto out; -out_free: - path_put(&path); -out: - return rc; -} - -/** * ecryptfs_get_sb * @fs_type * @flags * @dev_name: The path to mount over * @raw_data: The options passed into the kernel - * - * The whole ecryptfs_get_sb process is broken into 3 functions: - * 
ecryptfs_parse_options(): handle options passed to ecryptfs, if any - * ecryptfs_read_super(): this accesses the lower filesystem and uses - * ecryptfs_interpose to perform most of the linking - * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) */ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) @@ -553,6 +514,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags struct ecryptfs_sb_info *sbi; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; + struct inode *inode; + struct path path; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); @@ -575,10 +538,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_flags = flags; rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); - if (rc) { - deactivate_locked_super(s); - goto out; - } + if (rc) + goto out1; ecryptfs_set_superblock_private(s, sbi); s->s_bdi = &sbi->bdi; @@ -586,34 +547,55 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; s->s_op = &ecryptfs_sops; + s->s_d_op = &ecryptfs_dops; - rc = -ENOMEM; - s->s_root = d_alloc(NULL, &(const struct qstr) { - .hash = 0,.name = "/",.len = 1}); + err = "Reading sb failed"; + rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); + if (rc) { + ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); + goto out1; + } + if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { + rc = -EINVAL; + printk(KERN_ERR "Mount on filesystem of type " + "eCryptfs explicitly disallowed due to " + "known incompatibilities\n"); + goto out_free; + } + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); + s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + s->s_blocksize = path.dentry->d_sb->s_blocksize; + s->s_magic = ECRYPTFS_SUPER_MAGIC; + + inode = 
ecryptfs_get_inode(path.dentry->d_inode, s); + rc = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_free; + + s->s_root = d_alloc_root(inode); if (!s->s_root) { - deactivate_locked_super(s); - goto out; + iput(inode); + rc = -ENOMEM; + goto out_free; } - s->s_root->d_op = &ecryptfs_dops; - s->s_root->d_sb = s; - s->s_root->d_parent = s->s_root; + rc = -ENOMEM; root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!root_info) { - deactivate_locked_super(s); - goto out; - } + if (!root_info) + goto out_free; + /* ->kill_sb() will take care of root_info */ ecryptfs_set_dentry_private(s->s_root, root_info); + ecryptfs_set_dentry_lower(s->s_root, path.dentry); + ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt); + s->s_flags |= MS_ACTIVE; - rc = ecryptfs_read_super(s, dev_name); - if (rc) { - deactivate_locked_super(s); - err = "Reading sb failed"; - goto out; - } return dget(s->s_root); +out_free: + path_put(&path); +out1: + deactivate_locked_super(s); out: if (sbi) { ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); @@ -828,9 +810,10 @@ static int __init ecryptfs_init(void) ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " "larger than the host's page size, and so " "eCryptfs cannot run on this system. 
The " - "default eCryptfs extent size is [%d] bytes; " - "the page size is [%d] bytes.\n", - ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE); + "default eCryptfs extent size is [%u] bytes; " + "the page size is [%lu] bytes.\n", + ECRYPTFS_DEFAULT_EXTENT_SIZE, + (unsigned long)PAGE_CACHE_SIZE); goto out; } rc = ecryptfs_init_kmem_caches(); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index b1d82756544..cc64fca89f8 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting " - "page (upper index [0x%.16x])\n", page->index); + "page (upper index [0x%.16lx])\n", page->index); ClearPageUptodate(page); goto out; } @@ -237,7 +237,7 @@ out: ClearPageUptodate(page); else SetPageUptodate(page); - ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n", + ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n", page->index); unlock_page(page); return rc; @@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file, return -ENOMEM; *pagep = page; + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(mapping->host)->crypt_stat; @@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file, SetPageUptodate(page); } } else { - rc = ecryptfs_decrypt_page(page); - if (rc) { - printk(KERN_ERR "%s: Error decrypting page " - "at index [%ld]; rc = [%d]\n", - __func__, page->index, rc); - ClearPageUptodate(page); - goto out; + if (prev_page_end_size + >= i_size_read(page->mapping->host)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + } else { + rc = ecryptfs_decrypt_page(page); + if (rc) { + printk(KERN_ERR "%s: Error decrypting " + "page at index [%ld]; " + "rc = [%d]\n", + __func__, page->index, rc); + ClearPageUptodate(page); + goto out; + } } 
SetPageUptodate(page); } } - prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. */ if (index != 0) { @@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file, } else ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" - "(page w/ index = [0x%.16x], to = [%d])\n", index, to); + "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, to); @@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file, rc = fill_zeros_to_end_of_page(page, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " - "zeros in page with index = [0x%.16x]\n", index); + "zeros in page with index = [0x%.16lx]\n", index); goto out; } rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " - "index [0x%.16x])\n", index); + "index [0x%.16lx])\n", index); goto out; } if (pos + copied > i_size_read(ecryptfs_inode)) { i_size_write(ecryptfs_inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " - "[0x%.16x]\n", i_size_read(ecryptfs_inode)); + "[0x%.16llx]\n", + (unsigned long long)i_size_read(ecryptfs_inode)); } rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); if (rc) diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 2720178b771..3042fe123a3 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -62,6 +62,16 @@ out: return inode; } +static void ecryptfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ecryptfs_inode_info *inode_info; + inode_info = ecryptfs_inode_to_private(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ecryptfs_inode_info_cache, inode_info); +} + /** * ecryptfs_destroy_inode * @inode: The 
ecryptfs inode @@ -88,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode) } } ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); - kmem_cache_free(ecryptfs_inode_info_cache, inode_info); + call_rcu(&inode->i_rcu, ecryptfs_i_callback); } /** diff --git a/fs/efs/super.c b/fs/efs/super.c index 5073a07652c..0f31acb0131 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void efs_destroy_inode(struct inode *inode) +static void efs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } +static void efs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, efs_i_callback); +} + static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8cf07242067..cc8a9b7d606 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -217,7 +217,7 @@ struct ep_send_events_data { * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ -static int max_user_watches __read_mostly; +static long max_user_watches __read_mostly; /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). 
@@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly; #include <linux/sysctl.h> -static int zero; +static long zero; +static long long_max = LONG_MAX; ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, - .maxlen = sizeof(int), + .maxlen = sizeof(max_user_watches), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, + .extra2 = &long_max, }, { } }; @@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); - atomic_dec(&ep->user->epoll_watches); + atomic_long_dec(&ep->user->epoll_watches); return 0; } @@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, { int error, revents, pwake = 0; unsigned long flags; + long user_watches; struct epitem *epi; struct ep_pqueue epq; - if (unlikely(atomic_read(&ep->user->epoll_watches) >= - max_user_watches)) + user_watches = atomic_long_read(&ep->user->epoll_watches); + if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) return -ENOMEM; @@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, spin_unlock_irqrestore(&ep->lock, flags); - atomic_inc(&ep->user->epoll_watches); + atomic_long_inc(&ep->user->epoll_watches); /* We have to call this outside the lock */ if (pwake) @@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void) */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; + BUG_ON(max_user_watches < 0); /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 79c3ae6e045..8c6c4669b38 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct 
super_block *sb) return &oi->vfs_inode; } +static void exofs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); +} + /* * Remove an inode from the cache */ static void exofs_destroy_inode(struct inode *inode) { - kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); + call_rcu(&inode->i_rcu, exofs_i_callback); } /* diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 51b304056f1..4b6825740dd 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *result, void *context) { struct dentry *dentry, *toput = NULL; + struct inode *inode; if (acceptable(context, result)) return result; - spin_lock(&dcache_lock); - list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { - dget_locked(dentry); - spin_unlock(&dcache_lock); + inode = result->d_inode; + spin_lock(&inode->i_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + dget(dentry); + spin_unlock(&inode->i_lock); if (toput) dput(toput); if (dentry != result && acceptable(context, dentry)) { dput(result); return dentry; } - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); toput = dentry; } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); if (toput) dput(toput); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 2bcc0431bad..7b4180554a6 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -232,10 +232,17 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) } int -ext2_check_acl(struct inode *inode, int mask) +ext2_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return 
PTR_ERR(acl); if (acl) { diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3ff6cbb9ac4..c939b7b1209 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_check_acl (struct inode *, int); +extern int ext2_check_acl (struct inode *, int, unsigned int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 2709b34206a..47cda410b54 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -28,21 +28,30 @@ typedef struct ext2_dir_entry_2 ext2_dirent; +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ static inline unsigned ext2_rec_len_from_disk(__le16 dlen) { unsigned len = le16_to_cpu(dlen); +#if (PAGE_CACHE_SIZE >= 65536) if (len == EXT2_MAX_REC_LEN) return 1 << 16; +#endif return len; } static inline __le16 ext2_rec_len_to_disk(unsigned len) { +#if (PAGE_CACHE_SIZE >= 65536) if (len == (1 << 16)) return cpu_to_le16(EXT2_MAX_REC_LEN); else BUG_ON(len > (1 << 16)); +#endif return cpu_to_le16(len); } @@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet) p = (ext2_dirent *)(kaddr + offs); rec_len = ext2_rec_len_from_disk(p->rec_len); - if (rec_len < EXT2_DIR_REC_LEN(1)) + if (unlikely(rec_len < EXT2_DIR_REC_LEN(1))) goto Eshort; - if (rec_len & 3) + if (unlikely(rec_len & 3)) goto Ealign; - if (rec_len < EXT2_DIR_REC_LEN(p->name_len)) + if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len))) goto Enamelen; - if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))) goto Espan; - if (le32_to_cpu(p->inode) > max_inumber) + if (unlikely(le32_to_cpu(p->inode) > max_inumber)) goto Einumber; } if (offs != limit) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 
f8aecd2e329..2e1d8341d82 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str inode = NULL; if (ino) { inode = ext2_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { + if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ESTALE) { ext2_error(dir->i_sb, __func__, "deleted inode referenced: %lu", diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d89e0b6a2d7..7731695e65d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); -void ext2_error (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext2_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; @@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function, } va_start(args, fmt); - printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); if (test_opt(sb, ERRORS_PANIC)) @@ -76,12 +81,16 @@ void ext2_error (struct super_block * sb, const char * function, void ext2_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT2-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); } @@ -161,11 +170,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ext2_destroy_inode(struct inode *inode) +static void ext2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } +static void ext2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ext2_i_callback); +} + static void init_once(void *foo) { struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index f84700be327..c2e4dce984d 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - /* Check the remaining name entries */ - while (!IS_LAST_ENTRY(entry)) { - struct ext2_xattr_entry *next = - EXT2_XATTR_NEXT(entry); - if ((char *)next >= end) - goto bad_block; - entry = next; - } if (ext2_xattr_cache_insert(bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; @@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb) /* * ext2_xattr_set() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. 
The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 8a11fe21218..e4fa49e6c53 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -240,10 +240,17 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext3_check_acl(struct inode *inode, int mask) +ext3_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 597334626de..5faf8048e90 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_check_acl (struct inode *, int); +extern int ext3_check_acl (struct inode *, int, unsigned int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index b3db2264942..045995c8ce5 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -20,6 +20,7 @@ #include <linux/ext3_jbd.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> /* * balloc.c contains the blocks allocation and deallocation routines @@ -39,6 +40,21 @@ #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +/* + * Calculate the block group number and offset, given a block number + */ +static void ext3_get_group_no_and_offset(struct super_block *sb, + ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + if (offsetp) + *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); + if (blockgrpp) + *blockgrpp 
= blocknr / EXT3_BLOCKS_PER_GROUP(sb); +} + /** * ext3_get_group_desc() -- load group descriptor from disk * @sb: super block @@ -1885,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) return ext3_bg_num_gdb_meta(sb,group); } + +/** + * ext3_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: allocation group to trim + * @start: first group block to examine + * @max: last group block to examine + * @gdp: allocation group description structure + * @minblocks: minimum extent block count + * + * ext3_trim_all_free walks through group's block bitmap searching for free + * blocks. When the free block is found, it tries to allocate this block and + * consequent free block to get the biggest free extent possible, until it + * reaches any used block. Then issue a TRIM command on this extent and free + * the extent in the block bitmap. This is done until whole group is scanned. + */ +ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) +{ + handle_t *handle; + ext3_grpblk_t next, free_blocks, bit, freed, count = 0; + ext3_fsblk_t discard_block; + struct ext3_sb_info *sbi; + struct buffer_head *gdp_bh, *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + int err = 0, ret = 0; + + /* + * We will update one block bitmap, and one group descriptor + */ + handle = ext3_journal_start_sb(sb, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + bitmap_bh = read_block_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto err_out; + + gdp = ext3_get_group_desc(sb, group, &gdp_bh); + if (!gdp) { + err = -EIO; + goto err_out; + } + + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto err_out; + + 
free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + sbi = EXT3_SB(sb); + + /* Walk through the whole group */ + while (start < max) { + start = bitmap_search_next_usable_block(start, bitmap_bh, max); + if (start < 0) + break; + next = start; + + /* + * Allocate contiguous free extents by setting bits in the + * block bitmap + */ + while (next < max + && claim_block(sb_bgl_lock(sbi, group), + next, bitmap_bh)) { + next++; + } + + /* We did not claim any blocks */ + if (next == start) + continue; + + discard_block = (ext3_fsblk_t)start + + ext3_group_first_block_no(sb, group); + + /* Update counters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, start - next); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + + /* Do not issue a TRIM on extents smaller than minblocks */ + if ((next - start) < minblocks) + goto free_extent; + + /* Send the TRIM command down to the device */ + err = sb_issue_discard(sb, discard_block, next - start, + GFP_NOFS, 0); + count += (next - start); +free_extent: + freed = 0; + + /* + * Clear bits in the bitmap + */ + for (bit = start; bit < next; bit++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), + bit, bitmap_bh->b_data)) { + ext3_error(sb, __func__, + "bit already cleared for block "E3FSBLK, + (unsigned long)bit); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + freed++; + } + } + + /* Update couters */ + spin_lock(sb_bgl_lock(sbi, group)); + le16_add_cpu(&gdp->bg_free_blocks_count, freed); + spin_unlock(sb_bgl_lock(sbi, group)); + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + + start = next; + if (err < 0) { + if (err != -EOPNOTSUPP) + ext3_warning(sb, __func__, "Discard command " + "returned error %d\n", err); + break; + } + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + cond_resched(); + + /* No more suitable extents */ + if 
((free_blocks - count) < minblocks) + break; + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + ret = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = ret; + + /* And the group descriptor block */ + BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!err) + err = ret; + + ext3_debug("trimmed %d blocks in the group %d\n", + count, group); + +err_out: + if (err) + count = err; + ext3_journal_stop(handle); + brelse(bitmap_bh); + + return count; +} + +/** + * ext3_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @start: First Byte to trim + * @len: number of Bytes to trim from start + * @minlen: minimum extent length in Bytes + * + * ext3_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext3_trim_all_free function + * is invoked to trim all free space. + */ +int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + ext3_grpblk_t last_block, first_block, free_blocks; + unsigned long first_group, last_group; + unsigned long group, ngroups; + struct ext3_group_desc *gdp; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + uint64_t start, len, minlen, trimmed; + ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + len = range->len >> sb->s_blocksize_bits; + minlen = range->minlen >> sb->s_blocksize_bits; + trimmed = 0; + + if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) + return -EINVAL; + if (start >= max_blks) + goto out; + if (start < le32_to_cpu(es->s_first_data_block)) { + len -= le32_to_cpu(es->s_first_data_block) - start; + start = le32_to_cpu(es->s_first_data_block); + } + if (start + len > max_blks) + len = max_blks - start; + + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* Determine first and last group to examine based on start and len */ + 
ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, + &first_group, &first_block); + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len), + &last_group, &last_block); + last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; + last_block = EXT3_BLOCKS_PER_GROUP(sb); + + if (first_group > last_group) + return -EINVAL; + + for (group = first_group; group <= last_group; group++) { + gdp = ext3_get_group_desc(sb, group, NULL); + if (!gdp) + break; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks < minlen) + continue; + + if (len >= EXT3_BLOCKS_PER_GROUP(sb)) + len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); + else + last_block = first_block + len; + + ret = ext3_trim_all_free(sb, group, first_block, + last_block, minlen); + if (ret < 0) + break; + + trimmed += ret; + first_block = 0; + } + + if (ret >= 0) + ret = 0; + +out: + range->len = trimmed * sb->s_blocksize; + + return ret; +} diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index e2e72c367cf..34f0a072b93 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir, const char * error_msg = NULL; const int rlen = ext3_rec_len_from_disk(de->rec_len); - if (rlen < EXT3_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(le32_to_cpu(de->inode) > + 
le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; - if (error_msg != NULL) + if (unlikely(error_msg != NULL)) ext3_error (dir->i_sb, function, "bad entry in directory #%lu: %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, (unsigned long) le32_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index a9580617edd..ae94f6d949f 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2145,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + if (ext3_journal_dirty_metadata(handle, bh)) + return; } ext3_mark_inode_dirty(handle, inode); truncate_restart_transaction(handle, inode); if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ext3_journal_get_write_access(handle, bh); + if (ext3_journal_get_write_access(handle, bh)) + return; } } diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 88974814783..fc080dd561f 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -276,7 +276,29 @@ group_add_out: mnt_drop_write(filp->f_path.mnt); return err; } + case FITRIM: { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext3_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } default: return -ENOTTY; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index bce9dce639b..b27ba71810e 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir, struct buffer_head * bh_use[NAMEI_RA_SIZE]; 
struct buffer_head * bh, *ret = NULL; unsigned long start, block, b; + const u8 *name = entry->name; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ int ra_ptr = 0; /* Current index into readahead @@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir, namelen = entry->len; if (namelen > EXT3_NAME_LEN) return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == 0)) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } if (is_dx(dir)) { bh = ext3_dx_find_entry(dir, entry, res_dir, &err); /* @@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, struct qstr *entry, struct ext3_dir_entry_2 **res_dir, int *err) { - struct super_block * sb; + struct super_block *sb = dir->i_sb; struct dx_hash_info hinfo; - u32 hash; struct dx_frame frames[2], *frame; - struct ext3_dir_entry_2 *de, *top; struct buffer_head *bh; unsigned long block; int retval; - int namelen = entry->len; - const u8 *name = entry->name; - sb = dir->i_sb; - /* NFS may look up ".." 
- look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) { - if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) - return NULL; - } else { - frame = frames; - frame->bh = NULL; /* for dx_release() */ - frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ - dx_set_block(frame->at, 0); /* dx_root block is 0 */ - } - hash = hinfo.hash; + if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) + return NULL; do { block = dx_get_block(frame->at); if (!(bh = ext3_bread (NULL,dir, block, 0, err))) goto errout; - de = (struct ext3_dir_entry_2 *) bh->b_data; - top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) { - int off = (block << EXT3_BLOCK_SIZE_BITS(sb)) - + ((char *) de - bh->b_data); - - if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) { - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto errout; - } - if (ext3_match(namelen, name, de)) { - *res_dir = de; - dx_release(frames); - return bh; - } + retval = search_dirblock(bh, dir, entry, + block << EXT3_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { + dx_release(frames); + return bh; } - brelse (bh); + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + /* Check to see if we should continue to search */ - retval = ext3_htree_next_block(dir, hash, frame, + retval = ext3_htree_next_block(dir, hinfo.hash, frame, frames, NULL); if (retval < 0) { ext3_warning(sb, __func__, @@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str return ERR_PTR(-EIO); } inode = ext3_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { + if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ESTALE) { ext3_error(dir->i_sb, __func__, "deleted inode referenced: %lu", @@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - 
ext3_journal_dirty_metadata(handle, frames[0].bh); + err = ext3_journal_dirty_metadata(handle, frames[0].bh); + if (err) + goto journal_error; } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle, if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) return -EIO; if (de == de_del) { + int err; + BUFFER_TRACE(bh, "get_write_access"); - ext3_journal_get_write_access(handle, bh); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + if (pde) pde->rec_len = ext3_rec_len_to_disk( ext3_rec_len_from_disk(pde->rec_len) + @@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) { +journal_error: + ext3_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext3_rec_len_from_disk(de->rec_len); @@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) { handle_t *handle; struct inode * inode; - struct buffer_head * dir_block; + struct buffer_head * dir_block = NULL; struct ext3_dir_entry_2 * de; int err, retries = 0; @@ -1790,15 +1793,14 @@ retry: inode->i_fop = &ext3_dir_operations; inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - drop_nlink(inode); /* is this nlink == 0? 
*/ - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; - } + if (!dir_block) + goto out_clear_inode; + BUFFER_TRACE(dir_block, "get_write_access"); - ext3_journal_get_write_access(handle, dir_block); + err = ext3_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; + de = (struct ext3_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; @@ -1814,11 +1816,16 @@ retry: ext3_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); + err = ext3_journal_dirty_metadata(handle, dir_block); + if (err) + goto out_clear_inode; + + err = ext3_mark_inode_dirty(handle, inode); + if (!err) + err = ext3_add_entry (handle, dentry, inode); + if (err) { +out_clear_inode: inode->i_nlink = 0; unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); @@ -1827,10 +1834,14 @@ retry: } inc_nlink(dir); ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); + err = ext3_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + d_instantiate(dentry, inode); unlock_new_inode(inode); out_stop: + brelse(dir_block); ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; } else { BUFFER_TRACE(new_bh, "get write access"); - ext3_journal_get_write_access(handle, new_bh); + retval = ext3_journal_get_write_access(handle, new_bh); + if (retval) + goto journal_error; new_de->inode = cpu_to_le32(old_inode->i_ino); if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) @@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, 
new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, new_bh); + retval = ext3_journal_dirty_metadata(handle, new_bh); + if (retval) + goto journal_error; brelse(new_bh); new_bh = NULL; } @@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, ext3_update_dx_flag(old_dir); if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); - ext3_journal_get_write_access(handle, dir_bh); + retval = ext3_journal_get_write_access(handle, dir_bh); + if (retval) + goto journal_error; PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_bh); + retval = ext3_journal_dirty_metadata(handle, dir_bh); + if (retval) { +journal_error: + ext3_std_error(new_dir->i_sb, retval); + goto end_rename; + } drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index e746d30b123..108b142e11e 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext3_journal_dirty_metadata(handle, gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } ext3_set_bit(bit, bh->b_data); brelse(gdb); } @@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(gdb); goto exit_bh; } - ext3_journal_dirty_metadata(handle, gdb); + err = ext3_journal_dirty_metadata(handle, gdb); + if (err) { + brelse(gdb); + goto exit_bh; + } ext3_set_bit(bit, bh->b_data); brelse(gdb); } @@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(it); goto exit_bh; } - ext3_journal_dirty_metadata(handle, it); 
+ err = ext3_journal_dirty_metadata(handle, it); + if (err) { + brelse(it); + goto exit_bh; + } brelse(it); ext3_set_bit(bit, bh->b_data); } @@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), bh->b_data); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto exit_bh; brelse(bh); /* Mark unused entries in inode bitmap used */ @@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), bh->b_data); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); exit_bh: brelse(bh); @@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; - ext3_journal_dirty_metadata(handle, dind); + err = ext3_journal_dirty_metadata(handle, dind); + if (err) + goto exit_group_desc; brelse(dind); + dind = NULL; inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; - ext3_mark_iloc_dirty(handle, inode, &iloc); + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto exit_group_desc; memset((*primary)->b_data, 0, sb->s_blocksize); - ext3_journal_dirty_metadata(handle, *primary); + err = ext3_journal_dirty_metadata(handle, *primary); + if (err) + goto exit_group_desc; o_group_desc = EXT3_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto exit_inode; return 0; +exit_group_desc: + kfree(n_group_desc); exit_inode: //ext3_journal_release_buffer(handle, iloc.bh); brelse(iloc.bh); @@ 
-706,16 +731,20 @@ static void update_backups(struct super_block *sb, } ext3_debug("update metadata backup %#04lx\n", (unsigned long)bh->b_blocknr); - if ((err = ext3_journal_get_write_access(handle, bh))) + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); break; + } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext3_journal_dirty_metadata(handle, bh); + err = ext3_journal_dirty_metadata(handle, bh); brelse(bh); + if (err) + break; } if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext3_journal_dirty_metadata(handle, primary); + err = ext3_journal_dirty_metadata(handle, primary); + if (err) + goto exit_journal; /* Update the reserved block counts only once the new group is * active. */ @@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) percpu_counter_add(&sbi->s_freeinodes_counter, EXT3_INODES_PER_GROUP(sb)); - ext3_journal_dirty_metadata(handle, sbi->s_sbh); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: mutex_unlock(&sbi->s_resize_lock); @@ -1064,8 +1095,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); mutex_unlock(&EXT3_SB(sb)->s_resize_lock); + if (err) { + ext3_warning(sb, __func__, + "error %d on journal dirty metadata", err); + ext3_journal_stop(handle); + goto exit_put; + } ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c 
b/fs/ext3/super.c index acf8695fa8f..85c8cc8f247 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -143,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn, void ext3_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT3-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); } @@ -195,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb) sb->s_id); } -void ext3_error (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); ext3_handle_error(sb); @@ -274,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function, * case we take the easy way out and panic immediately. */ -void ext3_abort (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_abort(struct super_block *sb, const char *function, + const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); if (test_opt(sb, ERRORS_PANIC)) @@ -300,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function, journal_abort(EXT3_SB(sb)->s_journal, -EIO); } -void ext3_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext3_warning(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", - sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } @@ -346,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -363,8 +381,7 @@ fail: */ static int ext3_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext3_blkdev_remove(struct ext3_sb_info *sbi) @@ -479,6 +496,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + static void ext3_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT3_I(inode)->i_orphan))) { @@ -489,7 
+513,7 @@ static void ext3_destroy_inode(struct inode *inode) false); dump_stack(); } - kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); + call_rcu(&inode->i_rcu, ext3_i_callback); } static void init_once(void *foo) @@ -730,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot); static int ext3_mark_dquot_dirty(struct dquot *dquot); static int ext3_write_info(struct super_block *sb, int type); static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path); + struct path *path); static int ext3_quota_on_mount(struct super_block *sb, int type); static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1841,13 +1865,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - if (generic_check_addressable(sb->s_blocksize_bits, - le32_to_cpu(es->s_blocks_count))) { + err = generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count)); + if (err) { ext3_msg(sb, KERN_ERR, "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) ext3_msg(sb, KERN_ERR, "error: CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -2135,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext3_msg(sb, KERN_ERR, - "error: failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -2290,7 +2309,7 @@ static int ext3_load_journal(struct super_block *sb, EXT3_SB(sb)->s_journal = journal; ext3_clear_journal_err(sb, es); - if (journal_devnum && + if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); @@ -2858,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type) * Standard function 
to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { - path_put(&path); + if (path->mnt->mnt_sb != sb) return -EXDEV; - } /* Journaling quota? */ if (EXT3_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) ext3_msg(sb, KERN_WARNING, "warning: Quota file not on filesystem root. " "Journaled quota will not work."); @@ -2888,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext3_should_journal_data(path.dentry->d_inode)) { + if (ext3_should_journal_data(path->dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... @@ -2896,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, journal_lock_updates(EXT3_SB(sb)->s_journal); err = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err) { - path_put(&path); + if (err) return err; - } } - err = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - return err; + return dquot_quota_on(sb, type, format_id, path); } /* Read data from quotafile - avoid pagecache and such because we cannot afford diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index e69dc6dfaa8..32e6cc23bd9 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext3_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. 
Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 5e2ed4504ea..e0270d1f8d8 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_check_acl(struct inode *inode, int mask) +ext4_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9d843d5deac..dec821168fd 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_check_acl(struct inode *, int); +extern int ext4_check_acl(struct inode *, int, unsigned int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 14c3af26c67..adf96b82278 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -592,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * Account for the allocated meta blocks. We will never * fail EDQUOT for metadata, but we do account for it.
*/ - if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { + if (!(*errp) && + ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ece76fb6a40..164c56092e5 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext4_filetype_table[filetype]); } - +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + */ int __ext4_check_dir_entry(const char *function, unsigned int line, - struct inode *dir, + struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, unsigned int offset) @@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (rlen < EXT4_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else + return 0; - if (error_msg != NULL) - ext4_error_inode(dir, function, line, bh->b_blocknr, - 
"bad entry in directory: %s - " - "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), offset, - le32_to_cpu(de->inode), - rlen, de->name_len); - return error_msg == NULL ? 1 : 0; + if (filp) + ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + else + ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + + return 1; } static int ext4_readdir(struct file *filp, @@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - EXT4_ERROR_INODE(inode, "directory " - "contains a hole at offset %Lu", + EXT4_ERROR_FILE(filp, 0, + "directory contains a " + "hole at offset %llu", (unsigned long long) filp->f_pos); dir_has_error = 1; } @@ -194,8 +210,8 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry(inode, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, filp, de, + bh, offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 94ce3d7a1c4..0c8d97b56f3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -62,8 +62,8 @@ #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) -#define EXT4_ERROR_FILE(file, fmt, a...) \ - ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) +#define EXT4_ERROR_FILE(file, block, fmt, a...) 
\ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -561,23 +561,7 @@ struct ext4_new_group_data { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif - -/* - * Mount options - */ -struct ext4_mount_options { - unsigned long s_mount_opt; - uid_t s_resuid; - gid_t s_resgid; - unsigned long s_commit_interval; - u32 s_min_batch_time, s_max_batch_time; -#ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; -#endif -}; - -/* Max physical block we can addres w/o extents */ +/* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* @@ -709,6 +693,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ ext4_decode_extra_time(&(inode)->xtime, \ raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ @@ -719,6 +705,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ ext4_decode_extra_time(&(einode)->xtime, \ raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ } while (0) #define i_disk_version osd1.linux1.l_i_version @@ -750,12 +738,13 @@ do { \ /* * storage for cached extent + * If ec_len == 0, then the cache is invalid. + * If ec_start == 0, then the cache represents a gap (null mapping) */ struct ext4_ext_cache { ext4_fsblk_t ec_start; ext4_lblk_t ec_block; __u32 ec_len; /* must be 32bit to return holes */ - __u32 ec_type; }; /* @@ -774,10 +763,12 @@ struct ext4_inode_info { * near to their parent directory's inode. 
*/ ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ +#endif unsigned long i_flags; - ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file @@ -820,7 +811,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; - struct jbd2_inode jinode; + struct jbd2_inode *jinode; struct ext4_ext_cache i_cached_extent; /* @@ -840,14 +831,12 @@ struct ext4_inode_info { unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; - unsigned short i_delalloc_reserved_flag; - sector_t i_da_metadata_calc_last_lblock; + ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; /* on-disk additional length */ __u16 i_extra_isize; - spinlock_t i_block_reservation_lock; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -856,9 +845,11 @@ struct ext4_inode_info { /* completed IOs that might need unwritten extents handling */ struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; - atomic_t i_ioend_count; /* Number of outstanding io_end structs */ + + spinlock_t i_block_reservation_lock; /* * Transactions that contain inode's metadata needed to complete @@ -917,11 +908,20 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ -#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt -#define set_opt(o, opt) o |= EXT4_MOUNT_##opt +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt #define 
test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + #define ext4_set_bit ext2_set_bit #define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_clear_bit ext2_clear_bit @@ -1087,6 +1087,7 @@ struct ext4_sb_info { struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ struct buffer_head **s_group_desc; unsigned int s_mount_opt; + unsigned int s_mount_opt2; unsigned int s_mount_flags; ext4_fsblk_t s_sb_block; uid_t s_resuid; @@ -1237,24 +1238,39 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ }; -#define EXT4_INODE_BIT_FNS(name, field) \ +#define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ - return test_bit(bit, &EXT4_I(inode)->i_##field); \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ - set_bit(bit, &EXT4_I(inode)->i_##field); \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ - clear_bit(bit, &EXT4_I(inode)->i_##field); \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } -EXT4_INODE_BIT_FNS(flag, flags) -EXT4_INODE_BIT_FNS(state, state_flags) +EXT4_INODE_BIT_FNS(flag, flags, 0) +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void 
ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1642,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, de, bh, offset) \ - __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1653,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ extern int ext4_sync_file(struct file *, int); +extern int ext4_flush_completed_IO(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -1752,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, const char *, ...) __attribute__ ((format (printf, 5, 6))); extern void ext4_error_file(struct file *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); + ext4_fsblk_t, const char *, ...) 
+ __attribute__ ((format (printf, 5, 6))); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern void __ext4_abort(struct super_block *, const char *, unsigned int, @@ -2046,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); -extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 28ce70fd9cd..2e29abb30f7 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -119,10 +119,6 @@ struct ext4_ext_path { * structure for external API */ -#define EXT4_EXT_CACHE_NO 0 -#define EXT4_EXT_CACHE_GAP 1 -#define EXT4_EXT_CACHE_EXTENT 2 - /* * to be called by ext4_ext_walk_space() * negative retcode - error @@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode) static inline void ext4_ext_invalidate_cache(struct inode *inode) { - EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; + EXT4_I(inode)->i_cached_extent.ec_len = 0; } static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) @@ -278,7 +274,7 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, } extern int ext4_ext_calc_metadata_amount(struct inode *inode, - sector_t lblocks); + ext4_lblk_t lblocks); extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b0bd792c58c..d8b992e658c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal) static 
inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); return 0; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0554c48cb1f..63a75810b7c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -117,11 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_extent *ex; depth = path->p_depth; - /* try to predict block placement */ + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object sections out-of-order but in a way + * the eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some hueristics or some way to allow + * userspace to pass a hint to file system, + * especiially if the latter case turns out to be + * common. 
+ */ ex = path[depth].p_ext; - if (ex) - return (ext4_ext_pblock(ex) + - (block - le32_to_cpu(ex->ee_block))); + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } /* it looks like index is empty; * try to find starting block from index itself */ @@ -244,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) * to allocate @blocks * Worse case is one block per extent */ -int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); int idxs, num = 0; @@ -1872,12 +1894,10 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_block = start; cbex.ec_len = end - start; cbex.ec_start = 0; - cbex.ec_type = EXT4_EXT_CACHE_GAP; } else { cbex.ec_block = le32_to_cpu(ex->ee_block); cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext4_ext_pblock(ex); - cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } if (unlikely(cbex.ec_len == 0)) { @@ -1917,13 +1937,12 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start, int type) + __u32 len, ext4_fsblk_t start) { struct ext4_ext_cache *cex; BUG_ON(len == 0); spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_type = type; cex->ec_block = block; cex->ec_len = len; cex->ec_start = start; @@ -1976,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); + ext4_ext_put_in_cache(inode, lblock, len, 0); } +/* + * Return 0 if cache is invalid; 1 if the cache is valid 
+ */ static int ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; - int ret = EXT4_EXT_CACHE_NO; + int ret = 0; /* * We borrow i_block_reservation_lock to protect i_cached_extent @@ -1993,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, cex = &EXT4_I(inode)->i_cached_extent; /* has cache valid data? */ - if (cex->ec_type == EXT4_EXT_CACHE_NO) + if (cex->ec_len == 0) goto errout; - BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && - cex->ec_type != EXT4_EXT_CACHE_EXTENT); if (in_range(block, cex->ec_block, cex->ec_len)) { ex->ee_block = cpu_to_le32(cex->ec_block); ext4_ext_store_pblock(ex, cex->ec_start); @@ -2005,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); - ret = cex->ec_type; + ret = 1; } errout: spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -2825,14 +2845,14 @@ fix_extent_len: * to an uninitialized extent. * * Writing to an uninitized extent may result in splitting the uninitialized - * extent into multiple /intialized unintialized extents (up to three) + * extent into multiple /initialized uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * One of more index blocks maybe needed if the extent tree grow after - * the unintialized extent split. To prevent ENOSPC occur at the IO + * the uninitialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submit * the IO. The uninitialized extent called at this time will be split * into three uninitialized extent(at most). 
After IO complete, the part @@ -3082,7 +3102,7 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev, * Handle EOFBLOCKS_FL flag, clearing it if necessary */ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, + ext4_lblk_t lblk, struct ext4_ext_path *path, unsigned int len) { @@ -3112,7 +3132,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, * this turns out to be false, we can bail out from this * function immediately. */ - if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) + + if (lblk + len < le32_to_cpu(last_ex->ee_block) + ext4_ext_get_actual_len(last_ex)) return 0; /* @@ -3168,8 +3188,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map, path, - map->m_len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); } else err = ret; goto out2; @@ -3199,7 +3219,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, map, path); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map, path, map->m_len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); if (err < 0) goto out2; } @@ -3276,7 +3297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent_header *eh; struct ext4_extent newex, *ex; ext4_fsblk_t newblock; - int err = 0, depth, ret, cache_type; + int err = 0, depth, ret; unsigned int allocated = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; @@ -3285,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_lblk, map->m_len, inode->i_ino); /* check in cache */ - cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); - if (cache_type) { - if 
(cache_type == EXT4_EXT_CACHE_GAP) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and @@ -3296,7 +3316,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; } /* we should allocate requested block */ - } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { + } else { /* block is already allocated */ newblock = map->m_lblk - le32_to_cpu(newex.ee_block) @@ -3305,8 +3325,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, allocated = ext4_ext_get_actual_len(&newex) - (map->m_lblk - le32_to_cpu(newex.ee_block)); goto out; - } else { - BUG(); } } @@ -3357,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* Do not put uninitialized extent in the cache */ if (!ext4_ext_is_uninitialized(ex)) { ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start, - EXT4_EXT_CACHE_EXTENT); + ee_len, ee_start); goto out; } ret = ext4_ext_handle_uninitialized_extents(handle, @@ -3456,7 +3473,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_flags |= EXT4_MAP_UNINIT; } - err = check_eofblocks_fl(handle, inode, map, path, ar.len); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); if (err) goto out2; @@ -3490,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * when it is _not_ an uninitialized extent. 
*/ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); ext4_update_inode_fsync_trans(handle, inode, 1); } else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -3519,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode) int err = 0; /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_completed_IO(inode); + + /* * probably first extent we're gonna free will be last in block */ err = ext4_writepage_trans_blocks(inode); @@ -3605,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode, } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). 
*/ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -3622,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + /* * currently supporting (pre)allocate mode for extent-based * files _only_ @@ -3629,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because @@ -3767,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, logical = (__u64)newex->ec_block << blksize_bits; - if (newex->ec_type == EXT4_EXT_CACHE_GAP) { + if (newex->ec_start == 0) { pgoff_t offset; struct page *page; struct buffer_head *bh = NULL; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5a5c55ddcee..2e8322c8aa8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -127,6 +128,27 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_mark_super_dirty(sb); } } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (sbi->s_journal && 
!ei->jinode && (filp->f_mode & FMODE_WRITE)) { + struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); + + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + } return dquot_file_open(inode, filp); } @@ -188,6 +210,7 @@ const struct file_operations ext4_file_operations = { .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { @@ -201,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = { .removexattr = generic_removexattr, #endif .check_acl = ext4_check_acl, - .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index c1a7bc923cf..7829b287822 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode) * to written. * The function return the number of pending IOs on success. 
*/ -static int flush_completed_IO(struct inode *inode) +extern int ext4_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; struct ext4_inode_info *ei = EXT4_I(inode); @@ -169,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; - ret = flush_completed_IO(inode); + ret = ext4_flush_completed_IO(inode); if (ret < 0) return ret; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1ce240a23eb..eb9097aec6f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1027,7 +1027,7 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e659597b690..9f7f9e49914 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -39,7 +39,9 @@ #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> +#include <linux/printk.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -54,10 +56,17 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); - return jbd2_journal_begin_ordered_truncate( - EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode, - new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. 
+ */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); @@ -552,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, } /** - * ext4_blks_to_allocate: Look up the block map and count the number + * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * * @branch: chain of indirect blocks @@ -591,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, /** * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation * @indirect_blks: the number of blocks need to allocate for indirect * blocks - * + * @blks: number of desired blocks * @new_blocks: on return it will store the new block numbers for * the indirect blocks(if needed) and the first direct block, - * @blks: on return it will store the total number of allocated - * direct blocks + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, ext4_fsblk_t goal, @@ -711,9 +726,11 @@ failed_out: /** * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction * @inode: owner * @indirect_blks: number of allocated indirect blocks * @blks: number of allocated direct blocks + * @goal: preferred place for allocation * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. 
* @@ -826,6 +843,7 @@ failed: /** * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction * @inode: owner * @block: (logical) number of block we are adding * @chain: chain of indirect blocks (with a missing link - see @@ -1081,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, * Calculate the number of metadata blocks need to reserve * to allocate a block located at @lblock */ -static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) +static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); @@ -1320,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * avoid double accounting */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - EXT4_I(inode)->i_delalloc_reserved_flag = 1; + ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -1350,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_da_update_reserve_space(inode, retval, 1); } if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - EXT4_I(inode)->i_delalloc_reserved_flag = 0; + ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -1878,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file, /* * Reserve a single block located at lblock */ -static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2239,7 +2257,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) * affects functions in many different parts of the allocation * call path. 
This flag exists primarily because we don't * want to change *many* call functions, so ext4_map_blocks() - * will set the magic i_delalloc_reserved_flag once the + * will set the EXT4_STATE_DELALLOC_RESERVED flag once the * inode's allocation semaphore is taken. * * If the blocks in questions were delalloc blocks, set @@ -3362,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of - * the pages by calling redirty_page_for_writeback() but that + * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them becuase we wouldn't actually intend to @@ -3720,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) retry: io_end = ext4_init_io_end(inode, GFP_ATOMIC); if (!io_end) { - if (printk_ratelimit()) - printk(KERN_WARNING "%s: allocation fail\n", __func__); + pr_warn_ratelimited("%s: allocation fail\n", __func__); schedule(); goto retry; } @@ -3745,9 +3762,9 @@ retry: * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as unintialized + * For holes, we fallocate those blocks, mark them as uninitialized * If those blocks were preallocated, we mark sure they are splited, but - * still keep the range to write as unintialized. + * still keep the range to write as uninitialized. * * The unwrritten extents will be converted to written when DIO is completed. 
* For async direct IO, since the IO may still pending when return, we @@ -4045,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle, if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); } else { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } @@ -4169,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, { __le32 *p; int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; @@ -4184,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; + } + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; + } + err = ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; } - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext4_journal_get_write_access(handle, bh); @@ -4349,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); + brelse(bh); /* * Everything below this this pointer has been @@ -4859,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - ei->i_state_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 
32-bit archs */ ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -5118,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5b4d4e3a4d5..851f49b2f9d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2608,18 +2608,12 @@ int ext4_mb_release(struct super_block *sb) static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t block, int count) { - int ret; ext4_fsblk_t discard_block; discard_block = block + ext4_group_first_block_no(sb, block_group); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); - if (ret == -EOPNOTSUPP) { - ext4_warning(sb, "discard not supported, disabling"); - clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); - } - return ret; + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } /* @@ -2631,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - int err, count = 0, count2 = 0; + int err, ret, count = 0, count2 = 0; struct ext4_free_data *entry; struct list_head *l, *ltmp; @@ -2641,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, + if (test_opt(sb, DISCARD)) { + ret = 
ext4_issue_discard(sb, entry->group, entry->start_blk, entry->count); + if (unlikely(ret == -EOPNOTSUPP)) { + ext4_warning(sb, "discard not supported, " + "disabling"); + clear_opt(sb, DISCARD); + } + } err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -3881,19 +3881,6 @@ repeat: } } -/* - * finds all preallocated spaces and return blocks being freed to them - * if preallocated space becomes full (no block is used from the space) - * then the function frees space in buddy - * XXX: at the moment, truncate (which is the only way to free blocks) - * discards all preallocations - */ -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, - sector_t block, int count) -{ - BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); -} #ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { @@ -4283,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * EDQUOT check, as blocks and quotas have been already * reserved when data being copied into pagecache. */ - if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) ar->flags |= EXT4_MB_DELALLOC_RESERVED; else { /* Without delayed allocation we need to verify @@ -4380,7 +4367,8 @@ out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, inquota - ar->len); if (!ar->len) { - if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); @@ -4626,7 +4614,11 @@ do_more: * blocks being freed are metadata. 
these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + if (!new_entry) { + err = -ENOMEM; + goto error_return; + } new_entry->start_blk = bit; new_entry->group = block_group; new_entry->count = count; @@ -4643,7 +4635,6 @@ do_more: ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); - ext4_mb_return_to_preallocation(inode, &e4b, block, count); } ret = ext4_free_blks_count(sb, gdp) + count; @@ -4718,8 +4709,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count); - if (ret) - ext4_std_error(sb, ret); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); @@ -4819,6 +4808,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; uint64_t start, len, minlen, trimmed; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -4828,6 +4819,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; + if (start < first_data_blk) { + len -= first_data_blk - start; + start = first_data_blk; + } /* Determine first and last group to examine based on start and len */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, @@ -4851,7 +4846,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (len >= EXT4_BLOCKS_PER_GROUP(sb)) len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); else - last_block = len; + last_block = first_block + len; if (e4b.bd_info->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, &e4b, first_block, diff --git 
a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 25f3a974b72..b0a126f23c2 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode) goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, 0, goal); + S_IFREG, NULL, goal); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dc40e75cba8..5485390d32c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file, dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (!ext4_check_dir_entry(dir, de, bh, - (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) - +((char *)de - bh->b_data))) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. 
*/ dir_file->f_pos = (dir_file->f_pos | (dir->i_sb->s_blocksize - 1)) + 1; @@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1036,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { + if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ESTALE) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1269,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { - if (!ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -1602,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; + } } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1630,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle, { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int i; + int i, err; i = 0; pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size) { - if (!ext4_check_dir_entry(dir, de, bh, i)) + if (ext4_check_dir_entry(dir, NULL, de, bh, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); - ext4_journal_get_write_access(handle, bh); + err = 
ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -1652,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -1789,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) { handle_t *handle; struct inode *inode; - struct buffer_head *dir_block; + struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; @@ -1822,7 +1834,9 @@ retry: if (!dir_block) goto out_clear_inode; BUFFER_TRACE(dir_block, "get_write_access"); - ext4_journal_get_write_access(handle, dir_block); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; de = (struct ext4_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; @@ -1839,10 +1853,12 @@ retry: ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, dir_block); - brelse(dir_block); - ext4_mark_inode_dirty(handle, inode); - err = ext4_add_entry(handle, dentry, inode); + err = ext4_handle_dirty_metadata(handle, dir, dir_block); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); @@ -1853,10 +1869,13 @@ out_clear_inode: } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + err = 
ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; d_instantiate(dentry, inode); unlock_new_inode(inode); out_stop: + brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -1919,7 +1938,7 @@ static int empty_dir(struct inode *inode) } de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (!ext4_check_dir_entry(inode, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; @@ -2407,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, new_dir, new_bh); + retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } brelse(new_bh); new_bh = NULL; } @@ -2459,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + if (retval) { + ext4_std_error(old_dir->i_sb, retval); + goto end_rename; + } ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index beacce11ac5..7270dcfca92 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -44,7 +44,7 @@ int __init ext4_init_pageio(void) if (io_page_cachep == NULL) return -ENOMEM; io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) { + if (io_end_cachep == NULL) { 
kmem_cache_destroy(io_page_cachep); return -ENOMEM; } @@ -158,11 +158,8 @@ static void ext4_end_io_work(struct work_struct *work) ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { - ext4_io_end_t *io = NULL; - - io = kmem_cache_alloc(io_end_cachep, flags); + ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); if (io) { - memset(io, 0, sizeof(*io)); atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_WORK(&io->work, ext4_end_io_work); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 981c8477ada..3ecc6e45d2f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -220,7 +220,11 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext4_handle_dirty_metadata(handle, NULL, gdb); + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto exit_bh; + } ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -258,7 +262,11 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_bh; + } brelse(bh); /* Mark unused entries in inode bitmap used */ ext4_debug("clear inode bitmap %#04llx (+%llu)\n", @@ -270,7 +278,9 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); exit_bh: brelse(bh); @@ -422,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dind; } - if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) + err = ext4_journal_get_write_access(handle, 
EXT4_SB(sb)->s_sbh); + if (unlikely(err)) goto exit_dind; - if ((err = ext4_journal_get_write_access(handle, *primary))) + err = ext4_journal_get_write_access(handle, *primary); + if (unlikely(err)) goto exit_sbh; - if ((err = ext4_journal_get_write_access(handle, dind))) - goto exit_primary; + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); /* ext4_reserve_inode_write() gets a reference on the iloc */ - if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) goto exit_dindj; n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -454,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - ext4_handle_dirty_metadata(handle, NULL, dind); - brelse(dind); + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); memset((*primary)->b_data, 0, sb->s_blocksize); - ext4_handle_dirty_metadata(handle, NULL, *primary); + err = ext4_handle_dirty_metadata(handle, NULL, *primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -470,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + if (err) + ext4_std_error(sb, err); - return 0; + return err; exit_inode: /* ext4_journal_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: /* ext4_journal_release_buffer(handle, dind); */ 
-exit_primary: - /* ext4_journal_release_buffer(handle, *primary); */ exit_sbh: - /* ext4_journal_release_buffer(handle, *primary); */ + /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -665,7 +687,9 @@ static void update_backups(struct super_block *sb, memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); brelse(bh); } if ((err2 = ext4_journal_stop(handle)) && !err) @@ -883,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext4_handle_dirty_metadata(handle, NULL, primary); + err = ext4_handle_dirty_metadata(handle, NULL, primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_journal; + } /* Update the reserved block counts only once the new group is * active. */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb15c9c0be7..48ce561fafa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -388,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb) void __ext4_error(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", - sb->s_id, function, line, current->comm); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); va_end(args); ext4_handle_error(sb); @@ -405,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function, const char *fmt, ...) 
{ va_list args; + struct va_format vaf; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); save_error_info(inode->i_sb, function, line); va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", inode->i_sb->s_id, function, line, inode->i_ino); if (block) - printk("block %llu: ", block); - printk("comm %s: ", current->comm); - vprintk(fmt, args); - printk("\n"); + printk(KERN_CONT "block %llu: ", block); + printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); va_end(args); ext4_handle_error(inode->i_sb); } void ext4_error_file(struct file *file, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; + struct va_format vaf; struct ext4_super_block *es; struct inode *inode = file->f_dentry->d_inode; char pathname[80], *path; @@ -434,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); save_error_info(inode->i_sb, function, line); - va_start(args, fmt); path = d_path(&(file->f_path), pathname, sizeof(pathname)); - if (!path) + if (IS_ERR(path)) path = "(unknown)"; printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu " - "(comm %s path %s): ", - inode->i_sb->s_id, function, line, inode->i_ino, - current->comm, path); - vprintk(fmt, args); - printk("\n"); + "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk(KERN_CONT "block %llu: ", block); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -543,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function, panic("EXT4-fs panic from 
previous error\n"); } -void ext4_msg (struct super_block * sb, const char *prefix, - const char *fmt, ...) +void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT4-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); va_end(args); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", - sb->s_id, function, line); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); va_end(args); } @@ -575,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line, __releases(bitlock) __acquires(bitlock) { + struct va_format vaf; va_list args; struct ext4_super_block *es = EXT4_SB(sb)->s_es; es->s_last_error_ino = cpu_to_le32(ino); es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", sb->s_id, function, line, grp); if (ino) - printk("inode %lu: ", ino); + printk(KERN_CONT "inode %lu: ", ino); if (block) - printk("block %llu:", (unsigned long long) block); - vprintk(fmt, args); - printk("\n"); + printk(KERN_CONT "block %llu:", (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); va_end(args); if (test_opt(sb, ERRORS_CONT)) { @@ -647,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if 
(IS_ERR(bdev)) goto fail; return bdev; @@ -663,8 +673,7 @@ fail: */ static int ext4_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext4_blkdev_remove(struct ext4_sb_info *sbi) @@ -808,21 +817,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); - /* - * Note: We can be called before EXT4_SB(sb)->s_journal is set, - * therefore it can be null here. Don't check it, just initialize - * jinode. - */ - jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; - ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif + ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_completed_io_list); spin_lock_init(&ei->i_completed_io_lock); ei->cur_aio_dio = NULL; @@ -841,6 +844,13 @@ static int ext4_drop_inode(struct inode *inode) return drop; } +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + static void ext4_destroy_inode(struct inode *inode) { ext4_ioend_wait(inode); @@ -853,7 +863,7 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } - kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); + call_rcu(&inode->i_rcu, ext4_i_callback); } static void init_once(void *foo) @@ -891,9 +901,12 @@ void ext4_clear_inode(struct inode *inode) end_writeback(inode); dquot_drop(inode); ext4_discard_preallocations(inode); - if (EXT4_JOURNAL(inode)) - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, - 
&EXT4_I(inode)->jinode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } } static inline void ext4_show_quota_options(struct seq_file *seq, @@ -1148,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path); + struct path *path); static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, @@ -1386,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) sbi->s_qf_names[qtype] = NULL; return 0; } - set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sb, QUOTA); return 1; } @@ -1441,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb, switch (token) { case Opt_bsd_df: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sbi->s_mount_opt, MINIX_DF); + clear_opt(sb, MINIX_DF); break; case Opt_minix_df: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sbi->s_mount_opt, MINIX_DF); + set_opt(sb, MINIX_DF); break; case Opt_grpid: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); break; case Opt_nogrpid: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sbi->s_mount_opt, GRPID); + clear_opt(sb, GRPID); break; case Opt_resuid: @@ -1473,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb, /* *sb_block = match_int(&args[0]); */ break; case Opt_err_panic: - clear_opt(sbi->s_mount_opt, ERRORS_CONT); - clear_opt(sbi->s_mount_opt, ERRORS_RO); - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + clear_opt(sb, 
ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + set_opt(sb, ERRORS_PANIC); break; case Opt_err_ro: - clear_opt(sbi->s_mount_opt, ERRORS_CONT); - clear_opt(sbi->s_mount_opt, ERRORS_PANIC); - set_opt(sbi->s_mount_opt, ERRORS_RO); + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_RO); break; case Opt_err_cont: - clear_opt(sbi->s_mount_opt, ERRORS_RO); - clear_opt(sbi->s_mount_opt, ERRORS_PANIC); - set_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_CONT); break; case Opt_nouid32: - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(sb, NO_UID32); break; case Opt_debug: - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); break; case Opt_oldalloc: - set_opt(sbi->s_mount_opt, OLDALLOC); + set_opt(sb, OLDALLOC); break; case Opt_orlov: - clear_opt(sbi->s_mount_opt, OLDALLOC); + clear_opt(sb, OLDALLOC); break; #ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(sb, XATTR_USER); break; case Opt_nouser_xattr: - clear_opt(sbi->s_mount_opt, XATTR_USER); + clear_opt(sb, XATTR_USER); break; #else case Opt_user_xattr: @@ -1514,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb, #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL case Opt_acl: - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, POSIX_ACL); break; case Opt_noacl: - clear_opt(sbi->s_mount_opt, POSIX_ACL); + clear_opt(sb, POSIX_ACL); break; #else case Opt_acl: @@ -1536,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb, "Cannot specify journal on remount"); return 0; } - set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); + set_opt(sb, UPDATE_JOURNAL); break; case Opt_journal_dev: if (is_remount) { @@ -1549,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + set_opt(sb, JOURNAL_CHECKSUM); break; case 
Opt_journal_async_commit: - set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + set_opt(sb, JOURNAL_ASYNC_COMMIT); + set_opt(sb, JOURNAL_CHECKSUM); break; case Opt_noload: - set_opt(sbi->s_mount_opt, NOLOAD); + set_opt(sb, NOLOAD); break; case Opt_commit: if (match_int(&args[0], &option)) @@ -1599,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); + clear_opt(sb, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; case Opt_data_err_abort: - set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + set_opt(sb, DATA_ERR_ABORT); break; case Opt_data_err_ignore: - clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + clear_opt(sb, DATA_ERR_ABORT); break; #ifdef CONFIG_QUOTA case Opt_usrjquota: @@ -1647,12 +1660,12 @@ set_qf_format: break; case Opt_quota: case Opt_usrquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, USRQUOTA); + set_opt(sb, QUOTA); + set_opt(sb, USRQUOTA); break; case Opt_grpquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, GRPQUOTA); + set_opt(sb, QUOTA); + set_opt(sb, GRPQUOTA); break; case Opt_noquota: if (sb_any_quota_loaded(sb)) { @@ -1660,9 +1673,9 @@ set_qf_format: "options when quota turned on"); return 0; } - clear_opt(sbi->s_mount_opt, QUOTA); - clear_opt(sbi->s_mount_opt, USRQUOTA); - clear_opt(sbi->s_mount_opt, GRPQUOTA); + clear_opt(sb, QUOTA); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); break; #else case Opt_quota: @@ -1688,7 +1701,7 @@ set_qf_format: sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; break; case Opt_nobarrier: - clear_opt(sbi->s_mount_opt, BARRIER); + clear_opt(sb, BARRIER); break; case Opt_barrier: if (args[0].from) { @@ -1697,9 +1710,9 @@ set_qf_format: } else option = 1; /* No argument, default to 1 */ if (option) - set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sb, BARRIER); else - clear_opt(sbi->s_mount_opt, BARRIER); + clear_opt(sb, BARRIER); break; case 
Opt_ignore: break; @@ -1723,17 +1736,17 @@ set_qf_format: "Ignoring deprecated bh option"); break; case Opt_i_version: - set_opt(sbi->s_mount_opt, I_VERSION); + set_opt(sb, I_VERSION); sb->s_flags |= MS_I_VERSION; break; case Opt_nodelalloc: - clear_opt(sbi->s_mount_opt, DELALLOC); + clear_opt(sb, DELALLOC); break; case Opt_mblk_io_submit: - set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + set_opt(sb, MBLK_IO_SUBMIT); break; case Opt_nomblk_io_submit: - clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + clear_opt(sb, MBLK_IO_SUBMIT); break; case Opt_stripe: if (match_int(&args[0], &option)) @@ -1743,13 +1756,13 @@ set_qf_format: sbi->s_stripe = option; break; case Opt_delalloc: - set_opt(sbi->s_mount_opt, DELALLOC); + set_opt(sb, DELALLOC); break; case Opt_block_validity: - set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + set_opt(sb, BLOCK_VALIDITY); break; case Opt_noblock_validity: - clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + clear_opt(sb, BLOCK_VALIDITY); break; case Opt_inode_readahead_blks: if (match_int(&args[0], &option)) @@ -1773,7 +1786,7 @@ set_qf_format: option); break; case Opt_noauto_da_alloc: - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + set_opt(sb, NO_AUTO_DA_ALLOC); break; case Opt_auto_da_alloc: if (args[0].from) { @@ -1782,24 +1795,24 @@ set_qf_format: } else option = 1; /* No argument, default to 1 */ if (option) - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + clear_opt(sb, NO_AUTO_DA_ALLOC); else - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + set_opt(sb,NO_AUTO_DA_ALLOC); break; case Opt_discard: - set_opt(sbi->s_mount_opt, DISCARD); + set_opt(sb, DISCARD); break; case Opt_nodiscard: - clear_opt(sbi->s_mount_opt, DISCARD); + clear_opt(sb, DISCARD); break; case Opt_dioread_nolock: - set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + set_opt(sb, DIOREAD_NOLOCK); break; case Opt_dioread_lock: - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); break; case Opt_init_inode_table: - set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + 
set_opt(sb, INIT_INODE_TABLE); if (args[0].from) { if (match_int(&args[0], &option)) return 0; @@ -1810,7 +1823,7 @@ set_qf_format: sbi->s_li_wait_mult = option; break; case Opt_noinit_inode_table: - clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + clear_opt(sb, INIT_INODE_TABLE); break; default: ext4_msg(sb, KERN_ERR, @@ -1822,10 +1835,10 @@ set_qf_format: #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) - clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sb, USRQUOTA); if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) - clear_opt(sbi->s_mount_opt, GRPQUOTA); + clear_opt(sb, GRPQUOTA); if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext4_msg(sb, KERN_ERR, "old and new quota " @@ -1895,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " - "bpg=%lu, ipg=%lu, mo=%04x]\n", + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), - sbi->s_mount_opt); + sbi->s_mount_opt, sbi->s_mount_opt2); return res; } @@ -1930,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb) size = flex_group_count * sizeof(struct flex_groups); sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - sbi->s_flex_groups = vmalloc(size); - if (sbi->s_flex_groups) - memset(sbi->s_flex_groups, 0, size); - } - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for " - "%u flex groups", flex_group_count); - goto failed; + sbi->s_flex_groups = vzalloc(size); + if (sbi->s_flex_groups == NULL) { + ext4_msg(sb, KERN_ERR, + "not enough memory for %u flex groups", + flex_group_count); + goto failed; + } } for (i = 0; i < sbi->s_groups_count; i++) { @@ -2916,7 +2928,7 @@ static int ext4_register_li_request(struct 
super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; - int ret; + int ret = 0; if (sbi->s_li_request != NULL) return 0; @@ -3071,41 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - set_opt(sbi->s_mount_opt, INIT_INODE_TABLE); + set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", "2.6.38"); - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); } if (def_mount_opts & EXT4_DEFM_UID16) - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(sb, NO_UID32); #ifdef CONFIG_EXT4_FS_XATTR if (def_mount_opts & EXT4_DEFM_XATTR_USER) - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(sb, XATTR_USER); #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL if (def_mount_opts & EXT4_DEFM_ACL) - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, POSIX_ACL); #endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) - set_opt(sbi->s_mount_opt, ERRORS_CONT); + set_opt(sb, ERRORS_CONT); else - set_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sb, ERRORS_RO); if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) - set_opt(sbi->s_mount_opt, 
BLOCK_VALIDITY); + set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) - set_opt(sbi->s_mount_opt, DISCARD); + set_opt(sb, DISCARD); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -3114,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) - set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sb, BARRIER); /* * enable delayed allocation by default @@ -3122,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (!IS_EXT3_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) - set_opt(sbi->s_mount_opt, DELALLOC); + set_opt(sb, DELALLOC); if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, &journal_devnum, &journal_ioprio, NULL, 0)) { @@ -3425,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "suppressed and not mounted read-only"); goto failed_mount_wq; } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + clear_opt(sb, DATA_FLAGS); + set_opt(sb, WRITEBACK_DATA); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -3464,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else - set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); break; case EXT4_MOUNT_ORDERED_DATA: @@ -3556,18 +3568,18 @@ no_journal: (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " "requested data journaling mode"); - clear_opt(sbi->s_mount_opt, DELALLOC); + clear_opt(sb, DELALLOC); } if (test_opt(sb, DIOREAD_NOLOCK)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 
ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " "option - requested data journaling mode"); - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); } if (sb->s_blocksize < PAGE_SIZE) { ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " "option - block size is too small"); - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); } } @@ -3765,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext4_msg(sb, KERN_ERR, - "failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -4166,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb) return 0; } +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + unsigned long s_mount_opt2; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; @@ -4186,6 +4207,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; @@ -4339,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = 
old_opts.s_commit_interval; @@ -4535,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { - path_put(&path); + if (path->mnt->mnt_sb != sb) return -EXDEV; - } /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); @@ -4566,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * all updates to the file when we bypass pagecache... */ if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(path.dentry->d_inode)) { + ext4_should_journal_data(path->dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... 
@@ -4574,15 +4590,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - if (err) { - path_put(&path); + if (err) return err; - } } - err = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - return err; + return dquot_quota_on(sb, type, format_id, path); } static int ext4_quota_off(struct super_block *sb, int type) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fa4b899da4b..fc32176eee3 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -427,23 +427,23 @@ cleanup: static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - int i_error, b_error; + int ret, ret2; down_read(&EXT4_I(dentry->d_inode)->xattr_sem); - i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); - if (i_error < 0) { - b_error = 0; - } else { - if (buffer) { - buffer += i_error; - buffer_size -= i_error; - } - b_error = ext4_xattr_block_list(dentry, buffer, buffer_size); - if (b_error < 0) - i_error = 0; + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: up_read(&EXT4_I(dentry->d_inode)->xattr_sem); - return i_error + b_error; + return ret; } /* @@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext4_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. 
The flags XATTR_REPLACE and XATTR_CREATE diff --git a/fs/fat/fat.h b/fs/fat/fat.h index d75a77f85c2..f50408901f7 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos); extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, void *data, int silent, - const struct inode_operations *fs_dir_inode_ops, int isvfat); + const struct inode_operations *fs_dir_inode_ops, + int isvfat, void (*setup)(struct super_block *)); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index ad6998a92c3..86753fe10bd 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -514,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void fat_destroy_inode(struct inode *inode) +static void fat_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } +static void fat_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, fat_i_callback); +} + static void init_once(void *foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; @@ -696,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode = NULL; - struct dentry *result; u32 *fh = fid->raw; if (fh_len < 5 || fh_type != 3) @@ -741,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb, * the fat_iget lookup again. If that fails, then we are totally out * of luck. 
But all that is for another day */ - result = d_obtain_alias(inode); - if (!IS_ERR(result)) - result->d_op = sb->s_root->d_op; - return result; + return d_obtain_alias(inode); } static int @@ -792,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child) brelse(bh); parent = d_obtain_alias(inode); - if (!IS_ERR(parent)) - parent->d_op = sb->s_root->d_op; out: unlock_super(sb); @@ -1237,7 +1238,8 @@ static int fat_read_root(struct inode *inode) * Read the super block of an MS-DOS FS. */ int fat_fill_super(struct super_block *sb, void *data, int silent, - const struct inode_operations *fs_dir_inode_ops, int isvfat) + const struct inode_operations *fs_dir_inode_ops, int isvfat, + void (*setup)(struct super_block *)) { struct inode *root_inode = NULL, *fat_inode = NULL; struct buffer_head *bh; @@ -1273,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, if (error) goto out_fail; + setup(sb); /* flavour-specific stuff that needs options */ + error = -EIO; sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 3345aabd1dd..711499040eb 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len, * that the existing dentry can be used. The msdos fs routines will * return ENOENT or EINVAL as appropriate. */ -static int msdos_hash(struct dentry *dentry, struct qstr *qstr) +static int msdos_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; unsigned char msdos_name[MSDOS_NAME]; @@ -164,16 +165,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr) * Compare two msdos names. If either of the names are invalid, * we fall back to doing the standard name comparison. 
*/ -static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; + struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; int error; - error = msdos_format_name(a->name, a->len, a_msdos_name, options); + error = msdos_format_name(name->name, name->len, a_msdos_name, options); if (error) goto old_compare; - error = msdos_format_name(b->name, b->len, b_msdos_name, options); + error = msdos_format_name(str, len, b_msdos_name, options); if (error) goto old_compare; error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); @@ -182,8 +185,8 @@ out: old_compare: error = 1; - if (a->len == b->len) - error = memcmp(a->name, b->name, a->len); + if (name->len == len) + error = memcmp(name->name, str, len); goto out; } @@ -224,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, } out: unlock_super(sb); - dentry->d_op = &msdos_dentry_operations; - dentry = d_splice_alias(inode, dentry); - if (dentry) - dentry->d_op = &msdos_dentry_operations; - return dentry; + return d_splice_alias(inode, dentry); error: unlock_super(sb); @@ -658,21 +657,16 @@ static const struct inode_operations msdos_dir_inode_operations = { .getattr = fat_getattr, }; -static int msdos_fill_super(struct super_block *sb, void *data, int silent) +static void setup(struct super_block *sb) { - int res; - - lock_super(sb); - res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0); - if (res) { - unlock_super(sb); - return res; - } - + sb->s_d_op = &msdos_dentry_operations; sb->s_flags |= MS_NOATIME; - sb->s_root->d_op = &msdos_dentry_operations; - unlock_super(sb); - return 0; +} + +static int 
msdos_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, + 0, setup); } static struct dentry *msdos_mount(struct file_system_type *fs_type, diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index b936703b892..f88f752babd 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry) static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + /* This is not negative dentry. Always valid. */ if (dentry->d_inode) return 1; @@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + /* * This is not negative dentry. Always valid. * @@ -85,22 +91,26 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) } /* returns the length of a struct qstr, ignoring trailing dots */ -static unsigned int vfat_striptail_len(struct qstr *qstr) +static unsigned int __vfat_striptail_len(unsigned int len, const char *name) { - unsigned int len = qstr->len; - - while (len && qstr->name[len - 1] == '.') + while (len && name[len - 1] == '.') len--; return len; } +static unsigned int vfat_striptail_len(const struct qstr *qstr) +{ + return __vfat_striptail_len(qstr->len, qstr->name); +} + /* * Compute the hash for the vfat name corresponding to the dentry. * Note: if the name is invalid, we leave the hash code unchanged so * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. 
*/ -static int vfat_hash(struct dentry *dentry, struct qstr *qstr) +static int vfat_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); return 0; @@ -112,9 +122,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr) * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) +static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { - struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; + struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; const unsigned char *name; unsigned int len; unsigned long hash; @@ -133,16 +144,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) /* * Case insensitive compare of two vfat names. */ -static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; + struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; unsigned int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ - alen = vfat_striptail_len(a); - blen = vfat_striptail_len(b); + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); if (alen == blen) { - if (nls_strnicmp(t, a->name, b->name, alen) == 0) + if (nls_strnicmp(t, name->name, str, alen) == 0) return 0; } return 1; @@ -151,15 +164,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) /* * Case sensitive compare of two vfat names. 
*/ -static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { unsigned int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ - alen = vfat_striptail_len(a); - blen = vfat_striptail_len(b); + alen = vfat_striptail_len(name); + blen = __vfat_striptail_len(len, str); if (alen == blen) { - if (strncmp(a->name, b->name, alen) == 0) + if (strncmp(name->name, str, alen) == 0) return 0; } return 1; @@ -757,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, out: unlock_super(sb); - dentry->d_op = sb->s_root->d_op; dentry->d_time = dentry->d_parent->d_inode->i_version; dentry = d_splice_alias(inode, dentry); - if (dentry) { - dentry->d_op = sb->s_root->d_op; + if (dentry) dentry->d_time = dentry->d_parent->d_inode->i_version; - } return dentry; error: @@ -1051,24 +1063,18 @@ static const struct inode_operations vfat_dir_inode_operations = { .getattr = fat_getattr, }; -static int vfat_fill_super(struct super_block *sb, void *data, int silent) +static void setup(struct super_block *sb) { - int res; - - lock_super(sb); - res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1); - if (res) { - unlock_super(sb); - return res; - } - if (MSDOS_SB(sb)->options.name_check != 's') - sb->s_root->d_op = &vfat_ci_dentry_ops; + sb->s_d_op = &vfat_ci_dentry_ops; else - sb->s_root->d_op = &vfat_dentry_ops; + sb->s_d_op = &vfat_dentry_ops; +} - unlock_super(sb); - return 0; +static int vfat_fill_super(struct super_block *sb, void *data, int silent) +{ + return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, + 1, setup); } static struct dentry *vfat_mount(struct file_system_type *fs_type, diff --git a/fs/file_table.c b/fs/file_table.c index c3dee381f1b..c3e89adf53c 100644 --- 
a/fs/file_table.c +++ b/fs/file_table.c @@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) struct files_struct *files = current->files; *fput_needed = 0; - if (likely((atomic_read(&files->count) == 1))) { + if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); } else { rcu_read_lock(); diff --git a/fs/filesystems.c b/fs/filesystems.c index 68ba492d8ee..751d6b255a1 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs) tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); + + synchronize_rcu(); + return -EINVAL; } diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 8c04eac5079..2ba6719ac61 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -337,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino) return ip; } +static void vxfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(vxfs_inode_cachep, inode->i_private); +} + /** * vxfs_evict_inode - remove inode from main memory * @ip: inode to discard. @@ -350,5 +357,5 @@ vxfs_evict_inode(struct inode *ip) { truncate_inode_pages(&ip->i_data, 0); end_writeback(ip); - kmem_cache_free(vxfs_inode_cachep, ip->i_private); + call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d06ccc953a..59c6e495678 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head) return list_entry(head, struct inode, i_wb_list); } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. 
*/ +static void bdi_wakeup_flusher(struct backing_dev_info *bdi) { - trace_writeback_queue(bdi, work); - - spin_lock_bh(&bdi->wb_lock); - list_add_tail(&work->list, &bdi->work_list); if (bdi->wb.task) { wake_up_process(bdi->wb.task); } else { @@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * The bdi thread isn't there, wake up the forker thread which * will create and run it. */ - trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } +} + +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + trace_writeback_queue(bdi, work); + + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (!bdi->wb.task) + trace_writeback_nothread(bdi, work); + bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); } static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, bool for_background) + bool range_cyclic) { struct wb_writeback_work *work; @@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; - work->for_background = for_background; bdi_queue_work(bdi, work); } @@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - __bdi_start_writeback(bdi, nr_pages, true, false); + __bdi_start_writeback(bdi, nr_pages, true); } /** @@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) * @bdi: the backing device to write from * * Description: - * This does WB_SYNC_NONE background writeback. The IO is only - * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * This makes sure WB_SYNC_NONE background writeback happens. 
When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. */ void bdi_start_background_writeback(struct backing_dev_info *bdi) { - __bdi_start_writeback(bdi, LONG_MAX, true, true); + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. + */ + trace_writeback_wake_background(bdi); + spin_lock_bh(&bdi->wb_lock); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; + long write_chunk; struct inode *inode; if (wbc.for_kupdate) { @@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.range_end = LLONG_MAX; } + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (wbc.sync_mode == WB_SYNC_NONE) + write_chunk = MAX_WRITEBACK_PAGES; + else + write_chunk = LONG_MAX; + wbc.wb_start = jiffies; /* livelock avoidance */ for (;;) { /* @@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb, break; /* + * Background writeout and kupdate-style writeback may + * run forever. Stop them if there is other work to do + * so that e.g. sync can proceed. They'll be restarted + * after the other works are all done. 
+ */ + if ((work->for_background || work->for_kupdate) && + !list_empty(&wb->bdi->work_list)) + break; + + /* * For background writeout, stop when we are below the * background dirty threshold */ @@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb, break; wbc.more_io = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; trace_wbc_writeback_start(&wbc, wb->bdi); @@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb, writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); - work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; /* * If we consumed everything, see if we have more @@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Did we write something? Try for more */ - if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) + if (wbc.nr_to_write < write_chunk) continue; /* * Nothing written. 
Wait for some inode to @@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void) get_nr_dirty_inodes(); } +static long wb_check_background_flush(struct bdi_writeback *wb) +{ + if (over_bground_thresh()) { + + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; @@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); + wrote += wb_check_background_flush(wb); clear_bit(BDI_writeback_running, &wb->bdi->state); return wrote; @@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, false); + __bdi_start_writeback(bdi, nr_pages, false); } rcu_read_unlock(); } @@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this - * super_block. The number of pages synced is returned. + * super_block. */ void sync_inodes_sb(struct super_block *sb) { @@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) EXPORT_SYMBOL(sync_inode); /** - * sync_inode - write an inode to disk + * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * - * Write an inode to disk and adjust it's dirty state after completion. + * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. 
*/ diff --git a/fs/fs_struct.c b/fs/fs_struct.c index ed45a9cf5f3..78b519c1353 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -4,6 +4,19 @@ #include <linux/path.h> #include <linux/slab.h> #include <linux/fs_struct.h> +#include "internal.h" + +static inline void path_get_longterm(struct path *path) +{ + path_get(path); + mnt_make_longterm(path->mnt); +} + +static inline void path_put_longterm(struct path *path) +{ + mnt_make_shortterm(path->mnt); + path_put(path); +} /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. @@ -14,12 +27,14 @@ void set_fs_root(struct fs_struct *fs, struct path *path) struct path old_root; spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); old_root = fs->root; fs->root = *path; - path_get(path); + path_get_longterm(path); + write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_root.dentry) - path_put(&old_root); + path_put_longterm(&old_root); } /* @@ -31,13 +46,15 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path) struct path old_pwd; spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; - path_get(path); + path_get_longterm(path); + write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_pwd.dentry) - path_put(&old_pwd); + path_put_longterm(&old_pwd); } void chroot_fs_refs(struct path *old_root, struct path *new_root) @@ -52,31 +69,33 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) fs = p->fs; if (fs) { spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); if (fs->root.dentry == old_root->dentry && fs->root.mnt == old_root->mnt) { - path_get(new_root); + path_get_longterm(new_root); fs->root = *new_root; count++; } if (fs->pwd.dentry == old_root->dentry && fs->pwd.mnt == old_root->mnt) { - path_get(new_root); + path_get_longterm(new_root); fs->pwd = *new_root; count++; } + write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); } task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); while (count--) - 
path_put(old_root); + path_put_longterm(old_root); } void free_fs_struct(struct fs_struct *fs) { - path_put(&fs->root); - path_put(&fs->pwd); + path_put_longterm(&fs->root); + path_put_longterm(&fs->pwd); kmem_cache_free(fs_cachep, fs); } @@ -88,8 +107,10 @@ void exit_fs(struct task_struct *tsk) int kill; task_lock(tsk); spin_lock(&fs->lock); + write_seqcount_begin(&fs->seq); tsk->fs = NULL; kill = !--fs->users; + write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); task_unlock(tsk); if (kill) @@ -105,8 +126,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) fs->users = 1; fs->in_exec = 0; spin_lock_init(&fs->lock); + seqcount_init(&fs->seq); fs->umask = old->umask; - get_fs_root_and_pwd(old, &fs->root, &fs->pwd); + + spin_lock(&old->lock); + fs->root = old->root; + path_get_longterm(&fs->root); + fs->pwd = old->pwd; + path_get_longterm(&fs->pwd); + spin_unlock(&old->lock); } return fs; } @@ -144,6 +172,7 @@ EXPORT_SYMBOL(current_umask); struct fs_struct init_fs = { .users = 1, .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), + .seq = SEQCNT_ZERO, .umask = 0022, }; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index b9f34eaede0..48a18f184d5 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ - if (object->n_ops > 0) { + if (object->n_ops > 1) { atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6e07696308d..cf8d28d1fba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) kill_fasync(&fc->fasync, SIGIO, POLL_IN); } +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup) +{ + forget->forget_one.nodeid = nodeid; + forget->forget_one.nlookup 
= nlookup; + + spin_lock(&fc->lock); + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_unlock(&fc->lock); +} + static void flush_bg_queue(struct fuse_conn *fc) { while (fc->active_background < fc->max_background && @@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) -{ - req->isreply = 0; - fuse_request_send_nowait(fc, req); -} - void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; @@ -896,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, return err; } +static int forget_pending(struct fuse_conn *fc) +{ + return fc->forget_list_head.next != NULL; +} + static int request_pending(struct fuse_conn *fc) { - return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); + return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || + forget_pending(fc); } /* Wait until a request is available on the pending list */ @@ -960,6 +974,120 @@ __releases(fc->lock) return err ? 
err : reqsize; } +static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, + unsigned max, + unsigned *countp) +{ + struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link **newhead = &head; + unsigned count; + + for (count = 0; *newhead != NULL && count < max; count++) + newhead = &(*newhead)->next; + + fc->forget_list_head.next = *newhead; + *newhead = NULL; + if (fc->forget_list_head.next == NULL) + fc->forget_list_tail = &fc->forget_list_head; + + if (countp != NULL) + *countp = count; + + return head; +} + +static int fuse_read_single_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + int err; + struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_in arg = { + .nlookup = forget->forget_one.nlookup, + }; + struct fuse_in_header ih = { + .opcode = FUSE_FORGET, + .nodeid = forget->forget_one.nodeid, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + spin_unlock(&fc->lock); + kfree(forget); + if (nbytes < ih.len) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_batch_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +__releases(fc->lock) +{ + int err; + unsigned max_forgets; + unsigned count; + struct fuse_forget_link *head; + struct fuse_batch_forget_in arg = { .count = 0 }; + struct fuse_in_header ih = { + .opcode = FUSE_BATCH_FORGET, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + if (nbytes < ih.len) { + spin_unlock(&fc->lock); + return -EINVAL; + } + + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); + head = dequeue_forget(fc, max_forgets, &count); + spin_unlock(&fc->lock); + + arg.count = count; + ih.len += count * sizeof(struct fuse_forget_one); + err = fuse_copy_one(cs, 
&ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + + while (head) { + struct fuse_forget_link *forget = head; + + if (!err) { + err = fuse_copy_one(cs, &forget->forget_one, + sizeof(forget->forget_one)); + } + head = forget->next; + kfree(forget); + } + + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fc, cs, nbytes); + else + return fuse_read_batch_forget(fc, cs, nbytes); +} + /* * Read a single request into the userspace filesystem's buffer. This * function waits until a request is available, then removes it from @@ -998,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, return fuse_read_interrupt(fc, cs, nbytes, req); } + if (forget_pending(fc)) { + if (list_empty(&fc->pending) || fc->forget_batch-- > 0) + return fuse_read_forget(fc, cs, nbytes); + + if (fc->forget_batch <= -8) + fc->forget_batch = 16; + } + req = list_entry(fc->pending.next, struct fuse_req, list); req->state = FUSE_REQ_READING; list_move(&req->list, &fc->io); @@ -1090,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!fc) return -EPERM; - bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; @@ -1626,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, if (!fc) return -EPERM; - bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; @@ -1770,6 +1906,8 @@ __acquires(fc->lock) flush_bg_queue(fc); end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); + while (forget_pending(fc)) + 
kfree(dequeue_forget(fc, 1, NULL)); } /* diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c9627c95482..bfed8447ed8 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -10,9 +10,9 @@ #include <linux/pagemap.h> #include <linux/file.h> -#include <linux/gfp.h> #include <linux/sched.h> #include <linux/namei.h> +#include <linux/slab.h> #if BITS_PER_LONG >= 64 static inline void fuse_dentry_settime(struct dentry *entry, u64 time) @@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) */ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { - struct inode *inode = entry->d_inode; + struct inode *inode; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = entry->d_inode; if (inode && is_bad_inode(inode)) return 0; else if (fuse_dentry_time(entry) < get_jiffies_64()) { @@ -165,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) struct fuse_entry_out outarg; struct fuse_conn *fc; struct fuse_req *req; - struct fuse_req *forget_req; + struct fuse_forget_link *forget; struct dentry *parent; u64 attr_version; @@ -178,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) if (IS_ERR(req)) return 0; - forget_req = fuse_get_req(fc); - if (IS_ERR(forget_req)) { + forget = fuse_alloc_forget(); + if (!forget) { fuse_put_request(fc, req); return 0; } @@ -199,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { - fuse_send_forget(fc, forget_req, - outarg.nodeid, 1); + fuse_queue_forget(fc, forget, outarg.nodeid, 1); return 0; } spin_lock(&fc->lock); fi->nlookup++; spin_unlock(&fc->lock); } - fuse_put_request(fc, forget_req); + kfree(forget); if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) return 0; @@ -259,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, { struct fuse_conn *fc = 
get_fuse_conn_super(sb); struct fuse_req *req; - struct fuse_req *forget_req; + struct fuse_forget_link *forget; u64 attr_version; int err; @@ -273,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, if (IS_ERR(req)) goto out; - forget_req = fuse_get_req(fc); - err = PTR_ERR(forget_req); - if (IS_ERR(forget_req)) { + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) { fuse_put_request(fc, req); goto out; } @@ -301,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, attr_version); err = -ENOMEM; if (!*inode) { - fuse_send_forget(fc, forget_req, outarg->nodeid, 1); + fuse_queue_forget(fc, forget, outarg->nodeid, 1); goto out; } err = 0; out_put_forget: - fuse_put_request(fc, forget_req); + kfree(forget); out: return err; } @@ -347,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, } entry = newent ? newent : entry; - entry->d_op = &fuse_dentry_operations; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else @@ -374,7 +376,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct inode *inode; struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_req *req; - struct fuse_req *forget_req; + struct fuse_forget_link *forget; struct fuse_create_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; @@ -388,9 +390,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (flags & O_DIRECT) return -EINVAL; - forget_req = fuse_get_req(fc); - if (IS_ERR(forget_req)) - return PTR_ERR(forget_req); + forget = fuse_alloc_forget(); + if (!forget) + return -ENOMEM; req = fuse_get_req(fc); err = PTR_ERR(req); @@ -448,10 +450,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(ff, flags); - fuse_send_forget(fc, forget_req, outentry.nodeid, 1); + fuse_queue_forget(fc, 
forget, outentry.nodeid, 1); return -ENOMEM; } - fuse_put_request(fc, forget_req); + kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_invalidate_attr(dir); @@ -469,7 +471,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, out_put_request: fuse_put_request(fc, req); out_put_forget_req: - fuse_put_request(fc, forget_req); + kfree(forget); return err; } @@ -483,12 +485,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, struct fuse_entry_out outarg; struct inode *inode; int err; - struct fuse_req *forget_req; + struct fuse_forget_link *forget; - forget_req = fuse_get_req(fc); - if (IS_ERR(forget_req)) { + forget = fuse_alloc_forget(); + if (!forget) { fuse_put_request(fc, req); - return PTR_ERR(forget_req); + return -ENOMEM; } memset(&outarg, 0, sizeof(outarg)); @@ -515,10 +517,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, entry_attr_timeout(&outarg), 0); if (!inode) { - fuse_send_forget(fc, forget_req, outarg.nodeid, 1); + fuse_queue_forget(fc, forget, outarg.nodeid, 1); return -ENOMEM; } - fuse_put_request(fc, forget_req); + kfree(forget); if (S_ISDIR(inode->i_mode)) { struct dentry *alias; @@ -541,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, return 0; out_put_forget_req: - fuse_put_request(fc, forget_req); + kfree(forget); return err; } @@ -981,12 +983,15 @@ static int fuse_access(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. 
*/ -static int fuse_permission(struct inode *inode, int mask) +static int fuse_permission(struct inode *inode, int mask, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + if (!fuse_allow_task(fc, current)) return -EACCES; @@ -1001,7 +1006,7 @@ static int fuse_permission(struct inode *inode, int mask) } if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, flags, NULL); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1009,7 +1014,8 @@ static int fuse_permission(struct inode *inode, int mask) if (err == -EACCES && !refreshed) { err = fuse_do_getattr(inode, NULL, NULL); if (!err) - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask, + flags, NULL); } /* Note: the opposite of the above test does not diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8b984a2cebb..95da1bc1c82 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1634,9 +1634,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, * and 64bit. Fortunately we can determine which structure the server * used from the size of the reply. 
*/ -static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, - size_t transferred, unsigned count, - bool is_compat) +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) { #ifdef CONFIG_COMPAT if (count * sizeof(struct compat_iovec) == transferred) { @@ -1680,6 +1680,42 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) return 0; } +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? */ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + /* * For ioctls, there is no generic way to determine how much memory * needs to be read and/or written. 
Furthermore, ioctls are allowed @@ -1740,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, struct fuse_ioctl_out outarg; struct fuse_req *req = NULL; struct page **pages = NULL; - struct page *iov_page = NULL; + struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; size_t in_size, out_size, transferred; int err; +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + /* assume all the iovs returned by client always fits in a page */ - BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); - iov_page = alloc_page(GFP_KERNEL); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; @@ -1760,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, * RETRY from server is not allowed. */ if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { - struct iovec *iov = page_address(iov_page); + struct iovec *iov = iov_page; iov->iov_base = (void __user *)arg; iov->iov_len = _IOC_SIZE(cmd); @@ -1841,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* did it ask for retry? 
*/ if (outarg.flags & FUSE_IOCTL_RETRY) { - char *vaddr; + void *vaddr; /* no retry if in restricted mode */ err = -EIO; @@ -1862,14 +1905,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; vaddr = kmap_atomic(pages[0], KM_USER0); - err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, + err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr, KM_USER0); if (err) goto out; - in_iov = page_address(iov_page); + in_iov = iov_page; out_iov = in_iov + in_iovs; err = fuse_verify_ioctl_iov(in_iov, in_iovs); @@ -1891,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, out: if (req) fuse_put_request(fc, req); - if (iov_page) - __free_page(iov_page); + free_page((unsigned long) iov_page); while (num_pages) __free_page(pages[--num_pages]); kfree(pages); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 57d4a3a0f10..ae5744a2f9e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -53,6 +53,12 @@ extern struct mutex fuse_mutex; extern unsigned max_user_bgreq; extern unsigned max_user_congthresh; +/* One forget request */ +struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -66,7 +72,7 @@ struct fuse_inode { u64 nlookup; /** The request used for sending the FORGET message */ - struct fuse_req *forget_req; + struct fuse_forget_link *forget; /** Time in jiffies until the file attributes are valid */ u64 i_time; @@ -255,7 +261,6 @@ struct fuse_req { /** Data for asynchronous requests */ union { - struct fuse_forget_in forget_in; struct { struct fuse_release_in in; struct path path; @@ -369,6 +374,13 @@ struct fuse_conn { /** Pending interrupts */ struct list_head interrupts; + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** 
Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + /** Flag indicating if connection is blocked. This will be the case before the INIT reply is received, and if there are too many outstading backgrounds requests */ @@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, /** * Send FORGET command */ -void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, - u64 nodeid, u64 nlookup); +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); /** * Initialize READ or READDIR request @@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); /** - * Send a request with no reply - */ -void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); - -/** * Send a request in the background */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cfce3ad86a9..9e3f68cc1bd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -71,6 +71,11 @@ struct fuse_mount_data { unsigned blksize; }; +struct fuse_forget_link *fuse_alloc_forget() +{ + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +} + static struct inode *fuse_alloc_inode(struct super_block *sb) { struct inode *inode; @@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); init_waitqueue_head(&fi->page_waitq); - fi->forget_req = fuse_request_alloc(); - if (!fi->forget_req) { + fi->forget = fuse_alloc_forget(); + if (!fi->forget) { kmem_cache_free(fuse_inode_cachep, inode); return NULL; } @@ -99,27 +104,20 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return inode; } -static void fuse_destroy_inode(struct inode *inode) +static 
void fuse_i_callback(struct rcu_head *head) { - struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!list_empty(&fi->write_files)); - BUG_ON(!list_empty(&fi->queued_writes)); - if (fi->forget_req) - fuse_request_free(fi->forget_req); + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fuse_inode_cachep, inode); } -void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, - u64 nodeid, u64 nlookup) +static void fuse_destroy_inode(struct inode *inode) { - struct fuse_forget_in *inarg = &req->misc.forget_in; - inarg->nlookup = nlookup; - req->in.h.opcode = FUSE_FORGET; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_forget_in); - req->in.args[0].value = inarg; - fuse_request_send_noreply(fc, req); + struct fuse_inode *fi = get_fuse_inode(inode); + BUG_ON(!list_empty(&fi->write_files)); + BUG_ON(!list_empty(&fi->queued_writes)); + kfree(fi->forget); + call_rcu(&inode->i_rcu, fuse_i_callback); } static void fuse_evict_inode(struct inode *inode) @@ -129,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode) if (inode->i_sb->s_flags & MS_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); - fi->forget_req = NULL; + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); + fi->forget = NULL; } } @@ -534,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc) INIT_LIST_HEAD(&fc->interrupts); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); + fc->forget_list_tail = &fc->forget_list_head; atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; @@ -618,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, goto out_iput; entry = d_obtain_alias(inode); - if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { - 
entry->d_op = &fuse_dentry_operations; + if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) fuse_invalidate_entry_cache(entry); - } return entry; @@ -720,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child) } parent = d_obtain_alias(inode); - if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { - parent->d_op = &fuse_dentry_operations; + if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) fuse_invalidate_entry_cache(parent); - } return parent; } @@ -990,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) iput(root); goto err_put_conn; } + /* only now - we want root dentry with NULL ->d_op */ + sb->s_d_op = &fuse_dentry_operations; init_req = fuse_request_alloc(); if (!init_req) diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 6bc9e3a5a69..06c48a89183 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -190,14 +190,20 @@ generic_acl_chmod(struct inode *inode) } int -generic_check_acl(struct inode *inode, int mask) +generic_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + } else { + struct posix_acl *acl; + + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } } return -EAGAIN; } diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 48171f4c943..7118f1a780a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) * Returns: errno */ -int gfs2_check_acl(struct inode *inode, int mask) +int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int error; + if (flags & 
IPERM_FLAG_RCU) + return -ECHILD; + acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b522b0cb39e..a93907c8159 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -16,7 +16,7 @@ #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES 25 -extern int gfs2_check_acl(struct inode *inode, int mask); +extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); extern const struct xattr_handler gfs2_xattr_system_handler; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5476c066d4e..3c4039d5eef 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, int metadata; unsigned int revokes = 0; int x; - int error; + int error = 0; if (!*top) sm->sm_first = 0; @@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (metadata) revokes = (height) ? 
sdp->sd_inptrs : sdp->sd_diptrs; - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + else if (!sdp->sd_rgrps) + error = gfs2_ri_update(ip); + if (error) return error; @@ -879,7 +883,8 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 6798755b385..4a456338b87 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -11,6 +11,7 @@ #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> +#include <linux/namei.h> #include <linux/crc32.h> #include "gfs2.h" @@ -34,15 +35,23 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) { - struct dentry *parent = dget_parent(dentry); - struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); - struct gfs2_inode *dip = GFS2_I(parent->d_inode); - struct inode *inode = dentry->d_inode; + struct dentry *parent; + struct gfs2_sbd *sdp; + struct gfs2_inode *dip; + struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error; int had_lock = 0; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(parent->d_inode); + dip = GFS2_I(parent->d_inode); + inode = dentry->d_inode; + if (inode) { if (is_bad_inode(inode)) goto invalid; @@ -100,13 +109,14 @@ fail: return 0; } -static int gfs2_dhash(struct dentry *dentry, struct qstr *str) +static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) { str->hash = gfs2_disk_hash(str->name, str->len); return 0; } -static int gfs2_dentry_delete(struct dentry *dentry) +static int gfs2_dentry_delete(const struct dentry *dentry) { struct gfs2_inode *ginode; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 5ab3839dfcb..9023db8184f 100644 
--- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -126,12 +126,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { - struct dentry *dentry; - - dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); - if (!IS_ERR(dentry)) - dentry->d_op = &gfs2_dops; - return dentry; + return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); } static struct dentry *gfs2_get_dentry(struct super_block *sb, @@ -139,7 +134,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, { struct gfs2_sbd *sdp = sb->s_fs_info; struct inode *inode; - struct dentry *dentry; inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { @@ -156,10 +150,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, return ERR_CAST(inode); out_inode: - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) - dentry->d_op = &gfs2_dops; - return dentry; + return d_obtain_alias(inode); } static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index aa996471ec5..7cfdcb91336 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -19,6 +19,8 @@ #include <linux/fs.h> #include <linux/gfs2_ondisk.h> #include <linux/ext2_fs.h> +#include <linux/falloc.h> +#include <linux/swap.h> #include <linux/crc32.h> #include <linux/writeback.h> #include <asm/uaccess.h> @@ -241,7 +243,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) !capable(CAP_LINUX_IMMUTABLE)) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(inode, MAY_WRITE); + error = gfs2_permission(inode, MAY_WRITE, 0); if (error) goto out; } @@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } +static void empty_write_end(struct page *page, unsigned from, + unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + + 
page_zero_new_buffers(page, from, to); + flush_dcache_page(page); + mark_page_accessed(page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); +} + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + unsigned start, end, next; + struct buffer_head *bh, *head; + int error; + + if (!page_has_buffers(page)) { + error = __block_write_begin(page, from, to - from, gfs2_block_map); + if (unlikely(error)) + return error; + + empty_write_end(page, from, to); + return 0; + } + + bh = head = page_buffers(page); + next = end = 0; + while (next < from) { + next += bh->b_size; + bh = bh->b_this_page; + } + start = next; + do { + next += bh->b_size; + if (buffer_mapped(bh)) { + if (end) { + error = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + end = 0; + } + start = next; + } + else + end = next; + bh = bh->b_this_page; + } while (next < to); + + if (end) { + error = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << 
PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support 
the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << + sdp->sd_sb.sb_bsize_shift; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? 
data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops = { @@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = generic_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops_nolock = { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f92c1770416..08a8beb152e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -541,21 +541,6 @@ out_locked: spin_unlock(&gl->gl_spin); } -static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int req_state, - unsigned int flags) -{ - int ret = LM_OUT_ERROR; - - if (!sdp->sd_lockstruct.ls_ops->lm_lock) - return req_state == LM_ST_UNLOCKED ? 
0 : req_state; - - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, - req_state, flags); - return ret; -} - /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state @@ -575,13 +560,14 @@ __acquires(&gl->gl_spin) lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); - BUG_ON(gl->gl_state == target); - BUG_ON(gl->gl_state == gl->gl_target); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); do_error(gl, 0); /* Fail queued try locks */ } + gl->gl_req = target; spin_unlock(&gl->gl_spin); if (glops->go_xmote_th) glops->go_xmote_th(gl); @@ -594,15 +580,17 @@ __acquires(&gl->gl_spin) gl->gl_state == LM_ST_DEFERRED) && !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) lck_flags |= LM_FLAG_TRY_1CB; - ret = gfs2_lm_lock(sdp, gl, target, lck_flags); - if (!(ret & LM_OUT_ASYNC)) { - finish_xmote(gl, ret); + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + GLOCK_BUG_ON(gl, ret); + } else { /* lock_nolock */ + finish_xmote(gl, target); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); - } else { - GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC); } + spin_lock(&gl->gl_spin); } @@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); + if (seq) { struct gfs2_glock_iter *gi = seq->private; vsprintf(gi->string, fmt, args); seq_printf(seq, gi->string); } else { - printk(KERN_ERR " "); - vprintk(fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR " %pV", &vaf); } + va_end(args); } @@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl) * @gl: Pointer to the glock * @ret: The return value from the dlm * + * The gl_reply field is under the gl_spin lock so that it is ok + * to use a bitfield shared with other glock state fields. */ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + spin_lock(&gl->gl_spin); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { - spin_lock(&gl->gl_spin); if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); return; } - spin_unlock(&gl->gl_spin); } + + spin_unlock(&gl->gl_spin); set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + smp_wmb(); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); @@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; - char buffer[KSYM_SYMBOL_LEN]; char flags_buf[32]; - sprint_symbol(buffer, gh->gh_ip); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); - gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", - state2str(gh->gh_state), - hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), - gh->gh_error, - gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, - gh_owner ? gh_owner->comm : "(ended)", buffer); + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? 
(long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? gh_owner->comm : "(ended)", + (void *)gh->gh_ip); return 0; } @@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void) } #endif - glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZEABLE, 0); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); - gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | - WQ_FREEZEABLE, 0); + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", + WQ_MEM_RECLAIM | WQ_FREEZEABLE, + 0); if (IS_ERR(gfs2_delete_workqueue)) { destroy_workqueue(glock_workqueue); return PTR_ERR(gfs2_delete_workqueue); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index db1c26d6d22..691851ceb61 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -87,11 +87,10 @@ enum { #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 -#define GL_ATIME 0x00000200 #define GL_NOCACHE 0x00000400 /* - * lm_lock() and lm_async_cb return flags + * lm_async_cb return flags * * LM_OUT_ST_MASK * Masks the lower two bits of lock state in the returned value. @@ -99,15 +98,11 @@ enum { * LM_OUT_CANCELED * The lock request was canceled. * - * LM_OUT_ASYNC - * The result of the request will be returned in an LM_CB_ASYNC callback. 
- * */ #define LM_OUT_ST_MASK 0x00000003 #define LM_OUT_CANCELED 0x00000008 -#define LM_OUT_ASYNC 0x00000080 -#define LM_OUT_ERROR 0x00000100 +#define LM_OUT_ERROR 0x00000004 /* * lm_recovery_done() messages @@ -124,25 +119,12 @@ struct lm_lockops { void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); - unsigned int (*lm_lock) (struct gfs2_glock *gl, - unsigned int req_state, unsigned int flags); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); const match_table_t *lm_tokens; }; -#define LM_FLAG_TRY 0x00000001 -#define LM_FLAG_TRY_1CB 0x00000002 -#define LM_FLAG_NOEXP 0x00000004 -#define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 - -#define GL_ASYNC 0x00000040 -#define GL_EXACT 0x00000080 -#define GL_SKIP 0x00000100 -#define GL_NOCACHE 0x00000400 - -#define GLR_TRYFAILED 13 - extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { @@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); + +__attribute__ ((format(printf, 2, 3))) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0d149dcc04e..263561bf1a5 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - flush_workqueue(gfs2_delete_workqueue); gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 764fbb49efc..a79790c0627 100644 --- 
a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -11,6 +11,7 @@ #define __INCORE_DOT_H__ #include <linux/fs.h> +#include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/dlm.h> #include <linux/buffer_head.h> @@ -207,12 +208,14 @@ struct gfs2_glock { spinlock_t gl_spin; - unsigned int gl_state; - unsigned int gl_target; - unsigned int gl_reply; + /* State fields protected by gl_spin */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ + unsigned int gl_hash; - unsigned int gl_req; - unsigned int gl_demote_state; /* state requested by remote node */ unsigned long gl_demote_time; /* time of first demote request */ struct list_head gl_holders; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1213f7f921..7aa7d4f8984 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -74,16 +74,14 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) } /** - * GFS2 lookup code fills in vfs inode contents based on info obtained - * from directory entry inside gfs2_inode_lookup(). This has caused issues - * with NFS code path since its get_dentry routine doesn't have the relevant - * directory entry when gfs2_inode_lookup() is invoked. Part of the code - * segment inside gfs2_inode_lookup code needs to get moved around. + * gfs2_set_iop - Sets inode operations + * @inode: The inode with correct i_mode filled in * - * Clears I_NEW as well. - **/ + * GFS2 lookup code fills in vfs inode contents based on info obtained + * from directory entry inside gfs2_inode_lookup(). 
+ */ -void gfs2_set_iop(struct inode *inode) +static void gfs2_set_iop(struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(inode); umode_t mode = inode->i_mode; @@ -106,8 +104,6 @@ void gfs2_set_iop(struct inode *inode) inode->i_op = &gfs2_file_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); } - - unlock_new_inode(inode); } /** @@ -119,10 +115,8 @@ void gfs2_set_iop(struct inode *inode) * Returns: A VFS inode, or an error */ -struct inode *gfs2_inode_lookup(struct super_block *sb, - unsigned int type, - u64 no_addr, - u64 no_formal_ino) +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, + u64 no_addr, u64 no_formal_ino) { struct inode *inode; struct gfs2_inode *ip; @@ -152,51 +146,37 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_iopen; - ip->i_iopen_gh.gh_gl->gl_object = ip; + ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); io_gl = NULL; - if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) - goto gfs2_nfsbypass; - - inode->i_mode = DT2IF(type); - - /* - * We must read the inode in order to work out its type in - * this case. Note that this doesn't happen often as we normally - * know the type beforehand. This code path only occurs during - * unlinked inode recovery (where it is safe to do this glock, - * which is not true in the general case). 
- */ if (type == DT_UNKNOWN) { - struct gfs2_holder gh; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (unlikely(error)) - goto fail_glock; - /* Inode is now uptodate */ - gfs2_glock_dq_uninit(&gh); + /* Inode glock must be locked already */ + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_refresh; + } else { + inode->i_mode = DT2IF(type); } gfs2_set_iop(inode); + unlock_new_inode(inode); } -gfs2_nfsbypass: return inode; -fail_glock: - gfs2_glock_dq(&ip->i_iopen_gh); + +fail_refresh: + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_iopen: if (io_gl) gfs2_glock_put(io_gl); fail_put: - if (inode->i_state & I_NEW) - ip->i_gl->gl_object = NULL; + ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); fail: - if (inode->i_state & I_NEW) - iget_failed(inode); - else - iput(inode); + iget_failed(inode); return ERR_PTR(error); } @@ -221,14 +201,6 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, if (IS_ERR(inode)) goto fail; - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) - goto fail_iput; - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - /* Two extra checks for NFS only */ if (no_formal_ino) { error = -ESTALE; @@ -509,7 +481,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = gfs2_permission(dir, MAY_EXEC); + error = gfs2_permission(dir, MAY_EXEC, 0); if (error) goto out; } @@ -539,7 +511,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); if (error) return error; @@ -916,17 +888,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) if (error) return error; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = 
vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - setattr_copy(inode, attr); mark_inode_dirty(inode); - - gfs2_assert_warn(GFS2_SB(inode), !error); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index d8499fadcc5..3e00a66e7cb 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -96,7 +96,6 @@ err: return -EIO; } -extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, @@ -113,7 +112,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 1c09425b45f..6e493aee28f 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, return lkf; } -static unsigned int gdlm_lock(struct gfs2_glock *gl, - unsigned int req_state, unsigned int flags) +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; - int error; int req; u32 lkf; - gl->gl_req = req_state; req = make_mode(req_state); lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); @@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl, * Submit the actual lock request. 
*/ - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); - if (error == -EAGAIN) - return 0; - if (error) - return LM_OUT_ERROR; - return LM_OUT_ASYNC; + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); } static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3eb1393f7b8..777927ce6f7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -440,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, iput(inode); return -ENOMEM; } - dentry->d_op = &gfs2_dops; *dptr = dentry; return 0; } @@ -1106,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; sb->s_export_op = &gfs2_export_ops; sb->s_xattr = gfs2_xattr_handlers; sb->s_qcop = &gfs2_quotactl_ops; @@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error; struct gfs2_args args; struct gfs2_sbd *sdp; @@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, goto error_bdev; if (s->s_root) - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; @@ -1342,7 +1342,7 @@ error_super: deactivate_locked_super(s); return ERR_PTR(error); error_bdev: - 
close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); return ERR_PTR(error); } diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 12cbea7502c..d8b26ac2e20 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,8 +18,6 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/fiemap.h> -#include <linux/swap.h> -#include <linux/falloc.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -106,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, { struct inode *inode = NULL; - dentry->d_op = &gfs2_dops; - inode = gfs2_lookupi(dir, &dentry->d_name, 0); if (inode && IS_ERR(inode)) return ERR_CAST(inode); @@ -166,7 +162,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_child; - error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0); if (error) goto out_gunlock; @@ -289,7 +285,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); if (error) return error; @@ -822,7 +818,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0); if (error) goto out_gunlock; @@ -857,7 +853,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE); + error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0); if (error) goto out_gunlock; } @@ -1041,13 +1037,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) * Returns: errno */ -int gfs2_permission(struct inode *inode, int mask) +int gfs2_permission(struct inode *inode, int mask, unsigned 
int flags) { - struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inode *ip; struct gfs2_holder i_gh; int error; int unlock = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) @@ -1058,7 +1058,7 @@ int gfs2_permission(struct inode *inode, int mask) if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EACCES; else - error = generic_permission(inode, mask, gfs2_check_acl); + error = generic_permission(inode, mask, flags, gfs2_check_acl); if (unlock) gfs2_glock_dq_uninit(&i_gh); @@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh; u32 ouid, ogid, nuid, ngid; int error; @@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_gunlock_q; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_setattr_simple(ip, attr); if (error) goto out_end_trans; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = vmtruncate(inode, attr->ia_size); - gfs2_assert_warn(sdp, !error); - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); gfs2_quota_change(ip, -blocks, ouid, ogid); @@ -1271,257 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return ret; } -static void empty_write_end(struct page *page, unsigned from, - unsigned to) -{ - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - - page_zero_new_buffers(page, from, to); - flush_dcache_page(page); - mark_page_accessed(page); - - if (!gfs2_is_writeback(ip)) - 
gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); -} - - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) -{ - unsigned start, end, next; - struct buffer_head *bh, *head; - int error; - - if (!page_has_buffers(page)) { - error = __block_write_begin(page, from, to - from, gfs2_block_map); - if (unlikely(error)) - return error; - - empty_write_end(page, from, to); - return 0; - } - - bh = head = page_buffers(page); - next = end = 0; - while (next < from) { - next += bh->b_size; - bh = bh->b_this_page; - } - start = next; - do { - next += bh->b_size; - if (buffer_mapped(bh)) { - if (end) { - error = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - end = 0; - } - start = next; - } - else - end = next; - bh = bh->b_this_page; - } while (next < to); - - if (end) { - error = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - } - - return 0; -} - -static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, - int mode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *dibh; - int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(error)) - goto out; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (unlikely(error)) - goto out; - } - - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = 
grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } - - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) - goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; - } - - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - - brelse(dibh); - -out: - return error; -} - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) -{ - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; - unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); - - for (tmp = max_data; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - max_data -= tmp; - } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; - *data_blocks = max_data; - *ind_blocks = max_blocks - max_data; - *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; - if (*len > max) { - *len = max; - gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); - } -} - -static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, - loff_t len) -{ - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *ip = GFS2_I(inode); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; - struct gfs2_alloc *al; - int error; - loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; - next = (next + 1) << sdp->sd_sb.sb_bsize_shift; - - offset = (offset >> sdp->sd_sb.sb_bsize_shift) << - sdp->sd_sb.sb_bsize_shift; - - len = next - offset; - bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; - if 
(!bytes) - bytes = UINT_MAX; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - - if (!gfs2_write_alloc_required(ip, offset, len)) - goto out_unlock; - - while (len > 0) { - if (len < bytes) - bytes = len; - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_unlock; - } - - error = gfs2_quota_lock_check(ip); - if (error) - goto out_alloc_put; - -retry: - gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); - if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - goto retry; - } - goto out_qunlock; - } - max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - - rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? 
data_blocks : 1; - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = fallocate_chunk(inode, offset, max_bytes, mode); - gfs2_trans_end(sdp); - - if (error) - goto out_trans_fail; - - len -= max_bytes; - offset += max_bytes; - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - goto out_unlock; - -out_trans_fail: - gfs2_inplace_release(ip); -out_qunlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1572,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, - .fallocate = gfs2_fallocate, .fiemap = gfs2_fiemap, }; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f606baf9ba7..a689901963d 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_limit = qp->qu_limit; } + if (fdq->d_fieldmask & FS_DQ_BCOUNT) { + qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = qp->qu_value; + } } /* Write the quota into the quota file on disk */ @@ -1509,7 +1513,7 @@ out: } /* GFS2 only supports a subset of the XFS fields */ -#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) +#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, struct fs_disk_quota *fdq) @@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, if ((fdq->d_fieldmask & FS_DQ_BSOFT) && ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) 
fdq->d_fieldmask ^= FS_DQ_BSOFT; + if ((fdq->d_fieldmask & FS_DQ_BHARD) && ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= FS_DQ_BHARD; + + if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && + ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= FS_DQ_BCOUNT; + if (fdq->d_fieldmask == 0) goto out_i; @@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; - diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 33c8407b876..7293ea27020 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); @@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, * Returns: 0 on successful update, error code otherwise */ -static int gfs2_ri_update(struct gfs2_inode *ip) +int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; @@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip) } /** - * gfs2_ri_update_special - Pull in a new resource index from the disk - * - * This is a special version that's safe to call from gfs2_inplace_reserve_i. - * In this case we know that we don't have any resource groups in memory yet. 
- * - * @ip: pointer to the rindex inode - * - * Returns: 0 on successful update, error code otherwise - */ -static int gfs2_ri_update_special(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct inode *inode = &ip->i_inode; - struct file_ra_state ra_state; - struct gfs2_rgrpd *rgd; - unsigned int max_data = 0; - int error; - - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { - /* Ignore partials */ - if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - i_size_read(inode)) - break; - error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } - list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) - if (rgd->rd_data > max_data) - max_data = rgd->rd_data; - sdp->sd_max_rg_data = max_data; - - sdp->sd_rindex_uptodate = 1; - return 0; -} - -/** * gfs2_rindex_hold - Grab a lock on the rindex * @sdp: The GFS2 superblock * @ri_gh: the glock holder @@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, error = gfs2_rindex_hold(sdp, &al->al_ri_gh); else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ - error = gfs2_ri_update_special(ip); + error = gfs2_ri_update(ip); if (error) return error; } +try_again: do { error = get_local_rgrp(ip, &last_unlinked); /* If there is no space, flushing the log may release some */ - if (error) + if (error) { + if (ip == GFS2_I(sdp->sd_rindex) && + !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; + goto try_again; + } gfs2_log_flush(sdp, NULL); + } } while (error && tries++ < 3); if (error) { diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 0e35c0466f9..50c2bb04369 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, extern void gfs2_inplace_release(struct gfs2_inode *ip); +extern int gfs2_ri_update(struct gfs2_inode *ip); extern int 
gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2b2c4997430..ec73ed70bae 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode) if (error) goto out_truncate; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); error = gfs2_glock_nq(&ip->i_iopen_gh); @@ -1405,11 +1406,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) return &ip->i_inode; } -static void gfs2_destroy_inode(struct inode *inode) +static void gfs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(gfs2_inode_cachep, inode); } +static void gfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, gfs2_i_callback); +} + const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .destroy_inode = gfs2_destroy_inode, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 30b58f07c8a..439b61c0326 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1296,10 +1296,8 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_location el; - struct buffer_head *dibh; int error; error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); @@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) return error; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_trans_end; - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = vmtruncate(inode, attr->ia_size); - 
gfs2_assert_warn(GFS2_SB(inode), !error); - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - -out_trans_end: + error = gfs2_setattr_simple(ip, attr); gfs2_trans_end(sdp); return error; } diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 2b3b8611b41..afa66aaa223 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; int res; - dentry->d_op = &hfs_dentry_operations; - hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index c8cffb81e84..ad97c2d5828 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -213,10 +213,14 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; -extern int hfs_hash_dentry(struct dentry *, struct qstr *); +extern int hfs_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); -extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); +extern int hfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); /* trans.c */ extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); diff --git a/fs/hfs/string.c b/fs/hfs/string.c index 927a5af7942..495a976a3cc 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -51,7 +51,8 @@ static unsigned char caseorder[256] = { /* * Hash a string to an integer in a case-independent way */ -int hfs_hash_dentry(struct dentry *dentry, struct qstr *this) 
+int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) { const unsigned char *name = this->name; unsigned int hash, len = this->len; @@ -92,21 +93,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1, * Test for equality of two strings in the HFS filename character ordering. * return 1 on failure and 0 on success */ -int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) +int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { const unsigned char *n1, *n2; - int len; - len = s1->len; if (len >= HFS_NAMELEN) { - if (s2->len < HFS_NAMELEN) + if (name->len < HFS_NAMELEN) return 1; len = HFS_NAMELEN; - } else if (len != s2->len) + } else if (len != name->len) return 1; - n1 = s1->name; - n2 = s2->name; + n1 = str; + n2 = name->name; while (len--) { if (caseorder[*n1++] != caseorder[*n2++]) return 1; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4824c27cebb..1b55f704fb2 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -167,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) return i ? 
&i->vfs_inode : NULL; } -static void hfs_destroy_inode(struct inode *inode) +static void hfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } +static void hfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfs_i_callback); +} + static const struct super_operations hfs_super_operations = { .alloc_inode = hfs_alloc_inode, .destroy_inode = hfs_destroy_inode, @@ -422,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) if (!root_inode) goto bail_no_root; + sb->s_d_op = &hfs_dentry_operations; res = -ENOMEM; sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) goto bail_iput; - sb->s_root->d_op = &hfs_dentry_operations; - /* everything's okay */ return 0; diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 7478f5c219a..19cf291eb91 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -8,15 +8,20 @@ * This file contains the code to do various system dependent things. 
*/ +#include <linux/namei.h> #include "hfs_fs.h" /* dentry case-handling: just lowercase everything */ static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode; int diff; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; if(!inode) return 1; diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index d182438c7ae..5d799c13205 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); + dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", + tree->cnid, __builtin_return_address(0)); mutex_lock(&tree->tree_lock); return 0; } @@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); + dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", + fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index ad57f5991eb..1cad80c789c 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -15,7 +15,8 @@ #define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8) -int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) +int hfsplus_block_allocate(struct super_block *sb, u32 size, + u32 offset, u32 *max) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct page *page; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 29da6574ba7..1c42cc5b899 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -42,7 +42,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) { __be16 data; - // 
optimize later... + /* TODO: optimize later... */ hfs_bnode_read(node, &data, off, 2); return be16_to_cpu(data); } @@ -50,7 +50,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) { u8 data; - // optimize later... + /* TODO: optimize later... */ hfs_bnode_read(node, &data, off, 1); return data; } @@ -96,7 +96,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) { __be16 v = cpu_to_be16(data); - // optimize later... + /* TODO: optimize later... */ hfs_bnode_write(node, &v, off, 2); } @@ -212,7 +212,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) dst_page--; } src -= len; - memmove(kmap(*dst_page) + src, kmap(*src_page) + src, len); + memmove(kmap(*dst_page) + src, + kmap(*src_page) + src, len); kunmap(*src_page); set_page_dirty(*dst_page); kunmap(*dst_page); @@ -250,14 +251,16 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { l = min(len, (int)PAGE_CACHE_SIZE - src); - memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l); + memmove(kmap(*dst_page) + src, + kmap(*src_page) + src, l); kunmap(*src_page); set_page_dirty(*dst_page); kunmap(*dst_page); while ((len -= l) != 0) { l = min(len, (int)PAGE_CACHE_SIZE); - memmove(kmap(*++dst_page), kmap(*++src_page), l); + memmove(kmap(*++dst_page), + kmap(*++src_page), l); kunmap(*src_page); set_page_dirty(*dst_page); kunmap(*dst_page); @@ -268,7 +271,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) do { src_ptr = kmap(*src_page) + src; dst_ptr = kmap(*dst_page) + dst; - if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) { + if (PAGE_CACHE_SIZE - src < + PAGE_CACHE_SIZE - dst) { l = PAGE_CACHE_SIZE - src; src = 0; dst += l; @@ -340,7 +344,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node) return; tmp->next = node->next; cnid = cpu_to_be32(tmp->next); - hfs_bnode_write(tmp, 
&cnid, offsetof(struct hfs_bnode_desc, next), 4); + hfs_bnode_write(tmp, &cnid, + offsetof(struct hfs_bnode_desc, next), 4); hfs_bnode_put(tmp); } else if (node->type == HFS_NODE_LEAF) tree->leaf_head = node->next; @@ -351,15 +356,15 @@ void hfs_bnode_unlink(struct hfs_bnode *node) return; tmp->prev = node->prev; cnid = cpu_to_be32(tmp->prev); - hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4); + hfs_bnode_write(tmp, &cnid, + offsetof(struct hfs_bnode_desc, prev), 4); hfs_bnode_put(tmp); } else if (node->type == HFS_NODE_LEAF) tree->leaf_tail = node->prev; - // move down? - if (!node->prev && !node->next) { - printk(KERN_DEBUG "hfs_btree_del_level\n"); - } + /* move down? */ + if (!node->prev && !node->next) + dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -379,16 +384,16 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + printk(KERN_ERR "hfs: request for non-existent node " + "%d in B*Tree\n", + cnid); return NULL; } for (node = tree->node_hash[hfs_bnode_hash(cnid)]; - node; node = node->next_hash) { - if (node->this == cnid) { + node; node = node->next_hash) + if (node->this == cnid) return node; - } - } return NULL; } @@ -402,7 +407,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + printk(KERN_ERR "hfs: request for non-existent node " + "%d in B*Tree\n", + cnid); return NULL; } @@ -429,7 +436,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) } else { spin_unlock(&tree->hash_lock); kfree(node); - wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); + wait_event(node2->lock_wq, + !test_bit(HFS_BNODE_NEW, &node2->flags)); return 
node2; } spin_unlock(&tree->hash_lock); @@ -483,7 +491,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (node) { hfs_bnode_get(node); spin_unlock(&tree->hash_lock); - wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags)); + wait_event(node->lock_wq, + !test_bit(HFS_BNODE_NEW, &node->flags)); if (test_bit(HFS_BNODE_ERROR, &node->flags)) goto node_error; return node; @@ -497,7 +506,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); @@ -556,11 +566,13 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { - //int i; +#if 0 + int i; - //for (i = 0; i < node->tree->pages_per_bnode; i++) - // if (node->page[i]) - // page_cache_release(node->page[i]); + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); +#endif kfree(node); } @@ -607,7 +619,8 @@ void hfs_bnode_get(struct hfs_bnode *node) if (node) { atomic_inc(&node->refcnt); dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); } } @@ -619,7 +632,8 @@ void hfs_bnode_put(struct hfs_bnode *node) int i; dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 2f39d05443e..2312de34bd4 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -39,7 +39,8 @@ u16 hfs_brec_keylen(struct hfs_bnode 
*node, u16 rec) !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { retval = node->tree->max_key_len + 2; } else { - recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); + recoff = hfs_bnode_read_u16(node, + node->tree->node_size - (rec + 1) * 2); if (!recoff) return 0; @@ -84,7 +85,8 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); + dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) panic("not enough room!\n"); @@ -99,7 +101,9 @@ again: } node->num_recs++; /* write new last offset */ - hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); + hfs_bnode_write_u16(node, + offsetof(struct hfs_bnode_desc, num_recs), + node->num_recs); hfs_bnode_write_u16(node, end_rec_off, end_off + size); data_off = end_off; data_rec_off = end_rec_off + 2; @@ -151,7 +155,8 @@ skip: if (tree->attributes & HFS_TREE_VARIDXKEYS) key_len = be16_to_cpu(fd->search_key->key_len) + 2; else { - fd->search_key->key_len = cpu_to_be16(tree->max_key_len); + fd->search_key->key_len = + cpu_to_be16(tree->max_key_len); key_len = tree->max_key_len + 2; } goto again; @@ -180,7 +185,8 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); + dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", + fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); if (!node->parent) @@ -194,7 +200,9 @@ again: __hfs_brec_find(node, fd); goto again; } - hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); + hfs_bnode_write_u16(node, + offsetof(struct hfs_bnode_desc, num_recs), + node->num_recs); if (rec_off == end_off) goto skip; @@ -364,7 
+372,8 @@ again: newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); + dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", + rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; @@ -375,7 +384,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - printk(KERN_DEBUG "hfs: splitting index node...\n"); + dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) @@ -383,7 +392,8 @@ again: parent = fd->bnode; rec = fd->record; rec_off = tree->node_size - (rec + 2) * 2; - end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; + end_rec_off = tree->node_size - + (parent->num_recs + 1) * 2; } } diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 22e4d4e3299..21023d9f8ff 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -51,7 +51,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap(page) + + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); @@ -115,7 +116,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) tree->node_size_shift = ffs(size) - 1; - tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + tree->pages_per_bnode = + (tree->node_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; kunmap(page); page_cache_release(page); @@ -144,8 +147,10 @@ void hfs_btree_close(struct hfs_btree *tree) while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if 
(atomic_read(&node->refcnt)) - printk(KERN_CRIT "hfs: node %d:%d still has %d user(s)!\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + printk(KERN_CRIT "hfs: node %d:%d " + "still has %d user(s)!\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); hfs_bnode_free(node); tree->node_hash_cnt--; } @@ -166,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree) return; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap(page) + + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); @@ -272,7 +278,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); - return hfs_bnode_create(tree, idx); + return hfs_bnode_create(tree, + idx); } } } @@ -287,7 +294,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap(*pagep); nidx = node->next; if (!nidx) { - printk(KERN_DEBUG "hfs: create new bmap node...\n"); + dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -329,7 +336,9 @@ void hfs_bmap_free(struct hfs_bnode *node) hfs_bnode_put(node); if (!i) { /* panic */; - printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); + printk(KERN_CRIT "hfs: unable to free bnode %u. " + "bmap not found!\n", + node->this); return; } node = hfs_bnode_find(tree, i); @@ -337,7 +346,9 @@ void hfs_bmap_free(struct hfs_bnode *node) return; if (node->type != HFS_NODE_MAP) { /* panic */; - printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type); + printk(KERN_CRIT "hfs: invalid bmap found! 
" + "(%u,%d)\n", + node->this, node->type); hfs_bnode_put(node); return; } @@ -350,7 +361,9 @@ void hfs_bmap_free(struct hfs_bnode *node) m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { - printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); + printk(KERN_CRIT "hfs: trying to free free bnode " + "%u(%d)\n", + node->this, node->type); kunmap(page); hfs_bnode_put(node); return; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 8af45fc5b05..b4ba1b31933 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -91,7 +91,8 @@ void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms) perms->dev = 0; } -static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) +static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, + u32 cnid, struct inode *inode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); @@ -128,20 +129,32 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i if (cnid == inode->i_ino) { hfsplus_cat_set_perms(inode, &file->permissions); if (S_ISLNK(inode->i_mode)) { - file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); - file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); + file->user_info.fdType = + cpu_to_be32(HFSP_SYMLINK_TYPE); + file->user_info.fdCreator = + cpu_to_be32(HFSP_SYMLINK_CREATOR); } else { - file->user_info.fdType = cpu_to_be32(sbi->type); - file->user_info.fdCreator = cpu_to_be32(sbi->creator); + file->user_info.fdType = + cpu_to_be32(sbi->type); + file->user_info.fdCreator = + cpu_to_be32(sbi->creator); } - if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) - file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); + if (HFSPLUS_FLG_IMMUTABLE & + (file->permissions.rootflags | + file->permissions.userflags)) + file->flags |= + cpu_to_be16(HFSPLUS_FILE_LOCKED); } else { - file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); - 
file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); - file->user_info.fdFlags = cpu_to_be16(0x100); - file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date; - file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid); + file->user_info.fdType = + cpu_to_be32(HFSP_HARDLINK_TYPE); + file->user_info.fdCreator = + cpu_to_be32(HFSP_HFSPLUS_CREATOR); + file->user_info.fdFlags = + cpu_to_be16(0x100); + file->create_date = + HFSPLUS_I(sbi->hidden_dir)->create_date; + file->permissions.dev = + cpu_to_be32(HFSPLUS_I(inode)->linkid); } return sizeof(*file); } @@ -182,12 +195,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, return -EIO; } - hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), - &tmp.thread.nodeName); + hfsplus_cat_build_key_uni(fd->search_key, + be32_to_cpu(tmp.thread.parentID), + &tmp.thread.nodeName); return hfs_brec_find(fd); } -int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) +int hfsplus_create_cat(u32 cnid, struct inode *dir, + struct qstr *str, struct inode *inode) { struct super_block *sb = dir->i_sb; struct hfs_find_data fd; @@ -195,13 +210,15 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino int entry_size; int err; - dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); + dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", + str->name, cnid, inode->i_nlink); hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); - entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? + entry_size = hfsplus_fill_cat_thread(sb, &entry, + S_ISDIR(inode->i_mode) ? 
HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, - dir->i_ino, str); + dir->i_ino, str); err = hfs_brec_find(&fd); if (err != -ENOENT) { if (!err) @@ -227,7 +244,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino dir->i_size++; dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(dir); + hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); + hfs_find_exit(&fd); return 0; @@ -249,7 +267,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) int err, off; u16 type; - dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", + str ? str->name : NULL, cnid); hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (!str) { @@ -260,11 +279,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) if (err) goto out; - off = fd.entryoffset + offsetof(struct hfsplus_cat_thread, nodeName); + off = fd.entryoffset + + offsetof(struct hfsplus_cat_thread, nodeName); fd.search_key->cat.parent = cpu_to_be32(dir->i_ino); - hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.length, off, 2); + hfs_bnode_read(fd.bnode, + &fd.search_key->cat.name.length, off, 2); len = be16_to_cpu(fd.search_key->cat.name.length) * 2; - hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.unicode, off + 2, len); + hfs_bnode_read(fd.bnode, + &fd.search_key->cat.name.unicode, + off + 2, len); fd.search_key->key_len = cpu_to_be16(6 + len); } else hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); @@ -281,7 +304,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA); #endif - off = fd.entryoffset + offsetof(struct hfsplus_cat_file, rsrc_fork); + off = fd.entryoffset + + offsetof(struct hfsplus_cat_file, rsrc_fork); hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); } @@ -308,7 +332,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, 
struct qstr *str) dir->i_size--; dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(dir); + hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); out: hfs_find_exit(&fd); @@ -325,7 +349,8 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err = 0; - dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, + dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); dst_fd = src_fd; @@ -353,7 +378,6 @@ int hfsplus_rename_cat(u32 cnid, goto out; dst_dir->i_size++; dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(dst_dir); /* finally remove the old entry */ hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); @@ -365,7 +389,6 @@ int hfsplus_rename_cat(u32 cnid, goto out; src_dir->i_size--; src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(src_dir); /* remove old thread entry */ hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); @@ -379,7 +402,8 @@ int hfsplus_rename_cat(u32 cnid, /* create new thread entry */ hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); - entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); + entry_size = hfsplus_fill_cat_thread(sb, &entry, type, + dst_dir->i_ino, dst_name); err = hfs_brec_find(&dst_fd); if (err != -ENOENT) { if (!err) @@ -387,6 +411,9 @@ int hfsplus_rename_cat(u32 cnid, goto out; } err = hfs_brec_insert(&dst_fd, &entry, entry_size); + + hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY); + hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY); out: hfs_bnode_put(dst_fd.bnode); hfs_find_exit(&src_fd); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 9d59c0571f5..4df5059c25d 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -37,7 +37,6 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, sb 
= dir->i_sb; - dentry->d_op = &hfsplus_dentry_operations; dentry->d_fsdata = NULL; hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); @@ -66,11 +65,17 @@ again: goto fail; } cnid = be32_to_cpu(entry.file.id); - if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && - entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && - (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date || - entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) && - HFSPLUS_SB(sb)->hidden_dir) { + if (entry.file.user_info.fdType == + cpu_to_be32(HFSP_HARDLINK_TYPE) && + entry.file.user_info.fdCreator == + cpu_to_be32(HFSP_HFSPLUS_CREATOR) && + (entry.file.create_date == + HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> + create_date || + entry.file.create_date == + HFSPLUS_I(sb->s_root->d_inode)-> + create_date) && + HFSPLUS_SB(sb)->hidden_dir) { struct qstr str; char name[32]; @@ -83,11 +88,13 @@ again: linkid = 0; } else { dentry->d_fsdata = (void *)(unsigned long)cnid; - linkid = be32_to_cpu(entry.file.permissions.dev); + linkid = + be32_to_cpu(entry.file.permissions.dev); str.len = sprintf(name, "iNode%d", linkid); str.name = name; hfsplus_cat_build_key(sb, fd.search_key, - HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); + HFSPLUS_SB(sb)->hidden_dir->i_ino, + &str); goto again; } } else if (!dentry->d_fsdata) @@ -139,7 +146,8 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) filp->f_pos++; /* fall through */ case 1: - hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + fd.entrylength); if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { printk(KERN_ERR "hfs: bad catalog folder thread\n"); err = -EIO; @@ -169,14 +177,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - hfs_bnode_read(fd.bnode, &entry, 
fd.entryoffset, fd.entrylength); + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, + fd.entrylength); type = be16_to_cpu(entry.type); len = HFSPLUS_MAX_STRLEN; err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); if (err) goto out; if (type == HFSPLUS_FOLDER) { - if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { + if (fd.entrylength < + sizeof(struct hfsplus_cat_folder)) { printk(KERN_ERR "hfs: small dir entry\n"); err = -EIO; goto out; @@ -202,7 +212,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - next: +next: filp->f_pos++; if (filp->f_pos >= inode->i_size) goto out; @@ -273,7 +283,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, HFSPLUS_I(inode)->linkid = id; cnid = sbi->next_cnid++; src_dentry->d_fsdata = (void *)(unsigned long)cnid; - res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); + res = hfsplus_create_cat(cnid, src_dir, + &src_dentry->d_name, inode); if (res) /* panic? 
*/ goto out; @@ -485,6 +496,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { }; const struct file_operations hfsplus_dir_operations = { + .fsync = hfsplus_file_fsync, .read = generic_read_dir, .readdir = hfsplus_readdir, .unlocked_ioctl = hfsplus_ioctl, diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 0c9cb1820a5..52a0bcaa7b6 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -83,7 +83,8 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); } -static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) +static void __hfsplus_ext_write_extent(struct inode *inode, + struct hfs_find_data *fd) { struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res; @@ -95,24 +96,32 @@ static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); res = hfs_brec_find(fd); - if (hip->flags & HFSPLUS_FLG_EXT_NEW) { + if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) return; hfs_brec_insert(fd, hip->cached_extents, sizeof(hfsplus_extent_rec)); - hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); } else { if (res) return; hfs_bnode_write(fd->bnode, hip->cached_extents, fd->entryoffset, fd->entrylength); - hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY; + hip->extent_state &= ~HFSPLUS_EXT_DIRTY; } + + /* + * We can't just use hfsplus_mark_inode_dirty here, because we + * also get called from hfsplus_write_inode, which should not + * redirty the inode. Instead the callers have to be careful + * to explicily mark the inode dirty, too. 
+ */ + set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); } static void hfsplus_ext_write_extent_locked(struct inode *inode) { - if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) { + if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { struct hfs_find_data fd; hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); @@ -144,18 +153,20 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, return -ENOENT; if (fd->entrylength != sizeof(hfsplus_extent_rec)) return -EIO; - hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfsplus_extent_rec)); + hfs_bnode_read(fd->bnode, extent, fd->entryoffset, + sizeof(hfsplus_extent_rec)); return 0; } -static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) +static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, + struct inode *inode, u32 block) { struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res; WARN_ON(!mutex_is_locked(&hip->extents_lock)); - if (hip->flags & HFSPLUS_FLG_EXT_DIRTY) + if (hip->extent_state & HFSPLUS_EXT_DIRTY) __hfsplus_ext_write_extent(inode, fd); res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, @@ -164,10 +175,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct in HFSPLUS_TYPE_DATA); if (!res) { hip->cached_start = be32_to_cpu(fd->key->ext.start_block); - hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents); + hip->cached_blocks = + hfsplus_ext_block_count(hip->cached_extents); } else { hip->cached_start = hip->cached_blocks = 0; - hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); } return res; } @@ -197,6 +209,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res = -EIO; u32 ablock, dblock, mask; + int was_dirty = 0; int shift; /* Convert inode block to disk allocation block */ @@ -223,27 +236,37 @@ int 
hfsplus_get_block(struct inode *inode, sector_t iblock, return -EIO; mutex_lock(&hip->extents_lock); + + /* + * hfsplus_ext_read_extent will write out a cached extent into + * the extents btree. In that case we may have to mark the inode + * dirty even for a pure read of an extent here. + */ + was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY); res = hfsplus_ext_read_extent(inode, ablock); - if (!res) { - dblock = hfsplus_ext_find_block(hip->cached_extents, - ablock - hip->cached_start); - } else { + if (res) { mutex_unlock(&hip->extents_lock); return -EIO; } + dblock = hfsplus_ext_find_block(hip->cached_extents, + ablock - hip->cached_start); mutex_unlock(&hip->extents_lock); done: - dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); + dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", + inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; - map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask)); + map_bh(bh_result, sb, + (dblock << sbi->fs_shift) + sbi->blockoffset + + (iblock & mask)); if (create) { set_buffer_new(bh_result); hip->phys_size += sb->s_blocksize; hip->fs_blocks++; inode_add_bytes(inode, sb->s_blocksize); - mark_inode_dirty(inode); } + if (create || was_dirty) + mark_inode_dirty(inode); return 0; } @@ -326,7 +349,8 @@ found: } } -int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type) +int hfsplus_free_fork(struct super_block *sb, u32 cnid, + struct hfsplus_fork_raw *fork, int type) { struct hfs_find_data fd; hfsplus_extent_rec ext_entry; @@ -373,12 +397,13 @@ int hfsplus_file_extend(struct inode *inode) u32 start, len, goal; int res; - if (sbi->alloc_file->i_size * 8 < - sbi->total_blocks - sbi->free_blocks + 8) { - // extend alloc file - printk(KERN_ERR "hfs: extend alloc file! 
(%Lu,%u,%u)\n", - sbi->alloc_file->i_size * 8, - sbi->total_blocks, sbi->free_blocks); + if (sbi->total_blocks - sbi->free_blocks + 8 > + sbi->alloc_file->i_size * 8) { + /* extend alloc file */ + printk(KERN_ERR "hfs: extend alloc file! " + "(%llu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); return -ENOSPC; } @@ -429,7 +454,7 @@ int hfsplus_file_extend(struct inode *inode) start, len); if (!res) { hfsplus_dump_extent(hip->cached_extents); - hip->flags |= HFSPLUS_FLG_EXT_DIRTY; + hip->extent_state |= HFSPLUS_EXT_DIRTY; hip->cached_blocks += len; } else if (res == -ENOSPC) goto insert_extent; @@ -438,7 +463,7 @@ out: mutex_unlock(&hip->extents_lock); if (!res) { hip->alloc_blocks += len; - mark_inode_dirty(inode); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); } return res; @@ -450,7 +475,7 @@ insert_extent: hip->cached_extents[0].start_block = cpu_to_be32(start); hip->cached_extents[0].block_count = cpu_to_be32(len); hfsplus_dump_extent(hip->cached_extents); - hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; + hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW; hip->cached_start = hip->alloc_blocks; hip->cached_blocks = len; @@ -466,8 +491,9 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", - inode->i_ino, (long long)hip->phys_size, inode->i_size); + dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n", + inode->i_ino, (long long)hip->phys_size, + inode->i_size); if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; @@ -481,7 +507,8 @@ void hfsplus_file_truncate(struct inode *inode) &page, &fsdata); if (res) return; - res = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); + res = pagecache_write_end(NULL, mapping, size, + 0, 0, page, fsdata); if (res < 0) return; mark_inode_dirty(inode); @@ -513,12 +540,12 @@ void hfsplus_file_truncate(struct inode *inode) alloc_cnt - 
start, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->cached_extents); if (blk_cnt > start) { - hip->flags |= HFSPLUS_FLG_EXT_DIRTY; + hip->extent_state |= HFSPLUS_EXT_DIRTY; break; } alloc_cnt = start; hip->cached_start = hip->cached_blocks = 0; - hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); hfs_brec_remove(&fd); } hfs_find_exit(&fd); @@ -527,7 +554,8 @@ void hfsplus_file_truncate(struct inode *inode) hip->alloc_blocks = blk_cnt; out: hip->phys_size = inode->i_size; - hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); - mark_inode_dirty(inode); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index cb3653efb57..d6857523336 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -23,13 +23,16 @@ #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 -//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) -//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) +#if 0 +#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) +#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) +#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) +#endif #define DBG_MASK (0) #define dprint(flg, fmt, args...) \ - if (flg & DBG_MASK) printk(fmt , ## args) + if (flg & DBG_MASK) \ + printk(fmt , ## args) /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' 
*/ @@ -37,7 +40,8 @@ #define HFSPLUS_TYPE_DATA 0x00 #define HFSPLUS_TYPE_RSRC 0xFF -typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *); +typedef int (*btree_keycmp)(const hfsplus_btree_key *, + const hfsplus_btree_key *); #define NODE_HASH_SIZE 256 @@ -61,7 +65,6 @@ struct hfs_btree { unsigned int max_key_len; unsigned int depth; - //unsigned int map1_size, map_size; struct mutex tree_lock; unsigned int pages_per_bnode; @@ -107,8 +110,8 @@ struct hfsplus_vh; struct hfs_btree; struct hfsplus_sb_info { - struct buffer_head *s_vhbh; struct hfsplus_vh *s_vhdr; + struct hfsplus_vh *s_backup_vhdr; struct hfs_btree *ext_tree; struct hfs_btree *cat_tree; struct hfs_btree *attr_tree; @@ -118,7 +121,8 @@ struct hfsplus_sb_info { /* Runtime variables */ u32 blockoffset; - u32 sect_count; + sector_t part_start; + sector_t sect_count; int fs_shift; /* immutable data from the volume header */ @@ -155,6 +159,12 @@ struct hfsplus_sb_info { #define HFSPLUS_SB_FORCE 2 #define HFSPLUS_SB_HFSX 3 #define HFSPLUS_SB_CASEFOLD 4 +#define HFSPLUS_SB_NOBARRIER 5 + +static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} struct hfsplus_inode_info { @@ -170,7 +180,7 @@ struct hfsplus_inode_info { u32 cached_blocks; hfsplus_extent_rec first_extents; hfsplus_extent_rec cached_extents; - unsigned long flags; + unsigned int extent_state; struct mutex extents_lock; /* @@ -185,6 +195,11 @@ struct hfsplus_inode_info { u32 linkid; /* + * Accessed using atomic bitops. + */ + unsigned long flags; + + /* * Protected by i_mutex. 
*/ sector_t fs_blocks; @@ -195,12 +210,34 @@ struct hfsplus_inode_info { struct inode vfs_inode; }; -#define HFSPLUS_FLG_RSRC 0x0001 -#define HFSPLUS_FLG_EXT_DIRTY 0x0002 -#define HFSPLUS_FLG_EXT_NEW 0x0004 +#define HFSPLUS_EXT_DIRTY 0x0001 +#define HFSPLUS_EXT_NEW 0x0002 + +#define HFSPLUS_I_RSRC 0 /* represents a resource fork */ +#define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */ +#define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */ +#define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */ + +#define HFSPLUS_IS_RSRC(inode) \ + test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags) + +static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) +{ + return list_entry(inode, struct hfsplus_inode_info, vfs_inode); +} -#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)) -#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC) +/* + * Mark an inode dirty, and also mark the btree in which the + * specific type of metadata is stored. + * For data or metadata that gets written back by into the catalog btree + * by hfsplus_write_inode a plain mark_inode_dirty call is enough. 
+ */ +static inline void hfsplus_mark_inode_dirty(struct inode *inode, + unsigned int flag) +{ + set_bit(flag, &HFSPLUS_I(inode)->flags); + mark_inode_dirty(inode); +} struct hfs_find_data { /* filled by caller */ @@ -318,9 +355,12 @@ int hfs_brec_read(struct hfs_find_data *, void *, int); int hfs_brec_goto(struct hfs_find_data *, int); /* catalog.c */ -int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); -int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); -void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *, u32, struct qstr *); +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, + const hfsplus_btree_key *); +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, + const hfsplus_btree_key *); +void hfsplus_cat_build_key(struct super_block *sb, + hfsplus_btree_key *, u32, struct qstr *); int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *); int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); int hfsplus_delete_cat(u32, struct inode *, struct qstr *); @@ -336,7 +376,8 @@ extern const struct file_operations hfsplus_dir_operations; int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); void hfsplus_ext_write_extent(struct inode *); int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); -int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int); +int hfsplus_free_fork(struct super_block *, u32, + struct hfsplus_fork_raw *, int); int hfsplus_file_extend(struct inode *); void hfsplus_file_truncate(struct inode *); @@ -351,6 +392,7 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *); int hfsplus_cat_write_inode(struct inode *); struct inode *hfsplus_new_inode(struct super_block *, int); void hfsplus_delete_inode(struct inode *); +int hfsplus_file_fsync(struct file *file, int datasync); /* ioctl.c */ long hfsplus_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg); @@ -362,6 +404,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); /* options.c */ int hfsplus_parse_options(char *, struct hfsplus_sb_info *); +int hfsplus_parse_options_remount(char *input, int *force); void hfsplus_fill_defaults(struct hfsplus_sb_info *); int hfsplus_show_options(struct seq_file *, struct vfsmount *); @@ -375,45 +418,26 @@ extern u16 hfsplus_decompose_table[]; extern u16 hfsplus_compose_table[]; /* unicode.c */ -int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); -int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); -int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); -int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); -int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); -int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2); +int hfsplus_strcasecmp(const struct hfsplus_unistr *, + const struct hfsplus_unistr *); +int hfsplus_strcmp(const struct hfsplus_unistr *, + const struct hfsplus_unistr *); +int hfsplus_uni2asc(struct super_block *, + const struct hfsplus_unistr *, char *, int *); +int hfsplus_asc2uni(struct super_block *, + struct hfsplus_unistr *, const char *, int); +int hfsplus_hash_dentry(const struct dentry *dentry, + const struct inode *inode, struct qstr *str); +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); /* wrapper.c */ int hfsplus_read_wrapper(struct super_block *); - int hfs_part_find(struct super_block *, sector_t *, sector_t *); - -/* access macros */ -static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) -{ - return 
list_entry(inode, struct hfsplus_inode_info, vfs_inode); -} - -#define sb_bread512(sb, sec, data) ({ \ - struct buffer_head *__bh; \ - sector_t __block; \ - loff_t __start; \ - int __offset; \ - \ - __start = (loff_t)(sec) << HFSPLUS_SECTOR_SHIFT;\ - __block = __start >> (sb)->s_blocksize_bits; \ - __offset = __start & ((sb)->s_blocksize - 1); \ - __bh = sb_bread((sb), __block); \ - if (likely(__bh != NULL)) \ - data = (void *)(__bh->b_data + __offset);\ - else \ - data = NULL; \ - __bh; \ -}) +int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, + void *data, int rw); /* time macros */ #define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 6892899fd6f..927cdd6d5bf 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -36,7 +36,8 @@ #define HFSP_WRAPOFF_EMBEDSIG 0x7C #define HFSP_WRAPOFF_EMBEDEXT 0x7E -#define HFSP_HIDDENDIR_NAME "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" +#define HFSP_HIDDENDIR_NAME \ + "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" #define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ #define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 8afd7e84f98..a8df651747f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -8,6 +8,7 @@ * Inode handling routines */ +#include <linux/blkdev.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -77,7 +78,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) if (!tree) return 0; if (tree->node_size >= PAGE_CACHE_SIZE) { - nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); + nidx = page->index >> + (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); if (!node) @@ -90,7 +92,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) } spin_unlock(&tree->hash_lock); } else { - nidx = 
page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); + nidx = page->index << + (PAGE_CACHE_SHIFT - tree->node_size_shift); i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); spin_lock(&tree->hash_lock); do { @@ -166,8 +169,8 @@ const struct dentry_operations hfsplus_dentry_operations = { .d_compare = hfsplus_compare_dentry, }; -static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) +static struct dentry *hfsplus_file_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) { struct hfs_find_data fd; struct super_block *sb = dir->i_sb; @@ -190,7 +193,9 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent inode->i_ino = dir->i_ino; INIT_LIST_HEAD(&hip->open_dir_list); mutex_init(&hip->extents_lock); - hip->flags = HFSPLUS_FLG_RSRC; + hip->extent_state = 0; + hip->flags = 0; + set_bit(HFSPLUS_I_RSRC, &hip->flags); hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); err = hfsplus_find_cat(sb, dir->i_ino, &fd); @@ -219,7 +224,8 @@ out: return NULL; } -static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) +static void hfsplus_get_perms(struct inode *inode, + struct hfsplus_perm *perms, int dir) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); u16 mode; @@ -302,29 +308,41 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) return 0; } -static int hfsplus_file_fsync(struct file *filp, int datasync) +int hfsplus_file_fsync(struct file *file, int datasync) { - struct inode *inode = filp->f_mapping->host; - struct super_block * sb; - int ret, err; - - /* sync the inode to buffers */ - ret = write_inode_now(inode, 0); - - /* sync the superblock to buffers */ - sb = inode->i_sb; - if (sb->s_dirt) { - if (!(sb->s_flags & MS_RDONLY)) - hfsplus_sync_fs(sb, 1); - else - sb->s_dirt = 0; + struct inode *inode = file->f_mapping->host; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + struct hfsplus_sb_info *sbi = 
HFSPLUS_SB(inode->i_sb); + int error = 0, error2; + + /* + * Sync inode metadata into the catalog and extent trees. + */ + sync_inode_metadata(inode, 1); + + /* + * And explicitly write out the btrees. + */ + if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags)) + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + + if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) { + error2 = + filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; } - /* .. finally sync the buffers to disk */ - err = sync_blockdev(sb->s_bdev); - if (!ret) - ret = err; - return ret; + if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + } + + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + + return error; } static const struct inode_operations hfsplus_file_inode_operations = { @@ -337,7 +355,7 @@ static const struct inode_operations hfsplus_file_inode_operations = { }; static const struct file_operations hfsplus_file_operations = { - .llseek = generic_file_llseek, + .llseek = generic_file_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, .write = do_sync_write, @@ -370,6 +388,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) INIT_LIST_HEAD(&hip->open_dir_list); mutex_init(&hip->extents_lock); atomic_set(&hip->opencnt, 0); + hip->extent_state = 0; hip->flags = 0; memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); @@ -457,7 +476,8 @@ void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) } } -void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) +void hfsplus_inode_write_fork(struct inode *inode, + struct hfsplus_fork_raw *fork) { memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, sizeof(hfsplus_extent_rec)); 
@@ -499,13 +519,14 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_file)); - hfsplus_inode_read_fork(inode, HFSPLUS_IS_DATA(inode) ? - &file->data_fork : &file->rsrc_fork); + hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? + &file->rsrc_fork : &file->data_fork); hfsplus_get_perms(inode, &file->permissions, 0); inode->i_nlink = 1; if (S_ISREG(inode->i_mode)) { if (file->permissions.dev) - inode->i_nlink = be32_to_cpu(file->permissions.dev); + inode->i_nlink = + be32_to_cpu(file->permissions.dev); inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; @@ -578,7 +599,9 @@ int hfsplus_cat_write_inode(struct inode *inode) sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->data_fork); hfsplus_cat_set_perms(inode, &file->permissions); - if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) + if (HFSPLUS_FLG_IMMUTABLE & + (file->permissions.rootflags | + file->permissions.userflags)) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); else file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); @@ -588,6 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode) hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); } + + set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); out: hfs_find_exit(&fd); return 0; diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 40a85a3ded6..508ce662ce1 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -28,7 +28,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) if (inode->i_flags & S_IMMUTABLE) flags |= FS_IMMUTABLE_FL; - if (inode->i_flags |= S_APPEND) + if (inode->i_flags & S_APPEND) flags |= FS_APPEND_FL; if (hip->userflags & HFSPLUS_FLG_NODUMP) flags |= FS_NODUMP_FL; @@ -147,9 +147,11 @@ int hfsplus_setxattr(struct dentry 
*dentry, const char *name, res = -ERANGE; } else res = -EOPNOTSUPP; - if (!res) + if (!res) { hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } out: hfs_find_exit(&fd); return res; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index f9ab276a4d8..bb62a588214 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -23,6 +23,7 @@ enum { opt_umask, opt_uid, opt_gid, opt_part, opt_session, opt_nls, opt_nodecompose, opt_decompose, + opt_barrier, opt_nobarrier, opt_force, opt_err }; @@ -37,6 +38,8 @@ static const match_table_t tokens = { { opt_nls, "nls=%s" }, { opt_decompose, "decompose" }, { opt_nodecompose, "nodecompose" }, + { opt_barrier, "barrier" }, + { opt_nobarrier, "nobarrier" }, { opt_force, "force" }, { opt_err, NULL } }; @@ -65,6 +68,32 @@ static inline int match_fourchar(substring_t *arg, u32 *result) return 0; } +int hfsplus_parse_options_remount(char *input, int *force) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!input) + return 0; + + while ((p = strsep(&input, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case opt_force: + *force = 1; + break; + default: + break; + } + } + + return 1; +} + /* Parse options from mount. 
Returns 0 on failure */ /* input is the options passed to mount() as a string */ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) @@ -136,7 +165,9 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (p) sbi->nls = load_nls(p); if (!sbi->nls) { - printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p); + printk(KERN_ERR "hfs: unable to load " + "nls mapping \"%s\"\n", + p); kfree(p); return 0; } @@ -148,6 +179,12 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) case opt_nodecompose: set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); break; + case opt_barrier: + clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_nobarrier: + set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; case opt_force: set_bit(HFSPLUS_SB_FORCE, &sbi->flags); break; @@ -177,7 +214,8 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); if (sbi->type != HFSPLUS_DEF_CR_TYPE) seq_printf(seq, ",type=%.4s", (char *)&sbi->type); - seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid); + seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, + sbi->uid, sbi->gid); if (sbi->part >= 0) seq_printf(seq, ",part=%u", sbi->part); if (sbi->session >= 0) @@ -186,5 +224,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) seq_printf(seq, ",nls=%s", sbi->nls->charset); if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) seq_printf(seq, ",nodecompose"); + if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + seq_printf(seq, ",nobarrier"); return 0; } diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 208b16c645c..d66ad113b1c 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -2,7 +2,8 @@ * linux/fs/hfsplus/part_tbl.c * * Copyright (C) 1996-1997 Paul H. Hargrove - * This file may be distributed under the terms of the GNU General Public License. 
+ * This file may be distributed under the terms of + * the GNU General Public License. * * Original code to handle the new style Mac partition table based on * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). @@ -13,6 +14,7 @@ * */ +#include <linux/slab.h> #include "hfsplus_fs.h" /* offsets to various blocks */ @@ -58,77 +60,94 @@ struct new_pmap { */ struct old_pmap { __be16 pdSig; /* Signature bytes */ - struct old_pmap_entry { + struct old_pmap_entry { __be32 pdStart; __be32 pdSize; __be32 pdFSID; } pdEntry[42]; } __packed; +static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, + sector_t *part_start, sector_t *part_size) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int i; + + for (i = 0; i < 42; i++) { + struct old_pmap_entry *p = &pm->pdEntry[i]; + + if (p->pdStart && p->pdSize && + p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && + (sbi->part < 0 || sbi->part == i)) { + *part_start += be32_to_cpu(p->pdStart); + *part_size = be32_to_cpu(p->pdSize); + return 0; + } + } + + return -ENOENT; +} + +static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, + sector_t *part_start, sector_t *part_size) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int size = be32_to_cpu(pm->pmMapBlkCnt); + int res; + int i = 0; + + do { + if (!memcmp(pm->pmPartType, "Apple_HFS", 9) && + (sbi->part < 0 || sbi->part == i)) { + *part_start += be32_to_cpu(pm->pmPyPartStart); + *part_size = be32_to_cpu(pm->pmPartBlkCnt); + return 0; + } + + if (++i >= size) + return -ENOENT; + + res = hfsplus_submit_bio(sb->s_bdev, + *part_start + HFS_PMAP_BLK + i, + pm, READ); + if (res) + return res; + } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); + + return -ENOENT; +} + /* - * hfs_part_find() - * - * Parse the partition map looking for the - * start and length of the 'part'th HFS partition. + * Parse the partition map looking for the start and length of a + * HFS/HFS+ partition. 
*/ int hfs_part_find(struct super_block *sb, - sector_t *part_start, sector_t *part_size) + sector_t *part_start, sector_t *part_size) { - struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - struct buffer_head *bh; - __be16 *data; - int i, size, res; + void *data; + int res; + + data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; - res = -ENOENT; - bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data); - if (!bh) - return -EIO; + res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, + data, READ); + if (res) + return res; - switch (be16_to_cpu(*data)) { + switch (be16_to_cpu(*((__be16 *)data))) { case HFS_OLD_PMAP_MAGIC: - { - struct old_pmap *pm; - struct old_pmap_entry *p; - - pm = (struct old_pmap *)bh->b_data; - p = pm->pdEntry; - size = 42; - for (i = 0; i < size; p++, i++) { - if (p->pdStart && p->pdSize && - p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && - (sbi->part < 0 || sbi->part == i)) { - *part_start += be32_to_cpu(p->pdStart); - *part_size = be32_to_cpu(p->pdSize); - res = 0; - } - } + res = hfs_parse_old_pmap(sb, data, part_start, part_size); break; - } case HFS_NEW_PMAP_MAGIC: - { - struct new_pmap *pm; - - pm = (struct new_pmap *)bh->b_data; - size = be32_to_cpu(pm->pmMapBlkCnt); - for (i = 0; i < size;) { - if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && - (sbi->part < 0 || sbi->part == i)) { - *part_start += be32_to_cpu(pm->pmPyPartStart); - *part_size = be32_to_cpu(pm->pmPartBlkCnt); - res = 0; - break; - } - brelse(bh); - bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm); - if (!bh) - return -EIO; - if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC)) - break; - } + res = hfs_parse_new_pmap(sb, data, part_start, part_size); + break; + default: + res = -ENOENT; break; - } } - brelse(bh); + kfree(data); return res; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 52cc746d3ba..9a3b4795f43 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -10,6 +10,7 @@ #include <linux/module.h> 
#include <linux/init.h> #include <linux/pagemap.h> +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/vfs.h> @@ -66,6 +67,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); mutex_init(&HFSPLUS_I(inode)->extents_lock); HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->extent_state = 0; HFSPLUS_I(inode)->rsrc_inode = NULL; atomic_set(&HFSPLUS_I(inode)->opencnt, 0); @@ -157,45 +159,65 @@ int hfsplus_sync_fs(struct super_block *sb, int wait) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; + int write_backup = 0; + int error, error2; + + if (!wait) + return 0; dprint(DBG_SUPER, "hfsplus_write_super\n"); - mutex_lock(&sbi->vh_mutex); - mutex_lock(&sbi->alloc_mutex); sb->s_dirt = 0; + /* + * Explicitly write out the special metadata inodes. + * + * While these special inodes are marked as hashed and written + * out peridocically by the flusher threads we redirty them + * during writeout of normal inodes, and thus the life lock + * prevents us from getting the latest state to disk. 
+ */ + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + + mutex_lock(&sbi->vh_mutex); + mutex_lock(&sbi->alloc_mutex); vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); vhdr->folder_count = cpu_to_be32(sbi->folder_count); vhdr->file_count = cpu_to_be32(sbi->file_count); - mark_buffer_dirty(sbi->s_vhbh); if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { - if (sbi->sect_count) { - struct buffer_head *bh; - u32 block, offset; - - block = sbi->blockoffset; - block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9); - offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1); - printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", - sbi->blockoffset, sbi->sect_count, - block, offset); - bh = sb_bread(sb, block); - if (bh) { - vhdr = (struct hfsplus_vh *)(bh->b_data + offset); - if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { - memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr)); - mark_buffer_dirty(bh); - brelse(bh); - } else - printk(KERN_WARNING "hfs: backup not found!\n"); - } - } + memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); + write_backup = 1; } + + error2 = hfsplus_submit_bio(sb->s_bdev, + sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr, WRITE_SYNC); + if (!error) + error = error2; + if (!write_backup) + goto out; + + error2 = hfsplus_submit_bio(sb->s_bdev, + sbi->part_start + sbi->sect_count - 2, + sbi->s_backup_vhdr, WRITE_SYNC); + if (!error) + error2 = error; +out: mutex_unlock(&sbi->alloc_mutex); mutex_unlock(&sbi->vh_mutex); - return 0; + + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + + return error; } static void hfsplus_write_super(struct super_block *sb) @@ -215,23 +237,22 @@ static void 
hfsplus_put_super(struct super_block *sb) if (!sb->s_fs_info) return; - if (sb->s_dirt) - hfsplus_write_super(sb); if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { struct hfsplus_vh *vhdr = sbi->s_vhdr; vhdr->modify_date = hfsp_now2mt(); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); - mark_buffer_dirty(sbi->s_vhbh); - sync_dirty_buffer(sbi->s_vhbh); + + hfsplus_sync_fs(sb, 1); } hfs_btree_close(sbi->cat_tree); hfs_btree_close(sbi->ext_tree); iput(sbi->alloc_file); iput(sbi->hidden_dir); - brelse(sbi->s_vhbh); + kfree(sbi->s_vhdr); + kfree(sbi->s_backup_vhdr); unload_nls(sbi->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; @@ -263,26 +284,31 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) return 0; if (!(*flags & MS_RDONLY)) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; - struct hfsplus_sb_info sbi; + int force = 0; - memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); - sbi.nls = HFSPLUS_SB(sb)->nls; - if (!hfsplus_parse_options(data, &sbi)) + if (!hfsplus_parse_options_remount(data, &force)) return -EINVAL; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " - "running fsck.hfsplus is recommended. leaving read-only.\n"); + printk(KERN_WARNING "hfs: filesystem was " + "not cleanly unmounted, " + "running fsck.hfsplus is recommended. 
" + "leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; - } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) { + } else if (force) { /* nothing */ - } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { - printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); + } else if (vhdr->attributes & + cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { + printk(KERN_WARNING "hfs: filesystem is marked locked, " + "leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; - } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { - printk(KERN_WARNING "hfs: filesystem is marked journaled, leaving read-only.\n"); + } else if (vhdr->attributes & + cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + printk(KERN_WARNING "hfs: filesystem is " + "marked journaled, " + "leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } @@ -372,17 +398,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { - printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " - "running fsck.hfsplus is recommended. mounting read-only.\n"); + printk(KERN_WARNING "hfs: Filesystem was " + "not cleanly unmounted, " + "running fsck.hfsplus is recommended. 
" + "mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; - } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " - "use the force option at your own risk, mounting read-only.\n"); + } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING "hfs: write access to " + "a journaled filesystem is not supported, " + "use the force option at your own risk, " + "mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } @@ -413,13 +444,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(root); goto cleanup; } + sb->s_d_op = &hfsplus_dentry_operations; sb->s_root = d_alloc_root(root); if (!sb->s_root) { iput(root); err = -ENOMEM; goto cleanup; } - sb->s_root->d_op = &hfsplus_dentry_operations; str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; @@ -449,19 +480,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) be32_add_cpu(&vhdr->write_count, 1); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); - mark_buffer_dirty(sbi->s_vhbh); - sync_dirty_buffer(sbi->s_vhbh); + hfsplus_sync_fs(sb, 1); if (!sbi->hidden_dir) { - printk(KERN_DEBUG "hfs: create hidden dir...\n"); - mutex_lock(&sbi->vh_mutex); sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode, &str, sbi->hidden_dir); mutex_unlock(&sbi->vh_mutex); - mark_inode_dirty(sbi->hidden_dir); + hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); } out: 
unload_nls(sbi->nls); @@ -488,11 +516,19 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) return i ? &i->vfs_inode : NULL; } -static void hfsplus_destroy_inode(struct inode *inode) +static void hfsplus_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } +static void hfsplus_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hfsplus_i_callback); +} + #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) static struct dentry *hfsplus_mount(struct file_system_type *fs_type, diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index b66d67de882..a3f0bfcc881 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -17,14 +17,14 @@ /* Returns folded char, or 0 if ignorable */ static inline u16 case_fold(u16 c) { - u16 tmp; - - tmp = hfsplus_case_fold_table[c >> 8]; - if (tmp) - tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; - else - tmp = c; - return tmp; + u16 tmp; + + tmp = hfsplus_case_fold_table[c >> 8]; + if (tmp) + tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; + else + tmp = c; + return tmp; } /* Compare unicode strings, return values like normal strcmp */ @@ -118,7 +118,9 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) return NULL; } -int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) +int hfsplus_uni2asc(struct super_block *sb, + const struct hfsplus_unistr *ustr, + char *astr, int *len_p) { const hfsplus_unichr *ip; struct nls_table *nls = HFSPLUS_SB(sb)->nls; @@ -171,7 +173,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c goto same; c1 = be16_to_cpu(*ip); if (likely(compose)) - ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c1); + ce1 = hfsplus_compose_lookup( + hfsplus_compose_table, c1); if (ce1) break; switch (c0) { @@ -199,7 +202,8 @@ int hfsplus_uni2asc(struct 
super_block *sb, const struct hfsplus_unistr *ustr, c if (ce2) { i = 1; while (i < ustrlen) { - ce1 = hfsplus_compose_lookup(ce2, be16_to_cpu(ip[i])); + ce1 = hfsplus_compose_lookup(ce2, + be16_to_cpu(ip[i])); if (!ce1) break; i++; @@ -211,7 +215,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c goto done; } } - same: +same: switch (c0) { case 0: cc = 0x2400; @@ -222,7 +226,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c default: cc = c0; } - done: +done: res = nls->uni2char(cc, op, len); if (res < 0) { if (res == -ENAMETOOLONG) @@ -320,7 +324,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ -int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) +int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *str) { struct super_block *sb = dentry->d_sb; const char *astr; @@ -363,9 +368,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. 
*/ -int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) +int hfsplus_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct super_block *sb = dentry->d_sb; + struct super_block *sb = parent->d_sb; int casefold, decompose, size; int dsize1, dsize2, len1, len2; const u16 *dstr1, *dstr2; @@ -375,10 +383,10 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr * casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); - astr1 = s1->name; - len1 = s1->len; - astr2 = s2->name; - len2 = s2->len; + astr1 = str; + len1 = len; + astr2 = name->name; + len2 = name->len; dsize1 = dsize2 = 0; dstr1 = dstr2 = NULL; @@ -388,7 +396,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr * astr1 += size; len1 -= size; - if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) { + if (decompose) + dstr1 = decompose_unichar(c, &dsize1); + if (!decompose || !dstr1) { c1 = c; dstr1 = &c1; dsize1 = 1; @@ -400,7 +410,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr * astr2 += size; len2 -= size; - if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) { + if (decompose) + dstr2 = decompose_unichar(c, &dsize2); + if (!decompose || !dstr2) { c2 = c; dstr2 = &c2; dsize2 = 1; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 8972c20b321..196231794f6 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -24,6 +24,40 @@ struct hfsplus_wd { u16 embed_count; }; +static void hfsplus_end_io_sync(struct bio *bio, int err) +{ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + complete(bio->bi_private); +} + +int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, + void *data, int rw) +{ + 
DECLARE_COMPLETION_ONSTACK(wait); + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, 1); + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = hfsplus_end_io_sync; + bio->bi_private = &wait; + + /* + * We always submit one sector at a time, so bio_add_page must not fail. + */ + if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE, + offset_in_page(data)) != HFSPLUS_SECTOR_SIZE) + BUG(); + + submit_bio(rw, bio); + wait_for_completion(&wait); + + if (!bio_flagged(bio, BIO_UPTODATE)) + return -EIO; + return 0; +} + static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) { u32 extent; @@ -40,12 +74,14 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) !(attrib & HFSP_WRAP_ATTRIB_SPARED)) return 0; - wd->ablk_size = be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); + wd->ablk_size = + be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); if (wd->ablk_size < HFSPLUS_SECTOR_SIZE) return 0; if (wd->ablk_size % HFSPLUS_SECTOR_SIZE) return 0; - wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); + wd->ablk_start = + be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); wd->embed_start = (extent >> 16) & 0xFFFF; @@ -68,7 +104,8 @@ static int hfsplus_get_last_session(struct super_block *sb, if (HFSPLUS_SB(sb)->session >= 0) { te.cdte_track = HFSPLUS_SB(sb)->session; te.cdte_format = CDROM_LBA; - res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); + res = ioctl_by_bdev(sb->s_bdev, + CDROMREADTOCENTRY, (unsigned long)&te); if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { *start = (sector_t)te.cdte_addr.lba << 2; return 0; @@ -77,7 +114,8 @@ static int hfsplus_get_last_session(struct super_block *sb, return -EINVAL; } ms_info.addr_format = CDROM_LBA; - res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info); + res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, + (unsigned long)&ms_info); 
if (!res && ms_info.xa_flag) *start = (sector_t)ms_info.addr.lba << 2; return 0; @@ -88,100 +126,112 @@ static int hfsplus_get_last_session(struct super_block *sb, int hfsplus_read_wrapper(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - struct buffer_head *bh; - struct hfsplus_vh *vhdr; struct hfsplus_wd wd; sector_t part_start, part_size; u32 blocksize; + int error = 0; + error = -EINVAL; blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE); if (!blocksize) - return -EINVAL; + goto out; if (hfsplus_get_last_session(sb, &part_start, &part_size)) - return -EINVAL; + goto out; if ((u64)part_start + part_size > 0x100000000ULL) { pr_err("hfs: volumes larger than 2TB are not supported yet\n"); - return -EINVAL; + goto out; } - while (1) { - bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); - if (!bh) - return -EIO; - - if (vhdr->signature == cpu_to_be16(HFSP_WRAP_MAGIC)) { - if (!hfsplus_read_mdb(vhdr, &wd)) - goto error; - wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; - part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; - part_size = wd.embed_count * wd.ablk_size; - brelse(bh); - bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); - if (!bh) - return -EIO; - } - if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) - break; - if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) { - set_bit(HFSPLUS_SB_HFSX, &sbi->flags); - break; - } - brelse(bh); - /* check for a partition block + error = -ENOMEM; + sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); + if (!sbi->s_vhdr) + goto out; + sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); + if (!sbi->s_backup_vhdr) + goto out_free_vhdr; + +reread: + error = hfsplus_submit_bio(sb->s_bdev, + part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr, READ); + if (error) + goto out_free_backup_vhdr; + + error = -EINVAL; + switch (sbi->s_vhdr->signature) { + case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX): + set_bit(HFSPLUS_SB_HFSX, &sbi->flags); + /*FALLTHRU*/ + 
case cpu_to_be16(HFSPLUS_VOLHEAD_SIG): + break; + case cpu_to_be16(HFSP_WRAP_MAGIC): + if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) + goto out; + wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; + part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; + part_size = wd.embed_count * wd.ablk_size; + goto reread; + default: + /* + * Check for a partition block. + * * (should do this only for cdrom/loop though) */ if (hfs_part_find(sb, &part_start, &part_size)) - return -EINVAL; + goto out; + goto reread; + } + + error = hfsplus_submit_bio(sb->s_bdev, + part_start + part_size - 2, + sbi->s_backup_vhdr, READ); + if (error) + goto out_free_backup_vhdr; + + error = -EINVAL; + if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) { + printk(KERN_WARNING + "hfs: invalid secondary volume header\n"); + goto out_free_backup_vhdr; } - blocksize = be32_to_cpu(vhdr->blocksize); - brelse(bh); + blocksize = be32_to_cpu(sbi->s_vhdr->blocksize); - /* block size must be at least as large as a sector - * and a multiple of 2 + /* + * Block size must be at least as large as a sector and a multiple of 2. */ - if (blocksize < HFSPLUS_SECTOR_SIZE || - ((blocksize - 1) & blocksize)) - return -EINVAL; + if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) + goto out_free_backup_vhdr; sbi->alloc_blksz = blocksize; sbi->alloc_blksz_shift = 0; while ((blocksize >>= 1) != 0) sbi->alloc_blksz_shift++; blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); - /* align block size to block offset */ + /* + * Align block size to block offset. 
+ */ while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) blocksize >>= 1; if (sb_set_blocksize(sb, blocksize) != blocksize) { - printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize); - return -EINVAL; + printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", + blocksize); + goto out_free_backup_vhdr; } sbi->blockoffset = part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); + sbi->part_start = part_start; sbi->sect_count = part_size; sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; - - bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); - if (!bh) - return -EIO; - - /* should still be the same... */ - if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) { - if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) - goto error; - } else { - if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) - goto error; - } - - sbi->s_vhbh = bh; - sbi->s_vhdr = vhdr; - return 0; - error: - brelse(bh); - return -EINVAL; + +out_free_backup_vhdr: + kfree(sbi->s_backup_vhdr); +out_free_vhdr: + kfree(sbi->s_vhdr); +out: + return error; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2c0f148a49e..2638c834ed2 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) -static int hostfs_d_delete(struct dentry *dentry) +static int hostfs_d_delete(const struct dentry *dentry) { return 1; } @@ -92,12 +92,10 @@ __uml_setup("hostfs=", hostfs_args, static char *__dentry_name(struct dentry *dentry, char *name) { - char *p = __dentry_path(dentry, name, PATH_MAX); + char *p = dentry_path_raw(dentry, name, PATH_MAX); char *root; size_t len; - spin_unlock(&dcache_lock); - root = dentry->d_sb->s_fs_info; len = strlen(root); if (IS_ERR(p)) { @@ -123,25 +121,23 @@ static char *dentry_name(struct dentry *dentry) if (!name) return NULL; - spin_lock(&dcache_lock); 
return __dentry_name(dentry, name); /* will unlock */ } static char *inode_name(struct inode *ino) { struct dentry *dentry; - char *name = __getname(); - if (!name) - return NULL; + char *name; - spin_lock(&dcache_lock); - if (list_empty(&ino->i_dentry)) { - spin_unlock(&dcache_lock); - __putname(name); + dentry = d_find_alias(ino); + if (!dentry) return NULL; - } - dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias); - return __dentry_name(dentry, name); /* will unlock */ + + name = dentry_name(dentry); + + dput(dentry); + + return name; } static char *follow_link(char *link) @@ -251,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode) } } -static void hostfs_destroy_inode(struct inode *inode) +static void hostfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kfree(HOSTFS_I(inode)); } +static void hostfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hostfs_i_callback); +} + static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) { const char *root_path = vfs->mnt_sb->s_fs_info; @@ -609,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, goto out_put; d_add(dentry, inode); - dentry->d_op = &hostfs_dentry_ops; return NULL; out_put: @@ -746,11 +748,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired) +int hostfs_permission(struct inode *ino, int desired, unsigned int flags) { char *name; int r = 0, w = 0, x = 0, err; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + if (desired & MAY_READ) r = 1; if (desired & MAY_WRITE) w = 1; if (desired & MAY_EXEC) x = 1; @@ -765,7 +770,7 @@ int hostfs_permission(struct inode *ino, int desired) err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(ino, desired, NULL); + err = generic_permission(ino, desired, flags, NULL); return 
err; } @@ -916,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; + sb->s_d_op = &hostfs_dentry_ops; sb->s_maxbytes = MAX_LFS_FILESIZE; /* NULL is printed as <NULL> by sprintf: avoid that. */ diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 67d9d36b3d5..05d4816e4e7 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -12,7 +12,8 @@ * Note: the dentry argument is the parent dentry. */ -static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) +static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { unsigned long hash; int i; @@ -34,29 +35,30 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) return 0; } -static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +static int hpfs_compare_dentry(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - unsigned al=a->len; - unsigned bl=b->len; - hpfs_adjust_length(a->name, &al); + unsigned al = len; + unsigned bl = name->len; + + hpfs_adjust_length(str, &al); /*hpfs_adjust_length(b->name, &bl);*/ - /* 'a' is the qstr of an already existing dentry, so the name - * must be valid. 'b' must be validated first. + + /* + * 'str' is the nane of an already existing dentry, so the name + * must be valid. 'name' must be validated first. 
*/ - if (hpfs_chk_name(b->name, &bl)) + if (hpfs_chk_name(name->name, &bl)) return 1; - if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0)) + if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0)) return 1; return 0; } -static const struct dentry_operations hpfs_dentry_operations = { +const struct dentry_operations hpfs_dentry_operations = { .d_hash = hpfs_hash_dentry, .d_compare = hpfs_compare_dentry, }; - -void hpfs_set_dentry_operations(struct dentry *dentry) -{ - dentry->d_op = &hpfs_dentry_operations; -} diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 2338130cceb..d32f63a569f 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name end: end_add: - hpfs_set_dentry_operations(dentry); unlock_kernel(); d_add(dentry, result); return NULL; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 2fee17d0d9a..1c43dbea55e 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *); /* dentry.c */ -void hpfs_set_dentry_operations(struct dentry *); +extern const struct dentry_operations hpfs_dentry_operations; /* dir.c */ diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 56f0da1cfd1..1ae35baa539 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { error = vmtruncate(inode, attr->ia_size); if (error) - return error; + goto out_unlock; } setattr_copy(inode, attr); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 11c2b4080f6..f4ad9e31ddc 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -419,7 +419,7 @@ again: unlock_kernel(); return -ENOSPC; } - if (generic_permission(inode, MAY_WRITE, NULL) || + if (generic_permission(inode, MAY_WRITE, 0, NULL) || !S_ISREG(inode->i_mode) || get_write_access(inode)) { d_rehash(dentry); diff --git a/fs/hpfs/super.c 
b/fs/hpfs/super.c index 6c5f01597c3..b30426b1fc9 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void hpfs_destroy_inode(struct inode *inode) +static void hpfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } +static void hpfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hpfs_i_callback); +} + static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; @@ -543,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; s->s_op = &hpfs_sops; + s->s_d_op = &hpfs_dentry_operations; sbi->sb_root = superblock->root; sbi->sb_fs_size = superblock->n_sectors; @@ -644,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) iput(root); goto bail0; } - hpfs_set_dentry_operations(s->s_root); /* * find the root directory's . 
pointer & finish filling in the inode diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index f702b5f713f..87ed48e0343 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -632,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino) mntput(ino->i_sb->s_fs_info); } -static void hppfs_destroy_inode(struct inode *inode) +static void hppfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kfree(HPPFS_I(inode)); } +static void hppfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, hppfs_i_callback); +} + static const struct super_operations hppfs_sbops = { .alloc_inode = hppfs_alloc_inode, .destroy_inode = hppfs_destroy_inode, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a5fe68189ee..9885082b470 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -663,11 +663,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) return &p->vfs_inode; } +static void hugetlbfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); - kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); + call_rcu(&inode->i_rcu, hugetlbfs_i_callback); } static const struct address_space_operations hugetlbfs_aops = { diff --git a/fs/inode.c b/fs/inode.c index ae2727ab0c3..da85e56378f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -102,26 +102,29 @@ static DECLARE_RWSEM(iprune_sem); */ struct inodes_stat_t inodes_stat; -static struct percpu_counter nr_inodes __cacheline_aligned_in_smp; -static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp; +static DEFINE_PER_CPU(unsigned int, nr_inodes); static struct kmem_cache 
*inode_cachep __read_mostly; -static inline int get_nr_inodes(void) +static int get_nr_inodes(void) { - return percpu_counter_sum_positive(&nr_inodes); + int i; + int sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_inodes, i); + return sum < 0 ? 0 : sum; } static inline int get_nr_inodes_unused(void) { - return percpu_counter_sum_positive(&nr_inodes_unused); + return inodes_stat.nr_unused; } int get_nr_dirty_inodes(void) { + /* not actually dirty inodes, but a wild approximation */ int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; - } /* @@ -132,7 +135,6 @@ int proc_nr_inodes(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); - inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_dointvec(table, write, buffer, lenp, ppos); } #endif @@ -224,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif - percpu_counter_inc(&nr_inodes); + this_cpu_inc(nr_inodes); return 0; out: @@ -255,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb) return inode; } +void free_inode_nonrcu(struct inode *inode) +{ + kmem_cache_free(inode_cachep, inode); +} +EXPORT_SYMBOL(free_inode_nonrcu); + void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); @@ -266,10 +274,17 @@ void __destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif - percpu_counter_dec(&nr_inodes); + this_cpu_dec(nr_inodes); } EXPORT_SYMBOL(__destroy_inode); +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(inode_cachep, inode); +} + static void destroy_inode(struct inode *inode) { BUG_ON(!list_empty(&inode->i_lru)); @@ -277,7 +292,7 @@ static void destroy_inode(struct inode *inode) if 
(inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else - kmem_cache_free(inode_cachep, (inode)); + call_rcu(&inode->i_rcu, i_callback); } /* @@ -335,7 +350,7 @@ static void inode_lru_list_add(struct inode *inode) { if (list_empty(&inode->i_lru)) { list_add(&inode->i_lru, &inode_lru); - percpu_counter_inc(&nr_inodes_unused); + inodes_stat.nr_unused++; } } @@ -343,7 +358,7 @@ static void inode_lru_list_del(struct inode *inode) { if (!list_empty(&inode->i_lru)) { list_del_init(&inode->i_lru); - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; } } @@ -430,6 +445,7 @@ void end_writeback(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); + /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(end_writeback); @@ -513,7 +529,7 @@ void evict_inodes(struct super_block *sb) list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); if (!(inode->i_state & (I_DIRTY | I_SYNC))) - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; } spin_unlock(&inode_lock); @@ -554,7 +570,7 @@ int invalidate_inodes(struct super_block *sb) list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); if (!(inode->i_state & (I_DIRTY | I_SYNC))) - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; } spin_unlock(&inode_lock); @@ -616,7 +632,7 @@ static void prune_icache(int nr_to_scan) if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { list_del_init(&inode->i_lru); - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; continue; } @@ -650,7 +666,7 @@ static void prune_icache(int nr_to_scan) */ list_move(&inode->i_lru, &freeable); list_del_init(&inode->i_wb_list); - percpu_counter_dec(&nr_inodes_unused); + inodes_stat.nr_unused--; } if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); @@ -1648,8 +1664,6 @@ void __init inode_init(void) 
SLAB_MEM_SPREAD), init_once); register_shrinker(&icache_shrinker); - percpu_counter_init(&nr_inodes, 0); - percpu_counter_init(&nr_inodes_unused, 0); /* Hash may have been set up in inode_init_early */ if (!hashdist) diff --git a/fs/internal.h b/fs/internal.h index e43b9a4dbf4..0663568b124 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -63,12 +63,17 @@ extern int copy_mount_string(const void __user *, char **); extern void free_vfsmnt(struct vfsmount *); extern struct vfsmount *alloc_vfsmnt(const char *); +extern unsigned int mnt_get_count(struct vfsmount *mnt); extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, struct vfsmount *); extern void release_mounts(struct list_head *); extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +extern int finish_automount(struct vfsmount *, struct path *); + +extern void mnt_make_longterm(struct vfsmount *); +extern void mnt_make_shortterm(struct vfsmount *); extern void __init mnt_init(void); diff --git a/fs/ioctl.c b/fs/ioctl.c index d6cc1647662..a59635e295f 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; - struct fiemap_extent *dest = fieinfo->fi_extents_start; + struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { @@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb, static int ioctl_fiemap(struct file *filp, unsigned long arg) { struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; @@ -182,8 +183,7 @@ static int ioctl_fiemap(struct 
file *filp, unsigned long arg) if (!inode->i_op->fiemap) return -EOPNOTSUPP; - if (copy_from_user(&fiemap, (struct fiemap __user *)arg, - sizeof(struct fiemap))) + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) @@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; - fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); + fieinfo.fi_extents_start = ufiemap->fm_extents; if (fiemap.fm_extent_count != 0 && !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, @@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; - if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bfdeb82a53b..a0f3833c0db 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -26,16 +26,32 @@ #define BEQUIET -static int isofs_hashi(struct dentry *parent, struct qstr *qstr); -static int isofs_hash(struct dentry *parent, struct qstr *qstr); -static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); -static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b); +static int isofs_hashi(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_dentry_cmpi(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp(const struct dentry *parent, + const 
struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET -static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); -static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); -static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); -static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); +static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode, + struct qstr *qstr); +static int isofs_dentry_cmpi_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); +static int isofs_dentry_cmp_ms(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name); #endif static void isofs_put_super(struct super_block *sb) @@ -65,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void isofs_destroy_inode(struct inode *inode) +static void isofs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } +static void isofs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, isofs_i_callback); +} + static void init_once(void *foo) { struct iso_inode_info *ei = foo; @@ -160,7 +183,7 @@ struct iso9660_options{ * Compute the hash for the isofs name corresponding to the dentry. 
*/ static int -isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) +isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -181,7 +204,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) +isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -206,100 +229,94 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) } /* - * Case insensitive compare of two isofs names. + * Compare of two isofs names. */ -static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a, - struct qstr *b, int ms) +static int isofs_dentry_cmp_common( + unsigned int len, const char *str, + const struct qstr *name, int ms, int ci) { int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ - alen = a->len; - blen = b->len; + alen = name->len; + blen = len; if (ms) { - while (alen && a->name[alen-1] == '.') + while (alen && name->name[alen-1] == '.') alen--; - while (blen && b->name[blen-1] == '.') + while (blen && str[blen-1] == '.') blen--; } if (alen == blen) { - if (strnicmp(a->name, b->name, alen) == 0) - return 0; - } - return 1; -} - -/* - * Case sensitive compare of two isofs names. - */ -static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a, - struct qstr *b, int ms) -{ - int alen, blen; - - /* A filename cannot end in '.' 
or we treat it like it has none */ - alen = a->len; - blen = b->len; - if (ms) { - while (alen && a->name[alen-1] == '.') - alen--; - while (blen && b->name[blen-1] == '.') - blen--; - } - if (alen == blen) { - if (strncmp(a->name, b->name, alen) == 0) - return 0; + if (ci) { + if (strnicmp(name->name, str, alen) == 0) + return 0; + } else { + if (strncmp(name->name, str, alen) == 0) + return 0; + } } return 1; } static int -isofs_hash(struct dentry *dentry, struct qstr *qstr) +isofs_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 0); } static int -isofs_hashi(struct dentry *dentry, struct qstr *qstr) +isofs_hashi(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 0); } static int -isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmp_common(dentry, a, b, 0); + return isofs_dentry_cmp_common(len, str, name, 0, 0); } static int -isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmpi_common(dentry, a, b, 0); + return isofs_dentry_cmp_common(len, str, name, 0, 1); } #ifdef CONFIG_JOLIET static int -isofs_hash_ms(struct dentry *dentry, struct qstr *qstr) +isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 1); } static int -isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) +isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { return 
isofs_hashi_common(dentry, qstr, 1); } static int -isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmp_common(dentry, a, b, 1); + return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int -isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) +isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - return isofs_dentry_cmpi_common(dentry, a, b, 1); + return isofs_dentry_cmp_common(len, str, name, 1, 1); } #endif @@ -922,17 +939,18 @@ root_found: goto out_iput; } - /* get the root dentry */ - s->s_root = d_alloc_root(inode); - if (!(s->s_root)) - goto out_no_root; - table = 0; if (joliet_level) table += 2; if (opt.check == 'r') table++; - s->s_root->d_op = &isofs_dentry_ops[table]; + + s->s_d_op = &isofs_dentry_ops[table]; + + /* get the root dentry */ + s->s_root = d_alloc_root(inode); + if (!(s->s_root)) + goto out_no_root; kfree(opt.iocharset); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 0d23abfd428..4fb3e8074fd 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -37,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen) qstr.name = compare; qstr.len = dlen; - return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr); + return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, + dentry->d_name.len, dentry->d_name.name, &qstr); } /* @@ -171,8 +172,6 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam struct inode *inode; struct page *page; - dentry->d_op = dir->i_sb->s_root->d_op; - page = alloc_page(GFP_USER); if (!page) return ERR_PTR(-ENOMEM); diff --git a/fs/jbd/transaction.c 
b/fs/jbd/transaction.c index 846a3f31411..5b2e4c30a2a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -207,7 +207,7 @@ repeat_locked: * the committing transaction. Really, we only need to give it * committing_transaction->t_outstanding_credits plus "enough" for * the log control blocks. - * Also, this test is inconsitent with the matching one in + * Also, this test is inconsistent with the matching one in * journal_extend(). */ if (__log_space_left(journal) < jbd_space_needed(journal)) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f837ba95352..9e4686900f1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include <linux/vmalloc.h> #include <linux/backing-dev.h> #include <linux/bitops.h> +#include <linux/ratelimit.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -93,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); +EXPORT_SYMBOL(jbd2_inode_cache); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -827,7 +829,7 @@ static journal_t * journal_init_common (void) journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) - goto fail; + return NULL; init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_logspace); @@ -852,14 +854,12 @@ static journal_t * journal_init_common (void) err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); if (err) { kfree(journal); - goto fail; + return NULL; } spin_lock_init(&journal->j_history_lock); return journal; -fail: - return NULL; } /* jbd2_journal_init_dev and jbd2_journal_init_inode: @@ -1982,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void) static struct journal_head *journal_alloc_journal_head(void) { struct journal_head *ret; - static 
unsigned long last_warning; #ifdef CONFIG_JBD2_DEBUG atomic_inc(&nr_journal_heads); @@ -1990,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); - if (time_after(jiffies, last_warning + 5*HZ)) { - printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __func__); - last_warning = jiffies; - } + pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); while (!ret) { yield(); ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); @@ -2292,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) #endif -struct kmem_cache *jbd2_handle_cache; +struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; static int __init journal_init_handle_cache(void) { - jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", - sizeof(handle_t), - 0, /* offset */ - SLAB_TEMPORARY, /* flags */ - NULL); /* ctor */ + jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); if (jbd2_handle_cache == NULL) { - printk(KERN_EMERG "JBD: failed to create handle cache\n"); + printk(KERN_EMERG "JBD2: failed to create handle cache\n"); + return -ENOMEM; + } + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (jbd2_inode_cache == NULL) { + printk(KERN_EMERG "JBD2: failed to create inode cache\n"); + kmem_cache_destroy(jbd2_handle_cache); return -ENOMEM; } return 0; @@ -2312,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void) { if (jbd2_handle_cache) kmem_cache_destroy(jbd2_handle_cache); + if (jbd2_inode_cache) + kmem_cache_destroy(jbd2_inode_cache); + } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 2bc4d5f116f..1cad869494f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal) #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); -#endif jbd_debug(1, "JBD: ignoring 
%d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); +#endif journal->j_transaction_sequence = ++info.end_transaction; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6bf0a242613..faad2bd787c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -251,7 +251,7 @@ repeat: * the committing transaction. Really, we only need to give it * committing_transaction->t_outstanding_credits plus "enough" for * the log control blocks. - * Also, this test is inconsitent with the matching one in + * Also, this test is inconsistent with the matching one in * jbd2_journal_extend(). */ if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { @@ -340,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) jbd2_free_handle(handle); current->journal_info = NULL; handle = ERR_PTR(err); - goto out; } -out: return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -589,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, transaction = handle->h_transaction; journal = transaction->t_journal; - jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: @@ -774,7 +772,7 @@ done: J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), "Possible IO failure.\n"); page = jh2bh(jh)->b_page; - offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; + offset = offset_in_page(jh2bh(jh)->b_data); source = kmap_atomic(page, KM_USER0); /* Fire data frozen trigger just before we copy the data */ jbd2_buffer_frozen_trigger(jh, source + offset, diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 54a92fd02bb..95b79672150 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_check_acl(struct inode *inode, int mask, unsigned int 
flags) { struct posix_acl *acl; int rc; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 5e42de8d954..3119f59253d 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_check_acl(struct inode *, int); +extern int jffs2_check_acl(struct inode *, int, unsigned int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 85c6be2db02..3005ec4520a 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; #ifndef __ECOS if (jffs2_blocks_use_vmalloc(c)) - c->blocks = vmalloc(size); + c->blocks = vzalloc(size); else #endif - c->blocks = kmalloc(size, GFP_KERNEL); + c->blocks = kzalloc(size, GFP_KERNEL); if (!c->blocks) return -ENOMEM; - memset(c->blocks, 0, size); for (i=0; i<c->nr_blocks; i++) { INIT_LIST_HEAD(&c->blocks[i].list); c->blocks[i].offset = i * c->sector_size; diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index f864005de64..0bc6a6c80a5 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -144,4 +144,4 @@ struct jffs2_sb_info { void *os_priv; }; -#endif /* _JFFS2_FB_SB */ +#endif /* _JFFS2_FS_SB */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index c86041b866a..853b8e30008 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -40,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) return &f->vfs_inode; } -static void jffs2_destroy_inode(struct inode *inode) +static void jffs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + 
INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); } +static void jffs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, jffs2_i_callback); +} + static void jffs2_i_init_once(void *foo) { struct jffs2_inode_info *f = foo; diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 9b572ca40a4..4f9cc048294 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", offset, je32_to_cpu(rx.hdr_crc), crc); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK @@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat je32_to_cpu(rx.xid), xd->xid, je32_to_cpu(rx.version), xd->version); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } xd->xprefix = rx.xprefix; xd->name_len = rx.name_len; @@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum ref_offset(xd->node), xd->data_crc, crc); kfree(data); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } xd->flags |= JFFS2_XFLAGS_HOT; @@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x if (xd->xname) return 0; if (xd->flags & JFFS2_XFLAGS_INVALID) - return EIO; + return -EIO; if (unlikely(is_xattr_datum_unchecked(c, xd))) rc = do_verify_xattr_datum(c, xd); if (!rc) @@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref if (crc != je32_to_cpu(rr.node_crc)) { JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", offset, je32_to_cpu(rr.node_crc), crc); - return EIO; + return -EIO; } if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK || je16_to_cpu(rr.nodetype) != 
JFFS2_NODETYPE_XREF @@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, je32_to_cpu(rr.totlen), PAD(sizeof(rr))); - return EIO; + return -EIO; } ref->ino = je32_to_cpu(rr.ino); ref->xid = je32_to_cpu(rr.xid); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 1057a4998e4..e5de9422fa3 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -114,10 +114,14 @@ out: return rc; } -int jfs_check_acl(struct inode *inode, int mask) +int jfs_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 54e07559878..f9285c4900f 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_check_acl(struct inode *, int); +int jfs_check_acl(struct inode *, int, unsigned int flags); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_acl_chmod(struct inode *inode); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index e1b8493b9aa..278e3fb40b7 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); if (IS_ERR(bdev)) { rc = -PTR_ERR(bdev); goto free; } - if ((rc = bd_claim(bdev, log))) { - goto close; - } - log->bdev = bdev; memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); @@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb) * initialize log: */ if ((rc = lmLogInit(log))) - goto unclaim; + goto close; list_add(&log->journal_list, 
&jfs_external_logs); @@ -1163,11 +1160,8 @@ journal_found: list_del(&log->journal_list); lbmLogShutdown(log); - unclaim: - bd_release(bdev); - close: /* close external log device */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb) bdev = log->bdev; rc = lmLogShutdown(log); - bd_release(bdev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); kfree(log); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 231ca4af9bc..81ead850ddb 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -18,6 +18,7 @@ */ #include <linux/fs.h> +#include <linux/namei.h> #include <linux/ctype.h> #include <linux/quotaops.h> #include <linux/exportfs.h> @@ -1464,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc jfs_info("jfs_lookup: name = %s", name); - if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2) - dentry->d_op = &jfs_ci_dentry_operations; - if ((name[0] == '.') && (len == 1)) inum = dip->i_ino; else if (strcmp(name, "..") == 0) @@ -1491,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc return ERR_CAST(ip); } - dentry = d_splice_alias(ip, dentry); - - if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)) - dentry->d_op = &jfs_ci_dentry_operations; - - return dentry; + return d_splice_alias(ip, dentry); } static struct inode *jfs_nfs_get_inode(struct super_block *sb, @@ -1573,7 +1566,8 @@ const struct file_operations jfs_dir_operations = { .llseek = generic_file_llseek, }; -static int jfs_ci_hash(struct dentry *dir, struct qstr *this) +static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, + struct qstr *this) { unsigned long hash; int i; @@ -1586,32 +1580,63 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this) return 0; } -static int jfs_ci_compare(struct 
dentry *dir, struct qstr *a, struct qstr *b) +static int jfs_ci_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; - if (a->len != b->len) + if (len != name->len) goto out; - for (i=0; i < a->len; i++) { - if (tolower(a->name[i]) != tolower(b->name[i])) + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) goto out; } result = 0; +out: + return result; +} +static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return -ECHILD; /* - * We want creates to preserve case. A negative dentry, a, that - * has a different case than b may cause a new entry to be created - * with the wrong case. Since we can't tell if a comes from a negative - * dentry, we blindly replace it with b. This should be harmless if - * a is not a negative dentry. + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, + * and will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. So it's unsupported like + * rename("filename", "FILENAME") for now. */ - memcpy((unsigned char *)a->name, b->name, a->len); -out: - return result; + if (dentry->d_inode) + return 1; + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!nd) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. 
+ */ + if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + } + return 1; } const struct dentry_operations jfs_ci_dentry_operations = { .d_hash = jfs_ci_hash, .d_compare = jfs_ci_compare, + .d_revalidate = jfs_ci_revalidate, }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0669fc1cc3b..eeca48a031a 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -115,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb) return &jfs_inode->vfs_inode; } +static void jfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct jfs_inode_info *ji = JFS_IP(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(jfs_inode_cachep, ji); +} + static void jfs_destroy_inode(struct inode *inode) { struct jfs_inode_info *ji = JFS_IP(inode); @@ -128,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode) ji->active_ag = -1; } spin_unlock_irq(&ji->ag_lock); - kmem_cache_free(jfs_inode_cachep, ji); + call_rcu(&inode->i_rcu, jfs_i_callback); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -507,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = JFS_SUPER_MAGIC; + if (sbi->mntflag & JFS_OS2) + sb->s_d_op = &jfs_ci_dentry_operations; + inode = jfs_iget(sb, ROOT_I); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -516,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) goto out_no_root; - if (sbi->mntflag & JFS_OS2) - sb->s_root->d_op = &jfs_ci_dentry_operations; - /* logical blocks are represented by 40 bits in pxd_t, etc. 
*/ sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; #if BITS_PER_LONG == 32 diff --git a/fs/libfs.c b/fs/libfs.c index a3accdf528a..c88eab55aec 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -16,6 +16,11 @@ #include <asm/uaccess.h> +static inline int simple_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -37,7 +42,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf) * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ -static int simple_delete_dentry(struct dentry *dentry) +static int simple_delete_dentry(const struct dentry *dentry) { return 1; } @@ -54,7 +59,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - dentry->d_op = &simple_dentry_operations; + d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, NULL); return NULL; } @@ -76,7 +81,8 @@ int dcache_dir_close(struct inode *inode, struct file *file) loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) { - mutex_lock(&file->f_path.dentry->d_inode->i_mutex); + struct dentry *dentry = file->f_path.dentry; + mutex_lock(&dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -84,7 +90,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) if (offset >= 0) break; default: - mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -94,21 +100,24 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) struct dentry *cursor = file->private_data; loff_t n = file->f_pos - 2; - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + /* d_lock not required for cursor */ list_del(&cursor->d_u.d_child); - p = 
file->f_path.dentry->d_subdirs.next; - while (n && p != &file->f_path.dentry->d_subdirs) { + p = dentry->d_subdirs.next; + while (n && p != &dentry->d_subdirs) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); - if (!d_unhashed(next) && next->d_inode) + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(next)) n--; + spin_unlock(&next->d_lock); p = p->next; } list_add_tail(&cursor->d_u.d_child, p); - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); } } - mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); return offset; } @@ -148,29 +157,35 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (filp->f_pos == 2) list_move(q, &dentry->d_subdirs); for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); - if (d_unhashed(next) || !next->d_inode) + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); continue; + } - spin_unlock(&dcache_lock); + spin_unlock(&next->d_lock); + spin_unlock(&dentry->d_lock); if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0) return 0; - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); /* next is still alive */ list_move(q, p); + spin_unlock(&next->d_lock); p = q; filp->f_pos++; } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); } return 0; } @@ -202,7 +217,8 @@ static const struct super_operations simple_super_operations = { * will never be mountable) */ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, - const struct super_operations *ops, unsigned long magic) + const struct super_operations *ops, + const struct dentry_operations 
*dops, unsigned long magic) { struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); struct dentry *dentry; @@ -239,6 +255,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, dentry->d_parent = dentry; d_instantiate(dentry, root); s->s_root = dentry; + s->s_d_op = dops; s->s_flags |= MS_ACTIVE; return dget(s->s_root); @@ -259,23 +276,23 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den return 0; } -static inline int simple_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - int simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; - spin_lock(&dcache_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) - if (simple_positive(child)) + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); goto out; + } + spin_unlock(&child->d_lock); + } ret = 1; out: - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); return ret; } diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 97f6073ab33..ca58d64374c 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_LOCKD) += lockd.o -lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ - svcproc.o svcsubs.o mon.o xdr.o grace.o -lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o +lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ + svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o +lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c new file mode 100644 index 00000000000..f848b52c67b --- /dev/null +++ b/fs/lockd/clnt4xdr.c @@ -0,0 +1,605 @@ +/* + * linux/fs/lockd/clnt4xdr.c + * + * XDR functions to encode/decode NLM version 4 RPC arguments and results. 
+ * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. + */ + +#include <linux/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/lockd/lockd.h> + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN) +# error "NLM host name cannot be larger than NLM's maximum string length!" +#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM4_void_sz (0) +#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2)) +#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz) +#define NLM4_holder_sz (6+NLM4_owner_sz) + +#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz) +#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz) +#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz) +#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz) + +#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz) +#define NLM4_res_sz (NLM4_cookie_sz+1) +#define NLM4_norep_sz (0) + + +static s64 loff_t_to_s64(loff_t offset) +{ + s64 res; + + if (offset >= NLM4_OFFSET_MAX) + res = NLM4_OFFSET_MAX; + else if (offset <= -NLM4_OFFSET_MAX) + res = -NLM4_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm4_compute_offsets(const struct nlm_lock *lock, + u64 *l_offset, u64 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + BUG_ON(fl->fl_start > NLM4_OFFSET_MAX); + BUG_ON(fl->fl_end > NLM4_OFFSET_MAX && + fl->fl_end != OFFSET_MAX); + + *l_offset = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows 
out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv4 basic data types + * + * Basic NLMv4 data types are defined in Appendix II, section 6.1.4 + * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter + * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj<MAXNETOBJ_SZ> + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + BUG_ON(length > XDR_MAX_NETOBJ); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + BUG_ON(cookie->len > NLM_MAXCOOKIELEN); + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int 
decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + BUG_ON(fh->size > NFS3_FHSIZE); + encode_netobj(xdr, (u8 *)&fh->data, fh->size); +} + +/* + * enum nlm4_stats { + * NLM4_GRANTED = 0, + * NLM4_DENIED = 1, + * NLM4_DENIED_NOLOCKS = 2, + * NLM4_BLOCKED = 3, + * NLM4_DENIED_GRACE_PERIOD = 4, + * NLM4_DEADLCK = 5, + * NLM4_ROFS = 6, + * NLM4_STALE_FH = 7, + * NLM4_FBIG = 8, + * NLM4_FAILED = 9 + * }; + * + * struct nlm4_stat { + * nlm4_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. 
+ */ +static void encode_nlm4_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_FAILED); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(*p > nlm4_failed)) + goto out_bad_xdr; + *stat = *p; + return 0; +out_bad_xdr: + dprintk("%s: server returned invalid nlm4_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm4_holder { + * bool exclusive; + * int32 svid; + * netobj oh; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u64 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + +static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u64 l_offset, l_len; + u32 exclusive; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 8 + 8); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? 
F_WRLCK : F_RDLCK; + p = xdr_decode_hyper(p, &l_offset); + xdr_decode_hyper(p, &l_len); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name<LM_MAXSTRLEN>; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + BUG_ON(length > NLM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm4_lock { + * string caller_name<LM_MAXSTRLEN>; + * netobj fh; + * netobj oh; + * int32 svid; + * uint64 l_offset; + * uint64 l_len; + * }; + */ +static void encode_nlm4_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u64 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 8 + 8); + *p++ = cpu_to_be32(lock->svid); + + nlm4_compute_offsets(lock, &l_offset, &l_len); + p = xdr_encode_hyper(p, l_offset); + xdr_encode_hyper(p, l_len); +} + + +/* + * NLMv4 XDR encode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". 
+ */ + +/* + * struct nlm4_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm4_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_unlockargs { + * netobj cookie; + * struct nlm4_lock alock; + * }; + */ +static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm4_lock(xdr, lock); +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static void nlm4_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); +} 
+ +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static void nlm4_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm4_stat(xdr, result->status); + if (result->status == nlm_lck_denied) + encode_nlm4_holder(xdr, result); +} + + +/* + * NLMv4 XDR decode functions + * + * NLMv4 argument types are defined in Appendix II of RFC 1813: + * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's + * "Protocols for Interworking: XNFS, Version 3W". + */ + +/* + * union nlm4_testrply switch (nlm4_stats stat) { + * case NLM4_DENIED: + * struct nlm4_holder holder; + * default: + * void; + * }; + * + * struct nlm4_testres { + * netobj cookie; + * nlm4_testrply test_stat; + * }; + */ +static int decode_nlm4_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm4_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm4_holder(xdr, result); +out: + return error; +} + +static int nlm4_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm4_res { + * netobj cookie; + * nlm4_stat stat; + * }; + */ +static int nlm4_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm4_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ +#define nlm4_xdr_dec_norep NULL 
+ +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \ + .p_arglen = NLM4_##argtype##_sz, \ + .p_replen = NLM4_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm4_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +struct rpc_version nlm_version4 = { + .number = 4, + .nrprocs = ARRAY_SIZE(nlm4_procedures), + .procs = nlm4_procedures, +}; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 25509eb28fd..8d4ea8351e3 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init); */ void nlmclnt_done(struct nlm_host *host) { - nlm_release_host(host); + nlmclnt_release_host(host); lockd_down(); } EXPORT_SYMBOL_GPL(nlmclnt_done); @@ -273,7 +273,7 @@ restart: spin_unlock(&nlm_blocked_lock); /* Release host handle after use */ - nlm_release_host(host); + nlmclnt_release_host(host); lockd_down(); return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 332c54cf75e..adb45ec9038 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -58,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner) return; list_del(&lockowner->list); spin_unlock(&lockowner->host->h_lock); - nlm_release_host(lockowner->host); + nlmclnt_release_host(lockowner->host); kfree(lockowner); } @@ -207,22 +207,22 @@ struct nlm_rqst 
*nlm_alloc_call(struct nlm_host *host) printk("nlm_alloc_call: failed, waiting for memory\n"); schedule_timeout_interruptible(5*HZ); } - nlm_release_host(host); + nlmclnt_release_host(host); return NULL; } -void nlm_release_call(struct nlm_rqst *call) +void nlmclnt_release_call(struct nlm_rqst *call) { if (!atomic_dec_and_test(&call->a_count)) return; - nlm_release_host(call->a_host); + nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); } static void nlmclnt_rpc_release(void *data) { - nlm_release_call(data); + nlmclnt_release_call(data); } static int nlm_wait_on_grace(wait_queue_head_t *queue) @@ -436,7 +436,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) status = nlm_stat_to_errno(req->a_res.status); } out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; } @@ -593,7 +593,7 @@ again: out_unblock: nlmclnt_finish_block(block); out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; out_unlock: /* Fatal error: ensure that we remove the lock altogether */ @@ -694,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) /* What to do now? I'm out of my depth... */ status = -ENOLCK; out: - nlm_release_call(req); + nlmclnt_release_call(req); return status; } @@ -755,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) status = -ENOLCK; - nlm_release_call(req); + nlmclnt_release_call(req); return status; } diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c new file mode 100644 index 00000000000..180ac34feb9 --- /dev/null +++ b/fs/lockd/clntxdr.c @@ -0,0 +1,627 @@ +/* + * linux/fs/lockd/clntxdr.c + * + * XDR functions to encode/decode NLM version 3 RPC arguments and results. + * NLM version 3 is backwards compatible with NLM versions 1 and 2. + * + * NLM client-side only. + * + * Copyright (C) 2010, Oracle. All rights reserved. 
+ */ + +#include <linux/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/lockd/lockd.h> + +#define NLMDBG_FACILITY NLMDBG_XDR + +#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) +# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" +#endif + +/* + * Declare the space requirements for NLM arguments and replies as + * number of 32bit-words + */ +#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2)) +#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2)) +#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2)) +#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz) +#define NLM_holder_sz (4+NLM_owner_sz) + +#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz) +#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz) +#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz) +#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz) + +#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz) +#define NLM_res_sz (NLM_cookie_sz+1) +#define NLM_norep_sz (0) + + +static s32 loff_t_to_s32(loff_t offset) +{ + s32 res; + + if (offset >= NLM_OFFSET_MAX) + res = NLM_OFFSET_MAX; + else if (offset <= -NLM_OFFSET_MAX) + res = -NLM_OFFSET_MAX; + else + res = offset; + return res; +} + +static void nlm_compute_offsets(const struct nlm_lock *lock, + u32 *l_offset, u32 *l_len) +{ + const struct file_lock *fl = &lock->fl; + + BUG_ON(fl->fl_start > NLM_OFFSET_MAX); + BUG_ON(fl->fl_end > NLM_OFFSET_MAX && + fl->fl_end != OFFSET_MAX); + + *l_offset = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + *l_len = 0; + else + *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("lockd: %s prematurely hit the end of our receive buffer. 
" + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NLMv3 basic data types + * + * Basic NLMv3 data types are not defined in an IETF standards + * document. X/Open has a description of these data types that + * is useful. See Chapter 10 of "Protocols for Interworking: + * XNFS, Version 3W". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +static void encode_bool(struct xdr_stream *xdr, const int value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = value ? xdr_one : xdr_zero; +} + +static void encode_int32(struct xdr_stream *xdr, const s32 value) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); +} + +/* + * typedef opaque netobj<MAXNETOBJ_SZ> + */ +static void encode_netobj(struct xdr_stream *xdr, + const u8 *data, const unsigned int length) +{ + __be32 *p; + + BUG_ON(length > XDR_MAX_NETOBJ); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, data, length); +} + +static int decode_netobj(struct xdr_stream *xdr, + struct xdr_netobj *obj) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > XDR_MAX_NETOBJ)) + goto out_size; + obj->len = length; + obj->data = (u8 *)p; + return 0; +out_size: + dprintk("NFS: returned netobj was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj cookie; + */ +static void encode_cookie(struct xdr_stream *xdr, + const struct nlm_cookie *cookie) +{ + BUG_ON(cookie->len > NLM_MAXCOOKIELEN); + encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); +} + +static int decode_cookie(struct xdr_stream *xdr, + struct nlm_cookie *cookie) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto 
out_overflow; + length = be32_to_cpup(p++); + /* apparently HPUX can return empty cookies */ + if (length == 0) + goto out_hpux; + if (length > NLM_MAXCOOKIELEN) + goto out_size; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + cookie->len = length; + memcpy(cookie->data, p, length); + return 0; +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return 0; +out_size: + dprintk("NFS: returned cookie was too long: %u\n", length); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * netobj fh; + */ +static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + BUG_ON(fh->size != NFS2_FHSIZE); + encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); +} + +/* + * enum nlm_stats { + * LCK_GRANTED = 0, + * LCK_DENIED = 1, + * LCK_DENIED_NOLOCKS = 2, + * LCK_BLOCKED = 3, + * LCK_DENIED_GRACE_PERIOD = 4 + * }; + * + * + * struct nlm_stat { + * nlm_stats stat; + * }; + * + * NB: we don't swap bytes for the NLM status values. The upper + * layers deal directly with the status value in network byte + * order. 
+ */ + +static void encode_nlm_stat(struct xdr_stream *xdr, + const __be32 stat) +{ + __be32 *p; + + BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD); + p = xdr_reserve_space(xdr, 4); + *p = stat; +} + +static int decode_nlm_stat(struct xdr_stream *xdr, + __be32 *stat) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (unlikely(*p > nlm_lck_denied_grace_period)) + goto out_enum; + *stat = *p; + return 0; +out_enum: + dprintk("%s: server returned invalid nlm_stats value: %u\n", + __func__, be32_to_cpup(p)); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * struct nlm_holder { + * bool exclusive; + * int uppid; + * netobj oh; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_holder(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + const struct nlm_lock *lock = &result->lock; + u32 l_offset, l_len; + __be32 *p; + + encode_bool(xdr, lock->fl.fl_type == F_RDLCK); + encode_int32(xdr, lock->svid); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4); + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + +static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) +{ + struct nlm_lock *lock = &result->lock; + struct file_lock *fl = &lock->fl; + u32 exclusive, l_offset, l_len; + int error; + __be32 *p; + s32 end; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(fl); + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + exclusive = be32_to_cpup(p++); + lock->svid = be32_to_cpup(p); + fl->fl_pid = (pid_t)lock->svid; + + error = decode_netobj(xdr, &lock->oh); + if (unlikely(error)) + goto out; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + + fl->fl_flags = FL_POSIX; + fl->fl_type = exclusive != 0 ? 
F_WRLCK : F_RDLCK; + l_offset = be32_to_cpup(p++); + l_len = be32_to_cpup(p); + end = l_offset + l_len - 1; + + fl->fl_start = (loff_t)l_offset; + if (l_len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = (loff_t)end; + error = 0; +out: + return error; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * string caller_name<LM_MAXSTRLEN>; + */ +static void encode_caller_name(struct xdr_stream *xdr, const char *name) +{ + /* NB: client-side does not set lock->len */ + u32 length = strlen(name); + __be32 *p; + + BUG_ON(length > NLM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +/* + * struct nlm_lock { + * string caller_name<LM_MAXSTRLEN>; + * netobj fh; + * netobj oh; + * int uppid; + * unsigned l_offset; + * unsigned l_len; + * }; + */ +static void encode_nlm_lock(struct xdr_stream *xdr, + const struct nlm_lock *lock) +{ + u32 l_offset, l_len; + __be32 *p; + + encode_caller_name(xdr, lock->caller); + encode_fh(xdr, &lock->fh); + encode_netobj(xdr, lock->oh.data, lock->oh.len); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(lock->svid); + + nlm_compute_offsets(lock, &l_offset, &l_len); + *p++ = cpu_to_be32(l_offset); + *p = cpu_to_be32(l_len); +} + + +/* + * NLMv3 XDR encode functions + * + * NLMv3 argument types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". 
+ */ + +/* + * struct nlm_testargs { + * netobj cookie; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_testargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_lockargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * bool reclaim; + * int state; + * }; + */ +static void nlm_xdr_enc_lockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); + encode_bool(xdr, args->reclaim); + encode_int32(xdr, args->state); +} + +/* + * struct nlm_cancargs { + * netobj cookie; + * bool block; + * bool exclusive; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_cancargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_bool(xdr, args->block); + encode_bool(xdr, lock->fl.fl_type == F_WRLCK); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_unlockargs { + * netobj cookie; + * struct nlm_lock alock; + * }; + */ +static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_args *args) +{ + const struct nlm_lock *lock = &args->lock; + + encode_cookie(xdr, &args->cookie); + encode_nlm_lock(xdr, lock); +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static void nlm_xdr_enc_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); +} + +/* + * union 
nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static void encode_nlm_testrply(struct xdr_stream *xdr, + const struct nlm_res *result) +{ + if (result->status == nlm_lck_denied) + encode_nlm_holder(xdr, result); +} + +static void nlm_xdr_enc_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nlm_res *result) +{ + encode_cookie(xdr, &result->cookie); + encode_nlm_stat(xdr, result->status); + encode_nlm_testrply(xdr, result); +} + + +/* + * NLMv3 XDR decode functions + * + * NLMv3 result types are defined in Chapter 10 of The Open Group's + * "Protocols for Interworking: XNFS, Version 3W". + */ + +/* + * union nlm_testrply switch (nlm_stats stat) { + * case LCK_DENIED: + * struct nlm_holder holder; + * default: + * void; + * }; + * + * struct nlm_testres { + * netobj cookie; + * nlm_testrply test_stat; + * }; + */ +static int decode_nlm_testrply(struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_nlm_stat(xdr, &result->status); + if (unlikely(error)) + goto out; + if (result->status == nlm_lck_denied) + error = decode_nlm_holder(xdr, result); +out: + return error; +} + +static int nlm_xdr_dec_testres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_testrply(xdr, result); +out: + return error; +} + +/* + * struct nlm_res { + * netobj cookie; + * nlm_stat stat; + * }; + */ +static int nlm_xdr_dec_res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nlm_res *result) +{ + int error; + + error = decode_cookie(xdr, &result->cookie); + if (unlikely(error)) + goto out; + error = decode_nlm_stat(xdr, &result->status); +out: + return error; +} + + +/* + * For NLM, a void procedure really returns nothing + */ 
+#define nlm_xdr_dec_norep NULL + +#define PROC(proc, argtype, restype) \ +[NLMPROC_##proc] = { \ + .p_proc = NLMPROC_##proc, \ + .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \ + .p_arglen = NLM_##argtype##_sz, \ + .p_replen = NLM_##restype##_sz, \ + .p_statidx = NLMPROC_##proc, \ + .p_name = #proc, \ + } + +static struct rpc_procinfo nlm_procedures[] = { + PROC(TEST, testargs, testres), + PROC(LOCK, lockargs, res), + PROC(CANCEL, cancargs, res), + PROC(UNLOCK, unlockargs, res), + PROC(GRANTED, testargs, res), + PROC(TEST_MSG, testargs, norep), + PROC(LOCK_MSG, lockargs, norep), + PROC(CANCEL_MSG, cancargs, norep), + PROC(UNLOCK_MSG, unlockargs, norep), + PROC(GRANTED_MSG, testargs, norep), + PROC(TEST_RES, testres, norep), + PROC(LOCK_RES, res, norep), + PROC(CANCEL_RES, res, norep), + PROC(UNLOCK_RES, res, norep), + PROC(GRANTED_RES, res, norep), +}; + +static struct rpc_version nlm_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static struct rpc_version nlm_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(nlm_procedures), + .procs = nlm_procedures, +}; + +static struct rpc_version *nlm_versions[] = { + [1] = &nlm_version1, + [3] = &nlm_version3, +#ifdef CONFIG_LOCKD_V4 + [4] = &nlm_version4, +#endif +}; + +static struct rpc_stat nlm_rpc_stats; + +struct rpc_program nlm_program = { + .name = "lockd", + .number = NLM_PROGRAM, + .nrvers = ARRAY_SIZE(nlm_versions), + .version = nlm_versions, + .stats = &nlm_rpc_stats, +}; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index ed0c59fe23c..5f1bcb2f06f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -25,9 +25,22 @@ #define NLM_HOST_EXPIRE (300 * HZ) #define NLM_HOST_COLLECT (120 * HZ) -static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; + +#define for_each_host(host, pos, chain, 
table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry((host), (pos), (chain), h_hash) + +#define for_each_host_safe(host, pos, next, chain, table) \ + for ((chain) = (table); \ + (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ + hlist_for_each_entry_safe((host), (pos), (next), \ + (chain), h_hash) + static unsigned long next_gc; -static int nrhosts; +static unsigned long nrhosts; static DEFINE_MUTEX(nlm_host_mutex); static void nlm_gc_hosts(void); @@ -40,8 +53,6 @@ struct nlm_lookup_host_info { const u32 version; /* NLM version to search for */ const char *hostname; /* remote's hostname */ const size_t hostname_len; /* it's length */ - const struct sockaddr *src_sap; /* our address (optional) */ - const size_t src_len; /* it's length */ const int noresvport; /* use non-priv port */ }; @@ -88,127 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap) } /* - * Common host lookup routine for server & client + * Allocate and initialize an nlm_host. Common to both client and server. */ -static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) +static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, + struct nsm_handle *nsm) { - struct hlist_head *chain; - struct hlist_node *pos; - struct nlm_host *host; - struct nsm_handle *nsm = NULL; - - mutex_lock(&nlm_host_mutex); + struct nlm_host *host = NULL; + unsigned long now = jiffies; - if (time_after_eq(jiffies, next_gc)) - nlm_gc_hosts(); - - /* We may keep several nlm_host objects for a peer, because each - * nlm_host is identified by - * (address, protocol, version, server/client) - * We could probably simplify this a little by putting all those - * different NLM rpc_clients into one single nlm_host object. - * This would allow us to have one nlm_host per address. 
- */ - chain = &nlm_hosts[nlm_hash_address(ni->sap)]; - hlist_for_each_entry(host, pos, chain, h_hash) { - if (!rpc_cmp_addr(nlm_addr(host), ni->sap)) - continue; - - /* See if we have an NSM handle for this client */ - if (!nsm) - nsm = host->h_nsmhandle; - - if (host->h_proto != ni->protocol) - continue; - if (host->h_version != ni->version) - continue; - if (host->h_server != ni->server) - continue; - if (ni->server && ni->src_len != 0 && - !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) - continue; - - /* Move to head of hash chain. */ - hlist_del(&host->h_hash); - hlist_add_head(&host->h_hash, chain); - - nlm_get_host(host); - dprintk("lockd: nlm_lookup_host found host %s (%s)\n", - host->h_name, host->h_addrbuf); - goto out; - } - - /* - * The host wasn't in our hash table. If we don't - * have an NSM handle for it yet, create one. - */ - if (nsm) + if (nsm != NULL) atomic_inc(&nsm->sm_count); else { host = NULL; nsm = nsm_get_handle(ni->sap, ni->salen, ni->hostname, ni->hostname_len); - if (!nsm) { - dprintk("lockd: nlm_lookup_host failed; " - "no nsm handle\n"); + if (unlikely(nsm == NULL)) { + dprintk("lockd: %s failed; no nsm handle\n", + __func__); goto out; } } - host = kzalloc(sizeof(*host), GFP_KERNEL); - if (!host) { + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (unlikely(host == NULL)) { + dprintk("lockd: %s failed; no memory\n", __func__); nsm_release(nsm); - dprintk("lockd: nlm_lookup_host failed; no memory\n"); goto out; } - host->h_name = nsm->sm_name; - host->h_addrbuf = nsm->sm_addrbuf; + memcpy(nlm_addr(host), ni->sap, ni->salen); - host->h_addrlen = ni->salen; + host->h_addrlen = ni->salen; rpc_set_port(nlm_addr(host), 0); - memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); - host->h_srcaddrlen = ni->src_len; + host->h_srcaddrlen = 0; + + host->h_rpcclnt = NULL; + host->h_name = nsm->sm_name; host->h_version = ni->version; host->h_proto = ni->protocol; - host->h_rpcclnt = NULL; - mutex_init(&host->h_mutex); - host->h_nextrebind = 
jiffies + NLM_HOST_REBIND; - host->h_expires = jiffies + NLM_HOST_EXPIRE; - atomic_set(&host->h_count, 1); + host->h_reclaiming = 0; + host->h_server = ni->server; + host->h_noresvport = ni->noresvport; + host->h_inuse = 0; init_waitqueue_head(&host->h_gracewait); init_rwsem(&host->h_rwsem); - host->h_state = 0; /* pseudo NSM state */ - host->h_nsmstate = 0; /* real NSM state */ - host->h_nsmhandle = nsm; - host->h_server = ni->server; - host->h_noresvport = ni->noresvport; - hlist_add_head(&host->h_hash, chain); + host->h_state = 0; + host->h_nsmstate = 0; + host->h_pidcount = 0; + atomic_set(&host->h_count, 1); + mutex_init(&host->h_mutex); + host->h_nextrebind = now + NLM_HOST_REBIND; + host->h_expires = now + NLM_HOST_EXPIRE; INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); INIT_LIST_HEAD(&host->h_granted); INIT_LIST_HEAD(&host->h_reclaim); - - nrhosts++; - - dprintk("lockd: nlm_lookup_host created host %s\n", - host->h_name); + host->h_nsmhandle = nsm; + host->h_addrbuf = nsm->sm_addrbuf; out: - mutex_unlock(&nlm_host_mutex); return host; } /* - * Destroy a host + * Destroy an nlm_host and free associated resources + * + * Caller must hold nlm_host_mutex. 
*/ -static void -nlm_destroy_host(struct nlm_host *host) +static void nlm_destroy_host_locked(struct nlm_host *host) { struct rpc_clnt *clnt; + dprintk("lockd: destroy host %s\n", host->h_name); + BUG_ON(!list_empty(&host->h_lockowners)); BUG_ON(atomic_read(&host->h_count)); + hlist_del_init(&host->h_hash); + nsm_unmonitor(host); nsm_release(host->h_nsmhandle); @@ -216,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host) if (clnt != NULL) rpc_shutdown_client(clnt); kfree(host); + + nrhosts--; } /** @@ -249,12 +218,76 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .hostname_len = strlen(hostname), .noresvport = noresvport, }; + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + struct nsm_handle *nsm = NULL; dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, (hostname ? hostname : "<none>"), version, (protocol == IPPROTO_UDP ? "udp" : "tcp")); - return nlm_lookup_host(&ni); + mutex_lock(&nlm_host_mutex); + + chain = &nlm_client_hosts[nlm_hash_address(sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), sap)) + continue; + + /* Same address. 
Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != protocol) + continue; + if (host->h_version != version) + continue; + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + goto out; + } + + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", __func__, + host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmclnt_release_host - release client nlm_host + * @host: nlm_host to release + * + */ +void nlmclnt_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release client host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(host->h_server); + + if (atomic_dec_and_test(&host->h_count)) { + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(!list_empty(&host->h_granted)); + BUG_ON(!list_empty(&host->h_reclaim)); + + mutex_lock(&nlm_host_mutex); + nlm_destroy_host_locked(host); + mutex_unlock(&nlm_host_mutex); + } } /** @@ -279,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const char *hostname, const size_t hostname_len) { + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host = NULL; + struct nsm_handle *nsm = NULL; struct sockaddr_in sin = { .sin_family = AF_INET, }; struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, }; + struct sockaddr *src_sap; + size_t src_len = rqstp->rq_addrlen; struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -293,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, .version = rqstp->rq_vers, .hostname = hostname, .hostname_len = hostname_len, - .src_len = rqstp->rq_addrlen, }; dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, 
rqstp->rq_vers, (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + mutex_lock(&nlm_host_mutex); + switch (ni.sap->sa_family) { case AF_INET: sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; - ni.src_sap = (struct sockaddr *)&sin; + src_sap = (struct sockaddr *)&sin; break; case AF_INET6: ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); - ni.src_sap = (struct sockaddr *)&sin6; + src_sap = (struct sockaddr *)&sin6; break; default: - return NULL; + dprintk("lockd: %s failed; unrecognized address family\n", + __func__); + goto out; + } + + if (time_after_eq(jiffies, next_gc)) + nlm_gc_hosts(); + + chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) + continue; + + /* Same address. Share an NSM handle if we already have one */ + if (nsm == NULL) + nsm = host->h_nsmhandle; + + if (host->h_proto != ni.protocol) + continue; + if (host->h_version != ni.version) + continue; + if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap)) + continue; + + /* Move to head of hash chain. */ + hlist_del(&host->h_hash); + hlist_add_head(&host->h_hash, chain); + + nlm_get_host(host); + dprintk("lockd: %s found host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + goto out; } - return nlm_lookup_host(&ni); + host = nlm_alloc_host(&ni, nsm); + if (unlikely(host == NULL)) + goto out; + + memcpy(nlm_srcaddr(host), src_sap, src_len); + host->h_srcaddrlen = src_len; + hlist_add_head(&host->h_hash, chain); + nrhosts++; + + dprintk("lockd: %s created host %s (%s)\n", + __func__, host->h_name, host->h_addrbuf); + +out: + mutex_unlock(&nlm_host_mutex); + return host; +} + +/** + * nlmsvc_release_host - release server nlm_host + * @host: nlm_host to release + * + * Host is destroyed later in nlm_gc_host(). 
+ */ +void nlmsvc_release_host(struct nlm_host *host) +{ + if (host == NULL) + return; + + dprintk("lockd: release server host %s\n", host->h_name); + + BUG_ON(atomic_read(&host->h_count) < 0); + BUG_ON(!host->h_server); + atomic_dec(&host->h_count); } /* @@ -413,20 +516,28 @@ struct nlm_host * nlm_get_host(struct nlm_host *host) return host; } -/* - * Release NLM host after use - */ -void nlm_release_host(struct nlm_host *host) +static struct nlm_host *next_host_state(struct hlist_head *cache, + struct nsm_handle *nsm, + const struct nlm_reboot *info) { - if (host != NULL) { - dprintk("lockd: release host %s\n", host->h_name); - BUG_ON(atomic_read(&host->h_count) < 0); - if (atomic_dec_and_test(&host->h_count)) { - BUG_ON(!list_empty(&host->h_lockowners)); - BUG_ON(!list_empty(&host->h_granted)); - BUG_ON(!list_empty(&host->h_reclaim)); + struct nlm_host *host = NULL; + struct hlist_head *chain; + struct hlist_node *pos; + + mutex_lock(&nlm_host_mutex); + for_each_host(host, pos, chain, cache) { + if (host->h_nsmhandle == nsm + && host->h_nsmstate != info->state) { + host->h_nsmstate = info->state; + host->h_state++; + + nlm_get_host(host); + goto out; } } +out: + mutex_unlock(&nlm_host_mutex); + return host; } /** @@ -438,8 +549,6 @@ void nlm_release_host(struct nlm_host *host) */ void nlm_host_rebooted(const struct nlm_reboot *info) { - struct hlist_head *chain; - struct hlist_node *pos; struct nsm_handle *nsm; struct nlm_host *host; @@ -452,32 +561,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info) * lock for this. * To avoid processing a host several times, we match the nsmstate. 
*/ -again: mutex_lock(&nlm_host_mutex); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) { - if (host->h_nsmhandle == nsm - && host->h_nsmstate != info->state) { - host->h_nsmstate = info->state; - host->h_state++; - - nlm_get_host(host); - mutex_unlock(&nlm_host_mutex); - - if (host->h_server) { - /* We're server for this guy, just ditch - * all the locks he held. */ - nlmsvc_free_host_resources(host); - } else { - /* He's the server, initiate lock recovery. */ - nlmclnt_recovery(host); - } - - nlm_release_host(host); - goto again; - } - } + while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) { + nlmsvc_free_host_resources(host); + nlmsvc_release_host(host); } - mutex_unlock(&nlm_host_mutex); + while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) { + nlmclnt_recovery(host); + nlmclnt_release_host(host); + } + nsm_release(nsm); } @@ -497,13 +589,11 @@ nlm_shutdown_hosts(void) /* First, make all hosts eligible for gc */ dprintk("lockd: nuking all hosts...\n"); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) { - host->h_expires = jiffies - 1; - if (host->h_rpcclnt) { - rpc_shutdown_client(host->h_rpcclnt); - host->h_rpcclnt = NULL; - } + for_each_host(host, pos, chain, nlm_server_hosts) { + host->h_expires = jiffies - 1; + if (host->h_rpcclnt) { + rpc_shutdown_client(host->h_rpcclnt); + host->h_rpcclnt = NULL; } } @@ -512,15 +602,13 @@ nlm_shutdown_hosts(void) mutex_unlock(&nlm_host_mutex); /* complain if any hosts are left */ - if (nrhosts) { + if (nrhosts != 0) { printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); - dprintk("lockd: %d hosts left:\n", nrhosts); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) { - dprintk(" %s (cnt %d use %d exp %ld)\n", - host->h_name, atomic_read(&host->h_count), - 
host->h_inuse, host->h_expires); - } + dprintk("lockd: %lu hosts left:\n", nrhosts); + for_each_host(host, pos, chain, nlm_server_hosts) { + dprintk(" %s (cnt %d use %d exp %ld)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires); } } } @@ -538,29 +626,22 @@ nlm_gc_hosts(void) struct nlm_host *host; dprintk("lockd: host garbage collection\n"); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry(host, pos, chain, h_hash) - host->h_inuse = 0; - } + for_each_host(host, pos, chain, nlm_server_hosts) + host->h_inuse = 0; /* Mark all hosts that hold locks, blocks or shares */ nlmsvc_mark_resources(); - for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { - hlist_for_each_entry_safe(host, pos, next, chain, h_hash) { - if (atomic_read(&host->h_count) || host->h_inuse - || time_before(jiffies, host->h_expires)) { - dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n", - host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires); - continue; - } - dprintk("lockd: delete host %s\n", host->h_name); - hlist_del_init(&host->h_hash); - - nlm_destroy_host(host); - nrhosts--; + for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { + if (atomic_read(&host->h_count) || host->h_inuse + || time_before(jiffies, host->h_expires)) { + dprintk("nlm_gc_hosts skipping %s " + "(cnt %d use %d exp %ld)\n", + host->h_name, atomic_read(&host->h_count), + host->h_inuse, host->h_expires); + continue; } + nlm_destroy_host_locked(host); } next_gc = jiffies + NLM_HOST_COLLECT; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e0c91894964..23d7451b293 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -401,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm) * Status Monitor wire protocol. 
*/ -static int encode_nsm_string(struct xdr_stream *xdr, const char *string) +static void encode_nsm_string(struct xdr_stream *xdr, const char *string) { const u32 len = strlen(string); __be32 *p; - if (unlikely(len > SM_MAXSTRLEN)) - return -EIO; - p = xdr_reserve_space(xdr, sizeof(u32) + len); - if (unlikely(p == NULL)) - return -EIO; + BUG_ON(len > SM_MAXSTRLEN); + p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, string, len); - return 0; } /* * "mon_name" specifies the host to be monitored. */ -static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) +static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) { - return encode_nsm_string(xdr, argp->mon_name); + encode_nsm_string(xdr, argp->mon_name); } /* @@ -429,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" * has changed. */ -static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) +static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) { - int status; __be32 *p; - status = encode_nsm_string(xdr, utsname()->nodename); - if (unlikely(status != 0)) - return status; - p = xdr_reserve_space(xdr, 3 * sizeof(u32)); - if (unlikely(p == NULL)) - return -EIO; - *p++ = htonl(argp->prog); - *p++ = htonl(argp->vers); - *p++ = htonl(argp->proc); - return 0; + encode_nsm_string(xdr, utsname()->nodename); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(argp->prog); + *p++ = cpu_to_be32(argp->vers); + *p = cpu_to_be32(argp->proc); } /* * The "mon_id" argument specifies the non-private arguments * of an NSMPROC_MON or NSMPROC_UNMON call. 
*/ -static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) +static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) { - int status; - - status = encode_mon_name(xdr, argp); - if (unlikely(status != 0)) - return status; - return encode_my_id(xdr, argp); + encode_mon_name(xdr, argp); + encode_my_id(xdr, argp); } /* @@ -465,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) * by the NSMPROC_MON call. This information will be supplied in the * NLMPROC_SM_NOTIFY call. */ -static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) +static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) { __be32 *p; p = xdr_reserve_space(xdr, SM_PRIV_SIZE); - if (unlikely(p == NULL)) - return -EIO; xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); - return 0; } -static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, - const struct nsm_args *argp) +static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) { - struct xdr_stream xdr; - int status; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - status = encode_mon_id(&xdr, argp); - if (unlikely(status)) - return status; - return encode_priv(&xdr, argp); + encode_mon_id(xdr, argp); + encode_priv(xdr, argp); } -static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, - const struct nsm_args *argp) +static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nsm_args *argp) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - return encode_mon_id(&xdr, argp); + encode_mon_id(xdr, argp); } -static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, - struct nsm_res *resp) +static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) { - struct xdr_stream xdr; + __be32 *p; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - p = xdr_inline_decode(&xdr, 2 * 
sizeof(u32)); + p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(p == NULL)) return -EIO; - resp->status = ntohl(*p++); - resp->state = ntohl(*p); + resp->status = be32_to_cpup(p++); + resp->state = be32_to_cpup(p); - dprintk("lockd: xdr_dec_stat_res status %d state %d\n", - resp->status, resp->state); + dprintk("lockd: %s status %d state %d\n", + __func__, resp->status, resp->state); return 0; } -static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, - struct nsm_res *resp) +static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nsm_res *resp) { - struct xdr_stream xdr; + __be32 *p; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - p = xdr_inline_decode(&xdr, sizeof(u32)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - resp->state = ntohl(*p); + resp->state = be32_to_cpup(p); - dprintk("lockd: xdr_dec_stat state %d\n", resp->state); + dprintk("lockd: %s state %d\n", __func__, resp->state); return 0; } @@ -542,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, static struct rpc_procinfo nsm_procedures[] = { [NSMPROC_MON] = { .p_proc = NSMPROC_MON, - .p_encode = (kxdrproc_t)xdr_enc_mon, - .p_decode = (kxdrproc_t)xdr_dec_stat_res, + .p_encode = (kxdreproc_t)nsm_xdr_enc_mon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat_res, .p_arglen = SM_mon_sz, .p_replen = SM_monres_sz, .p_statidx = NSMPROC_MON, @@ -551,8 +525,8 @@ static struct rpc_procinfo nsm_procedures[] = { }, [NSMPROC_UNMON] = { .p_proc = NSMPROC_UNMON, - .p_encode = (kxdrproc_t)xdr_enc_unmon, - .p_decode = (kxdrproc_t)xdr_dec_stat, + .p_encode = (kxdreproc_t)nsm_xdr_enc_unmon, + .p_decode = (kxdrdproc_t)nsm_xdr_dec_stat, .p_arglen = SM_mon_id_sz, .p_replen = SM_unmonres_sz, .p_statidx = NSMPROC_UNMON, diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 38d26119245..9a41fdc1951 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -51,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, 
return 0; no_locks: - nlm_release_host(host); + nlmsvc_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -92,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -134,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -164,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_cancel_blocked(file, &argp->lock); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -197,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_unlock(file, &argp->lock); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -229,7 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data) static void nlm4svc_callback_release(void *data) { - nlm_release_call(data); + nlmsvc_release_call(data); } static const struct rpc_call_ops nlm4svc_callback_ops = { @@ -261,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args stat = func(rqstp, argp, &call->a_res); if (stat != 0) { - nlm_release_call(call); + nlmsvc_release_call(call); return stat; } @@ -334,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_share_file(host, file, argp); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -367,7 
+367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_unshare_file(host, file, argp); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -399,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, return rpc_success; nlmsvc_free_host_resources(host); - nlm_release_host(host); + nlmsvc_release_host(host); return rpc_success; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index ef5659b211e..6e31695d046 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -46,6 +46,7 @@ static void nlmsvc_remove_block(struct nlm_block *block); static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); static void nlmsvc_freegrantargs(struct nlm_rqst *call); static const struct rpc_call_ops nlmsvc_grant_ops; +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie); /* * The list of blocked locks to retry @@ -233,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, failed_free: kfree(block); failed: - nlm_release_call(call); + nlmsvc_release_call(call); return NULL; } @@ -266,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref) mutex_unlock(&file->f_mutex); nlmsvc_freegrantargs(block->b_call); - nlm_release_call(block->b_call); + nlmsvc_release_call(block->b_call); nlm_release_file(block->b_file); kfree(block->b_fl); kfree(block); @@ -934,3 +935,32 @@ nlmsvc_retry_blocked(void) return timeout; } + +#ifdef RPC_DEBUG +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + /* + * We can get away with a static buffer because we're only + * called with BKL held. 
+ */ + static char buf[2*NLM_MAXCOOKIELEN+1]; + unsigned int i, len = sizeof(buf); + char *p = buf; + + len--; /* allow for trailing \0 */ + if (len < 3) + return "???"; + for (i = 0 ; i < cookie->len ; i++) { + if (len < 2) { + strcpy(p-3, "..."); + break; + } + sprintf(p, "%02x", cookie->data[i]); + p += 2; + len -= 2; + } + *p = '\0'; + + return buf; +} +#endif diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 0caea5310ac..d27aab11f32 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -80,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - nlm_release_host(host); + nlmsvc_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; @@ -122,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST status %d vers %d\n", ntohl(resp->status), rqstp->rq_vers); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -164,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rc; } @@ -194,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -227,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -257,9 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) -task->tk_status); } +void nlmsvc_release_call(struct nlm_rqst 
*call) +{ + if (!atomic_dec_and_test(&call->a_count)) + return; + nlmsvc_release_host(call->a_host); + kfree(call); +} + static void nlmsvc_callback_release(void *data) { - nlm_release_call(data); + nlmsvc_release_call(data); } static const struct rpc_call_ops nlmsvc_callback_ops = { @@ -291,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args stat = func(rqstp, argp, &call->a_res); if (stat != 0) { - nlm_release_call(call); + nlmsvc_release_call(call); return stat; } @@ -366,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_share_file(host, file, argp)); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -399,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); - nlm_release_host(host); + nlmsvc_release_host(host); nlm_release_file(file); return rpc_success; } @@ -431,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, return rpc_success; nlmsvc_free_host_resources(host); - nlm_release_host(host); + nlmsvc_release_host(host); return rpc_success; } diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index b583ab0a4cb..964666c68a8 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock) } /* - * Encode a lock as part of an NLM call - */ -static __be32 * -nlm_encode_lock(__be32 *p, struct nlm_lock *lock) -{ - struct file_lock *fl = &lock->fl; - __s32 start, len; - - if (!(p = xdr_encode_string(p, lock->caller)) - || !(p = nlm_encode_fh(p, &lock->fh)) - || !(p = nlm_encode_oh(p, &lock->oh))) - return NULL; - - if (fl->fl_start > NLM_OFFSET_MAX - || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX)) - return NULL; - - 
start = loff_t_to_s32(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); - - *p++ = htonl(lock->svid); - *p++ = htonl(start); - *p++ = htonl(len); - - return p; -} - -/* * Encode result of a TEST/TEST_MSG call */ static __be32 * @@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) { return xdr_ressize_check(rqstp, p); } - -/* - * Now, the client side XDR functions - */ -#ifdef NLMCLNT_SUPPORT_SHARES -static int -nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr) -{ - return 0; -} -#endif - -static int -nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero; - if (!(p = nlm_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm_decode_cookie(p, &resp->cookie))) - return -EIO; - resp->status = *p++; - if (resp->status == nlm_lck_denied) { - struct file_lock *fl = &resp->lock.fl; - u32 excl; - s32 start, len, end; - - memset(&resp->lock, 0, sizeof(resp->lock)); - locks_init_lock(fl); - excl = ntohl(*p++); - resp->lock.svid = ntohl(*p++); - fl->fl_pid = (pid_t)resp->lock.svid; - if (!(p = nlm_decode_oh(p, &resp->lock.oh))) - return -EIO; - - fl->fl_flags = FL_POSIX; - fl->fl_type = excl? 
F_WRLCK : F_RDLCK; - start = ntohl(*p++); - len = ntohl(*p++); - end = start + len - 1; - - fl->fl_start = s32_to_loff_t(start); - if (len == 0 || end < 0) - fl->fl_end = OFFSET_MAX; - else - fl->fl_end = s32_to_loff_t(end); - } - return 0; -} - - -static int -nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = argp->block? xdr_one : xdr_zero; - *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero; - if (!(p = nlm_encode_lock(p, lock))) - return -EIO; - *p++ = argp->reclaim? xdr_one : xdr_zero; - *p++ = htonl(argp->state); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = argp->block? xdr_one : xdr_zero; - *p++ = (lock->fl.fl_type == F_WRLCK)? 
xdr_one : xdr_zero; - if (!(p = nlm_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm_encode_cookie(p, &argp->cookie))) - return -EIO; - if (!(p = nlm_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm_encode_cookie(p, &resp->cookie))) - return -EIO; - *p++ = resp->status; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm_encode_testres(p, resp))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm_decode_cookie(p, &resp->cookie))) - return -EIO; - resp->status = *p++; - return 0; -} - -#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) -# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" 
-#endif - -/* - * Buffer requirements for NLM - */ -#define NLM_void_sz 0 -#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) -#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE) -#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE) -#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE) -#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz -#define NLM_holder_sz 4+NLM_owner_sz - -#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz -#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz -#define NLM_cancargs_sz NLM_cookie_sz+2+NLM_lock_sz -#define NLM_unlockargs_sz NLM_cookie_sz+NLM_lock_sz - -#define NLM_testres_sz NLM_cookie_sz+1+NLM_holder_sz -#define NLM_res_sz NLM_cookie_sz+1 -#define NLM_norep_sz 0 - -/* - * For NLM, a void procedure really returns nothing - */ -#define nlmclt_decode_norep NULL - -#define PROC(proc, argtype, restype) \ -[NLMPROC_##proc] = { \ - .p_proc = NLMPROC_##proc, \ - .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \ - .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \ - .p_arglen = NLM_##argtype##_sz, \ - .p_replen = NLM_##restype##_sz, \ - .p_statidx = NLMPROC_##proc, \ - .p_name = #proc, \ - } - -static struct rpc_procinfo nlm_procedures[] = { - PROC(TEST, testargs, testres), - PROC(LOCK, lockargs, res), - PROC(CANCEL, cancargs, res), - PROC(UNLOCK, unlockargs, res), - PROC(GRANTED, testargs, res), - PROC(TEST_MSG, testargs, norep), - PROC(LOCK_MSG, lockargs, norep), - PROC(CANCEL_MSG, cancargs, norep), - PROC(UNLOCK_MSG, unlockargs, norep), - PROC(GRANTED_MSG, testargs, norep), - PROC(TEST_RES, testres, norep), - PROC(LOCK_RES, res, norep), - PROC(CANCEL_RES, res, norep), - PROC(UNLOCK_RES, res, norep), - PROC(GRANTED_RES, res, norep), -#ifdef NLMCLNT_SUPPORT_SHARES - PROC(SHARE, shareargs, shareres), - PROC(UNSHARE, shareargs, shareres), - PROC(NM_LOCK, lockargs, res), - PROC(FREE_ALL, notify, void), -#endif -}; - -static struct rpc_version nlm_version1 = { - .number = 1, - .nrprocs = 16, - .procs = nlm_procedures, -}; - 
-static struct rpc_version nlm_version3 = { - .number = 3, - .nrprocs = 24, - .procs = nlm_procedures, -}; - -static struct rpc_version * nlm_versions[] = { - [1] = &nlm_version1, - [3] = &nlm_version3, -#ifdef CONFIG_LOCKD_V4 - [4] = &nlm_version4, -#endif -}; - -static struct rpc_stat nlm_stats; - -struct rpc_program nlm_program = { - .name = "lockd", - .number = NLM_PROGRAM, - .nrvers = ARRAY_SIZE(nlm_versions), - .version = nlm_versions, - .stats = &nlm_stats, -}; - -#ifdef RPC_DEBUG -const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) -{ - /* - * We can get away with a static buffer because we're only - * called with BKL held. - */ - static char buf[2*NLM_MAXCOOKIELEN+1]; - unsigned int i, len = sizeof(buf); - char *p = buf; - - len--; /* allow for trailing \0 */ - if (len < 3) - return "???"; - for (i = 0 ; i < cookie->len ; i++) { - if (len < 2) { - strcpy(p-3, "..."); - break; - } - sprintf(p, "%02x", cookie->data[i]); - p += 2; - len -= 2; - } - *p = '\0'; - - return buf; -} -#endif diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index ad9dbbc9145..dfa4789cd46 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f) return p + XDR_QUADLEN(f->size); } -static __be32 * -nlm4_encode_fh(__be32 *p, struct nfs_fh *f) -{ - *p++ = htonl(f->size); - if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */ - memcpy(p, f->data, f->size); - return p + XDR_QUADLEN(f->size); -} - /* * Encode and decode owner handle */ @@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh) } static __be32 * -nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh) -{ - return xdr_encode_netobj(p, oh); -} - -static __be32 * nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) { struct file_lock *fl = &lock->fl; @@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) } /* - * Encode a lock as part of an NLM call - */ -static __be32 * -nlm4_encode_lock(__be32 *p, struct nlm_lock *lock) -{ - 
struct file_lock *fl = &lock->fl; - __s64 start, len; - - if (!(p = xdr_encode_string(p, lock->caller)) - || !(p = nlm4_encode_fh(p, &lock->fh)) - || !(p = nlm4_encode_oh(p, &lock->oh))) - return NULL; - - if (fl->fl_start > NLM4_OFFSET_MAX - || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX)) - return NULL; - - *p++ = htonl(lock->svid); - - start = loff_t_to_s64(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); - - p = xdr_encode_hyper(p, start); - p = xdr_encode_hyper(p, len); - - return p; -} - -/* * Encode result of a TEST/TEST_MSG call */ static __be32 * @@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) { return xdr_ressize_check(rqstp, p); } - -/* - * Now, the client side XDR functions - */ -#ifdef NLMCLNT_SUPPORT_SHARES -static int -nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr) -{ - return 0; -} -#endif - -static int -nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm4_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero; - if (!(p = nlm4_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm4_decode_cookie(p, &resp->cookie))) - return -EIO; - resp->status = *p++; - if (resp->status == nlm_lck_denied) { - struct file_lock *fl = &resp->lock.fl; - u32 excl; - __u64 start, len; - __s64 end; - - memset(&resp->lock, 0, sizeof(resp->lock)); - locks_init_lock(fl); - excl = ntohl(*p++); - resp->lock.svid = ntohl(*p++); - fl->fl_pid = (pid_t)resp->lock.svid; - if (!(p = nlm4_decode_oh(p, &resp->lock.oh))) - return -EIO; - - fl->fl_flags = FL_POSIX; - fl->fl_type = excl? 
F_WRLCK : F_RDLCK; - p = xdr_decode_hyper(p, &start); - p = xdr_decode_hyper(p, &len); - end = start + len - 1; - - fl->fl_start = s64_to_loff_t(start); - if (len == 0 || end < 0) - fl->fl_end = OFFSET_MAX; - else - fl->fl_end = s64_to_loff_t(end); - } - return 0; -} - - -static int -nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm4_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = argp->block? xdr_one : xdr_zero; - *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero; - if (!(p = nlm4_encode_lock(p, lock))) - return -EIO; - *p++ = argp->reclaim? xdr_one : xdr_zero; - *p++ = htonl(argp->state); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm4_encode_cookie(p, &argp->cookie))) - return -EIO; - *p++ = argp->block? xdr_one : xdr_zero; - *p++ = (lock->fl.fl_type == F_WRLCK)? 
xdr_one : xdr_zero; - if (!(p = nlm4_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp) -{ - struct nlm_lock *lock = &argp->lock; - - if (!(p = nlm4_encode_cookie(p, &argp->cookie))) - return -EIO; - if (!(p = nlm4_encode_lock(p, lock))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm4_encode_cookie(p, &resp->cookie))) - return -EIO; - *p++ = resp->status; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm4_encode_testres(p, resp))) - return -EIO; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; -} - -static int -nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp) -{ - if (!(p = nlm4_decode_cookie(p, &resp->cookie))) - return -EIO; - resp->status = *p++; - return 0; -} - -#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) -# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!" -#endif - -#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN) -# error "NLM host name cannot be larger than NLM's maximum string length!" 
-#endif - -/* - * Buffer requirements for NLM - */ -#define NLM4_void_sz 0 -#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) -#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE) -#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE) -#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE) -#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz -#define NLM4_holder_sz 6+NLM4_owner_sz - -#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz -#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz -#define NLM4_cancargs_sz NLM4_cookie_sz+2+NLM4_lock_sz -#define NLM4_unlockargs_sz NLM4_cookie_sz+NLM4_lock_sz - -#define NLM4_testres_sz NLM4_cookie_sz+1+NLM4_holder_sz -#define NLM4_res_sz NLM4_cookie_sz+1 -#define NLM4_norep_sz 0 - -/* - * For NLM, a void procedure really returns nothing - */ -#define nlm4clt_decode_norep NULL - -#define PROC(proc, argtype, restype) \ -[NLMPROC_##proc] = { \ - .p_proc = NLMPROC_##proc, \ - .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \ - .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \ - .p_arglen = NLM4_##argtype##_sz, \ - .p_replen = NLM4_##restype##_sz, \ - .p_statidx = NLMPROC_##proc, \ - .p_name = #proc, \ - } - -static struct rpc_procinfo nlm4_procedures[] = { - PROC(TEST, testargs, testres), - PROC(LOCK, lockargs, res), - PROC(CANCEL, cancargs, res), - PROC(UNLOCK, unlockargs, res), - PROC(GRANTED, testargs, res), - PROC(TEST_MSG, testargs, norep), - PROC(LOCK_MSG, lockargs, norep), - PROC(CANCEL_MSG, cancargs, norep), - PROC(UNLOCK_MSG, unlockargs, norep), - PROC(GRANTED_MSG, testargs, norep), - PROC(TEST_RES, testres, norep), - PROC(LOCK_RES, res, norep), - PROC(CANCEL_RES, res, norep), - PROC(UNLOCK_RES, res, norep), - PROC(GRANTED_RES, res, norep), -#ifdef NLMCLNT_SUPPORT_SHARES - PROC(SHARE, shareargs, shareres), - PROC(UNSHARE, shareargs, shareres), - PROC(NM_LOCK, lockargs, res), - PROC(FREE_ALL, notify, void), -#endif -}; - -struct rpc_version nlm_version4 = { - .number = 4, - .nrprocs = 24, - .procs 
= nlm4_procedures, -}; diff --git a/fs/locks.c b/fs/locks.c index 8729347bcd1..0f3998291f7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl) fl->fl_file->f_owner.signum = 0; } -static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try) -{ - return fl->fl_file == try->fl_file; -} - static const struct lock_manager_operations lease_manager_ops = { .fl_break = lease_break_callback, .fl_release_private = lease_release_private_callback, - .fl_mylease = lease_mylease_callback, .fl_change = lease_modify, }; @@ -1389,7 +1383,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; if ((arg == F_WRLCK) - && ((atomic_read(&dentry->d_count) > 1) + && ((dentry->d_count > 1) || (atomic_read(&inode->i_count) > 1))) goto out; } @@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (lease->fl_lmops->fl_mylease(fl, lease)) + if (fl->fl_file == filp) my_before = before; else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) /* diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 92ca6fbe09b..723bc5bca09 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page) static void bdev_put_device(struct logfs_super *s) { - close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int bdev_can_write_buf(struct super_block *sb, u64 ofs) @@ -325,13 +325,14 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, { struct block_device *bdev; - bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + 
type); if (IS_ERR(bdev)) return PTR_ERR(bdev); if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { int mtdnr = MINOR(bdev->bd_dev); - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); return logfs_get_sb_mtd(p, mtdnr); } diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 409dfd65e9a..f9ddf0c388c 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return __logfs_create(dir, dentry, inode, target, destlen); } -static int logfs_permission(struct inode *inode, int mask) +static int logfs_permission(struct inode *inode, int mask, unsigned int flags) { - return generic_permission(inode, mask, NULL); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + return generic_permission(inode, mask, flags, NULL); } static int logfs_link(struct dentry *old_dentry, struct inode *dir, diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index d8c71ece098..03b8c240aed 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) return __logfs_iget(sb, ino); } +static void logfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); +} + static void __logfs_destroy_inode(struct inode *inode) { struct logfs_inode *li = logfs_inode(inode); BUG_ON(li->li_block); list_del(&li->li_freeing_list); - kmem_cache_free(logfs_inode_cache, li); + call_rcu(&inode->i_rcu, logfs_i_callback); } static void logfs_destroy_inode(struct inode *inode) diff --git a/fs/mbcache.c b/fs/mbcache.c index 93444747237..a25444ab2ba 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first); EXPORT_SYMBOL(mb_cache_entry_find_next); #endif -struct mb_cache { - struct list_head c_cache_list; - const char *c_name; - 
atomic_t c_entry_count; - int c_max_entries; - int c_bucket_bits; - struct kmem_cache *c_entry_cache; - struct list_head *c_block_hash; - struct list_head *c_index_hash; -}; - - /* * Global data: list of all mbcache's, lru list, and a spinlock for * accessing cache data structures on SMP machines. The lru list is diff --git a/fs/minix/inode.c b/fs/minix/inode.c index fb2020858a3..ae0b83f476a 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void minix_destroy_inode(struct inode *inode) +static void minix_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(minix_inode_cachep, minix_i(inode)); } +static void minix_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, minix_i_callback); +} + static void init_once(void *foo) { struct minix_inode_info *ei = (struct minix_inode_info *) foo; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index c0d35a3acce..ce7337ddfdb 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st struct inode * inode = NULL; ino_t ino; - dentry->d_op = dir->i_sb->s_root->d_op; - if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) return ERR_PTR(-ENAMETOOLONG); diff --git a/fs/mpage.c b/fs/mpage.c index fd56ca2ea55..d78455a81ec 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -40,7 +40,7 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. 
*/ -static void mpage_end_io_read(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); -} - -static void mpage_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (!uptodate){ - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); + if (bio_data_dir(bio) == READ) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* bio_data_dir(bio) == WRITE */ + if (!uptodate) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + end_page_writeback(page); } - end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); } static struct bio *mpage_bio_submit(int rw, struct bio *bio) { - bio->bi_end_io = mpage_end_io_read; - if (rw == WRITE) - bio->bi_end_io = mpage_end_io_write; + bio->bi_end_io = mpage_end_io; submit_bio(rw, bio); return NULL; } diff --git a/fs/namei.c b/fs/namei.c index 4ff7ca53053..7d77f24d32a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname); /* * This does basic POSIX ACL permission checking */ -static int acl_permission_check(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) +static int acl_permission_check(struct inode *inode, int 
mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) { umode_t mode = inode->i_mode; @@ -180,7 +180,7 @@ static int acl_permission_check(struct inode *inode, int mask, mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { - int error = check_acl(inode, mask); + int error = check_acl(inode, mask, flags); if (error != -EAGAIN) return error; } @@ -198,25 +198,30 @@ static int acl_permission_check(struct inode *inode, int mask, } /** - * generic_permission - check for access rights on a Posix-like filesystem + * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @check_acl: optional callback to check for Posix ACLs + * @flags: IPERM_FLAG_ flags. * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which - * are used for other things.. + * are used for other things. + * + * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk + * request cannot be satisfied (eg. requires blocking or too much complexity). + * It would then be called again in ref-walk mode. */ -int generic_permission(struct inode *inode, int mask, - int (*check_acl)(struct inode *inode, int mask)) +int generic_permission(struct inode *inode, int mask, unsigned int flags, + int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) { int ret; /* * Do the basic POSIX ACL permission checks. 
*/ - ret = acl_permission_check(inode, mask, check_acl); + ret = acl_permission_check(inode, mask, flags, check_acl); if (ret != -EACCES) return ret; @@ -271,9 +276,10 @@ int inode_permission(struct inode *inode, int mask) } if (inode->i_op->permission) - retval = inode->i_op->permission(inode, mask); + retval = inode->i_op->permission(inode, mask, 0); else - retval = generic_permission(inode, mask, inode->i_op->check_acl); + retval = generic_permission(inode, mask, 0, + inode->i_op->check_acl); if (retval) return retval; @@ -375,6 +381,181 @@ void path_put(struct path *path) EXPORT_SYMBOL(path_put); /** + * nameidata_drop_rcu - drop this nameidata out of rcu-walk + * @nd: nameidata pathwalk data to drop + * Returns: 0 on success, -ECHILD on failure + * + * Path walking has 2 modes, rcu-walk and ref-walk (see + * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt + * to drop out of rcu-walk mode and take normal reference counts on dentries + * and vfsmounts to transition to ref-walk mode. __drop_rcu* functions take + * refcounts at the last known good point before rcu-walk got stuck, so + * ref-walk may continue from there. If this is not successful (eg. a seqcount + * has changed), then failure is returned and path walk restarts from the + * beginning in ref-walk mode. + * + * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into + * ref-walk. Must be called from rcu-walk context. 
+ */ +static int nameidata_drop_rcu(struct nameidata *nd) +{ + struct fs_struct *fs = current->fs; + struct dentry *dentry = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + if (nd->root.mnt) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; +err: + spin_unlock(&dentry->d_lock); +err_root: + if (nd->root.mnt) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return nameidata_drop_rcu(nd); + return 0; +} + +/** + * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk + * @nd: nameidata pathwalk data to drop + * @dentry: dentry to drop + * Returns: 0 on success, -ECHILD on failure + * + * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, + * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on + * @nd. Must be called from rcu-walk context. + */ +static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) +{ + struct fs_struct *fs = current->fs; + struct dentry *parent = nd->path.dentry; + + /* + * It can be possible to revalidate the dentry that we started + * the path walk with. force_reval_path may also revalidate the + * dentry already committed to the nameidata. 
+ */ + if (unlikely(parent == dentry)) + return nameidata_drop_rcu(nd); + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (nd->root.mnt) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || + nd->root.dentry != fs->root.dentry) + goto err_root; + } + spin_lock(&parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err; + /* + * If the sequence check on the child dentry passed, then the child has + * not been removed from its parent. This means the parent dentry must + * be valid and able to take a reference at this point. + */ + BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); + BUG_ON(!parent->d_count); + parent->d_count++; + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); + if (nd->root.mnt) { + path_get(&nd->root); + spin_unlock(&fs->lock); + } + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + nd->flags &= ~LOOKUP_RCU; + return 0; +err: + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); +err_root: + if (nd->root.mnt) + spin_unlock(&fs->lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) +{ + if (nd->flags & LOOKUP_RCU) + return nameidata_dentry_drop_rcu(nd, dentry); + return 0; +} + +/** + * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk + * @nd: nameidata pathwalk data to drop + * Returns: 0 on success, -ECHILD on failure + * + * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. + * nd->path should be the final element of the lookup, so nd->root is discarded. + * Must be called from rcu-walk context. 
+ */ +static int nameidata_drop_rcu_last(struct nameidata *nd) +{ + struct dentry *dentry = nd->path.dentry; + + BUG_ON(!(nd->flags & LOOKUP_RCU)); + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + spin_lock(&dentry->d_lock); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err_unlock; + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + + mntget(nd->path.mnt); + + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + + return 0; + +err_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; +} + +/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ +static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) +{ + if (likely(nd->flags & LOOKUP_RCU)) + return nameidata_drop_rcu_last(nd); + return 0; +} + +/** * release_open_intent - free up open intent resources * @nd: pointer to nameidata */ @@ -386,10 +567,33 @@ void release_open_intent(struct nameidata *nd) fput(nd->intent.open.file); } +/* + * Call d_revalidate and handle filesystems that request rcu-walk + * to be dropped. This may be called and return in rcu-walk mode, + * regardless of success or error. If -ECHILD is returned, the caller + * must return -ECHILD back up the path walk stack so path walk may + * be restarted in ref-walk mode. + */ +static int d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + int status; + + status = dentry->d_op->d_revalidate(dentry, nd); + if (status == -ECHILD) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return status; + status = dentry->d_op->d_revalidate(dentry, nd); + } + + return status; +} + static inline struct dentry * do_revalidate(struct dentry *dentry, struct nameidata *nd) { - int status = dentry->d_op->d_revalidate(dentry, nd); + int status; + + status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { /* * The dentry failed validation. 
@@ -397,19 +601,36 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) * the dentry otherwise d_revalidate is asking us * to return a fail status. */ - if (!status) { + if (status < 0) { + /* If we're in rcu-walk, we don't have a ref */ + if (!(nd->flags & LOOKUP_RCU)) + dput(dentry); + dentry = ERR_PTR(status); + + } else { + /* Don't d_invalidate in rcu-walk mode */ + if (nameidata_dentry_drop_rcu_maybe(nd, dentry)) + return ERR_PTR(-ECHILD); if (!d_invalidate(dentry)) { dput(dentry); dentry = NULL; } - } else { - dput(dentry); - dentry = ERR_PTR(status); } } return dentry; } +static inline int need_reval_dot(struct dentry *dentry) +{ + if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + return 0; + + if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) + return 0; + + return 1; +} + /* * force_reval_path - force revalidation of a dentry * @@ -433,17 +654,19 @@ force_reval_path(struct path *path, struct nameidata *nd) /* * only check on filesystems where it's possible for the dentry to - * become stale. It's assumed that if this flag is set then the - * d_revalidate op will also be defined. + * become stale. */ - if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) + if (!need_reval_dot(dentry)) return 0; - status = dentry->d_op->d_revalidate(dentry, nd); + status = d_revalidate(dentry, nd); if (status > 0) return 0; if (!status) { + /* Don't d_invalidate in rcu-walk mode */ + if (nameidata_drop_rcu(nd)) + return -ECHILD; d_invalidate(dentry); status = -ESTALE; } @@ -459,26 +682,27 @@ force_reval_path(struct path *path, struct nameidata *nd) * short-cut DAC fails, then call ->permission() to do more * complete permission check. 
*/ -static int exec_permission(struct inode *inode) +static inline int exec_permission(struct inode *inode, unsigned int flags) { int ret; if (inode->i_op->permission) { - ret = inode->i_op->permission(inode, MAY_EXEC); - if (!ret) - goto ok; - return ret; + ret = inode->i_op->permission(inode, MAY_EXEC, flags); + } else { + ret = acl_permission_check(inode, MAY_EXEC, flags, + inode->i_op->check_acl); } - ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); - if (!ret) + if (likely(!ret)) goto ok; + if (ret == -ECHILD) + return ret; if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) goto ok; return ret; ok: - return security_inode_permission(inode, MAY_EXEC); + return security_inode_exec_permission(inode, flags); } static __always_inline void set_root(struct nameidata *nd) @@ -489,8 +713,23 @@ static __always_inline void set_root(struct nameidata *nd) static int link_path_walk(const char *, struct nameidata *); +static __always_inline void set_root_rcu(struct nameidata *nd) +{ + if (!nd->root.mnt) { + struct fs_struct *fs = current->fs; + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + } while (read_seqcount_retry(&fs->seq, seq)); + } +} + static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { + int ret; + if (IS_ERR(link)) goto fail; @@ -500,8 +739,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l nd->path = nd->root; path_get(&nd->root); } + nd->inode = nd->path.dentry->d_inode; - return link_path_walk(link, nd); + ret = link_path_walk(link, nd); + return ret; fail: path_put(&nd->path); return PTR_ERR(link); @@ -514,30 +755,30 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) mntput(path->mnt); } -static inline void path_to_nameidata(struct path *path, struct nameidata *nd) +static inline void path_to_nameidata(const struct path *path, + struct nameidata *nd) { - dput(nd->path.dentry); - if 
(nd->path.mnt != path->mnt) { - mntput(nd->path.mnt); - nd->path.mnt = path->mnt; + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path->mnt) + mntput(nd->path.mnt); } + nd->path.mnt = path->mnt; nd->path.dentry = path->dentry; } static __always_inline int -__do_follow_link(struct path *path, struct nameidata *nd, void **p) +__do_follow_link(const struct path *link, struct nameidata *nd, void **p) { int error; - struct dentry *dentry = path->dentry; + struct dentry *dentry = link->dentry; - touch_atime(path->mnt, dentry); + touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); - if (path->mnt != nd->path.mnt) { - path_to_nameidata(path, nd); - dget(dentry); - } - mntget(path->mnt); + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(*p); @@ -591,6 +832,20 @@ loop: return err; } +static int follow_up_rcu(struct path *path) +{ + struct vfsmount *parent; + struct dentry *mountpoint; + + parent = path->mnt->mnt_parent; + if (parent == path->mnt) + return 0; + mountpoint = path->mnt->mnt_mountpoint; + path->dentry = mountpoint; + path->mnt = parent; + return 1; +} + int follow_up(struct path *path) { struct vfsmount *parent; @@ -612,58 +867,295 @@ int follow_up(struct path *path) return 1; } -/* no need for dcache_lock, as serialization is taken care in - * namespace.c +/* + * Perform an automount + * - return -EISDIR to tell follow_managed() to stop and return the path we + * were called with. 
*/ -static int __follow_mount(struct path *path) +static int follow_automount(struct path *path, unsigned flags, + bool *need_mntput) { - int res = 0; - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; + struct vfsmount *mnt; + int err; + + if (!path->dentry->d_op || !path->dentry->d_op->d_automount) + return -EREMOTE; + + /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT + * and this is the terminal part of the path. + */ + if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE)) + return -EISDIR; /* we actually want to stop here */ + + /* We want to mount if someone is trying to open/create a file of any + * type under the mountpoint, wants to traverse through the mountpoint + * or wants to open the mounted directory. + * + * We don't want to mount if someone's just doing a stat and they've + * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and + * appended a '/' to the name. + */ + if (!(flags & LOOKUP_FOLLOW) && + !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE))) + return -EISDIR; + + current->total_link_count++; + if (current->total_link_count >= 40) + return -ELOOP; + + mnt = path->dentry->d_op->d_automount(path); + if (IS_ERR(mnt)) { + /* + * The filesystem is allowed to return -EISDIR here to indicate + * it doesn't want to automount. For instance, autofs would do + * this so that its userspace daemon can mount on this dentry. + * + * However, we can only permit this if it's a terminal point in + * the path being looked up; if it wasn't then the remainder of + * the path is inaccessible and we should say so. 
+ */ + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE)) + return -EREMOTE; + return PTR_ERR(mnt); + } + + if (!mnt) /* mount collision */ + return 0; + + err = finish_automount(mnt, path); + + switch (err) { + case -EBUSY: + /* Someone else made a mount here whilst we were busy */ + return 0; + case 0: dput(path->dentry); - if (res) + if (*need_mntput) mntput(path->mnt); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + *need_mntput = true; + return 0; + default: + return err; + } + +} + +/* + * Handle a dentry that is managed in some way. + * - Flagged for transit management (autofs) + * - Flagged as mountpoint + * - Flagged as automount point + * + * This may only be called in refwalk mode. + * + * Serialization is taken care of in namespace.c + */ +static int follow_managed(struct path *path, unsigned flags) +{ + unsigned managed; + bool need_mntput = false; + int ret; + + /* Given that we're not holding a lock here, we retain the value in a + * local variable for each dentry as we look at it so that we don't see + * the components of that value change under us */ + while (managed = ACCESS_ONCE(path->dentry->d_flags), + managed &= DCACHE_MANAGED_DENTRY, + unlikely(managed != 0)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, + false, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. 
*/ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + if (need_mntput) + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + need_mntput = true; + continue; + } + + /* Something is mounted on this dentry in another + * namespace and/or whatever was mounted there in this + * namespace got unmounted before we managed to get the + * vfsmount_lock */ + } + + /* Handle an automount point */ + if (managed & DCACHE_NEED_AUTOMOUNT) { + ret = follow_automount(path, flags, &need_mntput); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + continue; + } + + /* We didn't change the current path point */ + break; + } + return 0; +} + +int follow_down_one(struct path *path) +{ + struct vfsmount *mounted; + + mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); - res = 1; + return 1; } - return res; + return 0; } -static void follow_mount(struct path *path) +/* + * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we + * meet a managed dentry and we're not walking to "..". True is returned to + * continue, false to abort. 
+ */ +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode, bool reverse_transit) { while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); + struct vfsmount *mounted; + if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && + !reverse_transit && + path->dentry->d_op->d_manage(path->dentry, false, true) < 0) + return false; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; - dput(path->dentry); - mntput(path->mnt); path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); + path->dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *inode = path->dentry->d_inode; } + + if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + return reverse_transit; + return true; } -/* no need for dcache_lock, as serialization is taken care in - * namespace.c +static int follow_dotdot_rcu(struct nameidata *nd) +{ + struct inode *inode = nd->inode; + + set_root_rcu(nd); + + while (1) { + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { + break; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + struct dentry *old = nd->path.dentry; + struct dentry *parent = old->d_parent; + unsigned seq; + + seq = read_seqcount_begin(&parent->d_seq); + if (read_seqcount_retry(&old->d_seq, nd->seq)) + return -ECHILD; + inode = parent->d_inode; + nd->path.dentry = parent; + nd->seq = seq; + break; + } + if (!follow_up_rcu(&nd->path)) + break; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + inode = nd->path.dentry->d_inode; + } + __follow_mount_rcu(nd, &nd->path, &inode, true); + nd->inode = inode; + + return 0; +} + +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + * + * Care must be taken as namespace_sem may be held (indicated by mounting_here + * being true). 
*/ -int follow_down(struct path *path) +int follow_down(struct path *path, bool mounting_here) { - struct vfsmount *mounted; + unsigned managed; + int ret; - mounted = lookup_mnt(path); - if (mounted) { + while (managed = ACCESS_ONCE(path->dentry->d_flags), + unlikely(managed & DCACHE_MANAGED_DENTRY)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. + * + * We indicate to the filesystem if someone is trying to mount + * something here. This gives autofs the chance to deny anyone + * other than its daemon the right to mount on its + * superstructure. + * + * The filesystem may sleep at this point. + */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage( + path->dentry, mounting_here, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + continue; + } + + /* Don't handle automount points here */ + break; + } + return 0; +} + +/* + * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() + */ +static void follow_mount(struct path *path) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); - return 1; } - return 0; } -static __always_inline void follow_dotdot(struct nameidata *nd) +static void follow_dotdot(struct nameidata *nd) { set_root(nd); @@ -684,6 +1176,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd) break; } follow_mount(&nd->path); + nd->inode = nd->path.dentry->d_inode; } /* @@ -721,17 +1214,19 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent, * It _is_ 
time-critical. */ static int do_lookup(struct nameidata *nd, struct qstr *name, - struct path *path) + struct path *path, struct inode **inode) { struct vfsmount *mnt = nd->path.mnt; - struct dentry *dentry, *parent; + struct dentry *dentry, *parent = nd->path.dentry; struct inode *dir; + int err; + /* * See if the low-level filesystem might want * to use its own hash.. */ - if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + err = parent->d_op->d_hash(parent, nd->inode, name); if (err < 0) return err; } @@ -741,21 +1236,52 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, * of a false negative due to a concurrent rename, we're going to * do the non-racy lookup, below. */ - dentry = __d_lookup(nd->path.dentry, name); + if (nd->flags & LOOKUP_RCU) { + unsigned seq; + + *inode = nd->inode; + dentry = __d_lookup_rcu(parent, name, &seq, inode); + if (!dentry) { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + goto need_lookup; + } + /* Memory barrier in read_seqcount_begin of child is enough */ + if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + return -ECHILD; + + nd->seq = seq; + if (dentry->d_flags & DCACHE_OP_REVALIDATE) + goto need_revalidate; +done2: + path->mnt = mnt; + path->dentry = dentry; + if (likely(__follow_mount_rcu(nd, path, inode, false))) + return 0; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + /* fallthru */ + } + dentry = __d_lookup(parent, name); if (!dentry) goto need_lookup; found: - if (dentry->d_op && dentry->d_op->d_revalidate) + if (dentry->d_flags & DCACHE_OP_REVALIDATE) goto need_revalidate; done: path->mnt = mnt; path->dentry = dentry; - __follow_mount(path); + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + *inode = path->dentry->d_inode; return 0; need_lookup: - parent = nd->path.dentry; dir = parent->d_inode; + 
BUG_ON(nd->inode != dir); mutex_lock(&dir->i_mutex); /* @@ -789,6 +1315,8 @@ need_revalidate: goto need_lookup; if (IS_ERR(dentry)) goto fail; + if (nd->flags & LOOKUP_RCU) + goto done2; goto done; fail: @@ -796,17 +1324,6 @@ fail: } /* - * This is a temporary kludge to deal with "automount" symlinks; proper - * solution is to trigger them on follow_mount(), so that do_lookup() - * would DTRT. To be killed before 2.6.34-final. - */ -static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) -{ - return inode && unlikely(inode->i_op->follow_link) && - ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode)); -} - -/* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. @@ -817,7 +1334,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) static int link_path_walk(const char *name, struct nameidata *nd) { struct path next; - struct inode *inode; int err; unsigned int lookup_flags = nd->flags; @@ -826,18 +1342,28 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (!*name) goto return_reval; - inode = nd->path.dentry->d_inode; if (nd->depth) lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); /* At this point we know we have a real path component. 
*/ for(;;) { + struct inode *inode; unsigned long hash; struct qstr this; unsigned int c; nd->flags |= LOOKUP_CONTINUE; - err = exec_permission(inode); + if (nd->flags & LOOKUP_RCU) { + err = exec_permission(nd->inode, IPERM_FLAG_RCU); + if (err == -ECHILD) { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + goto exec_again; + } + } else { +exec_again: + err = exec_permission(nd->inode, 0); + } if (err) break; @@ -868,37 +1394,44 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (this.name[0] == '.') switch (this.len) { default: break; - case 2: + case 2: if (this.name[1] != '.') break; - follow_dotdot(nd); - inode = nd->path.dentry->d_inode; + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); /* fallthrough */ case 1: continue; } /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next); + err = do_lookup(nd, &this, &next, &inode); if (err) break; - err = -ENOENT; - inode = next.dentry->d_inode; if (!inode) goto out_dput; if (inode->i_op->follow_link) { + /* We commonly drop rcu-walk here */ + if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) + return -ECHILD; + BUG_ON(inode != next.dentry->d_inode); err = do_follow_link(&next, nd); if (err) goto return_err; + nd->inode = nd->path.dentry->d_inode; err = -ENOENT; - inode = nd->path.dentry->d_inode; - if (!inode) + if (!nd->inode) break; - } else + } else { path_to_nameidata(&next, nd); + nd->inode = inode; + } err = -ENOTDIR; - if (!inode->i_op->lookup) + if (!nd->inode->i_op->lookup) break; continue; /* here ends the main loop */ @@ -913,32 +1446,40 @@ last_component: if (this.name[0] == '.') switch (this.len) { default: break; - case 2: + case 2: if (this.name[1] != '.') break; - follow_dotdot(nd); - inode = nd->path.dentry->d_inode; + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); /* fallthrough */ case 1: goto return_reval; } - err = do_lookup(nd, &this, 
&next); + err = do_lookup(nd, &this, &next, &inode); if (err) break; - inode = next.dentry->d_inode; - if (follow_on_final(inode, lookup_flags)) { + if (inode && unlikely(inode->i_op->follow_link) && + (lookup_flags & LOOKUP_FOLLOW)) { + if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) + return -ECHILD; + BUG_ON(inode != next.dentry->d_inode); err = do_follow_link(&next, nd); if (err) goto return_err; - inode = nd->path.dentry->d_inode; - } else + nd->inode = nd->path.dentry->d_inode; + } else { path_to_nameidata(&next, nd); + nd->inode = inode; + } err = -ENOENT; - if (!inode) + if (!nd->inode) break; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; - if (!inode->i_op->lookup) + if (!nd->inode->i_op->lookup) break; } goto return_base; @@ -958,25 +1499,43 @@ return_reval: * We bypassed the ordinary revalidation routines. * We may need to check the cached dentry for staleness. */ - if (nd->path.dentry && nd->path.dentry->d_sb && - (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { - err = -ESTALE; + if (need_reval_dot(nd->path.dentry)) { /* Note: we do not d_invalidate() */ - if (!nd->path.dentry->d_op->d_revalidate( - nd->path.dentry, nd)) + err = d_revalidate(nd->path.dentry, nd); + if (!err) + err = -ESTALE; + if (err < 0) break; } return_base: + if (nameidata_drop_rcu_last_maybe(nd)) + return -ECHILD; return 0; out_dput: - path_put_conditional(&next, nd); + if (!(nd->flags & LOOKUP_RCU)) + path_put_conditional(&next, nd); break; } - path_put(&nd->path); + if (!(nd->flags & LOOKUP_RCU)) + path_put(&nd->path); return_err: return err; } +static inline int path_walk_rcu(const char *name, struct nameidata *nd) +{ + current->total_link_count = 0; + + return link_path_walk(name, nd); +} + +static inline int path_walk_simple(const char *name, struct nameidata *nd) +{ + current->total_link_count = 0; + + return link_path_walk(name, nd); +} + static int path_walk(const char *name, struct nameidata *nd) { struct path save = nd->path; @@ -1002,6 +1561,93 
@@ static int path_walk(const char *name, struct nameidata *nd) return result; } +static void path_finish_rcu(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + /* RCU dangling. Cancel it. */ + nd->flags &= ~LOOKUP_RCU; + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } + if (nd->file) + fput(nd->file); +} + +static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +{ + int retval = 0; + int fput_needed; + struct file *file; + + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags | LOOKUP_RCU; + nd->depth = 0; + nd->root.mnt = NULL; + nd->file = NULL; + + if (*name=='/') { + struct fs_struct *fs = current->fs; + unsigned seq; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + nd->path = nd->root; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + + } else if (dfd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + unsigned seq; + + br_read_lock(vfsmount_lock); + rcu_read_lock(); + + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + + } else { + struct dentry *dentry; + + file = fget_light(dfd, &fput_needed); + retval = -EBADF; + if (!file) + goto out_fail; + + dentry = file->f_path.dentry; + + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; + + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + + nd->path = file->f_path; + if (fput_needed) + nd->file = file; + + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } + nd->inode = nd->path.dentry->d_inode; + return 0; + +fput_fail: + fput_light(file, fput_needed); +out_fail: + return retval; +} + static int path_init(int dfd, const char *name, 
unsigned int flags, struct nameidata *nd) { int retval = 0; @@ -1042,6 +1688,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei fput_light(file, fput_needed); } + nd->inode = nd->path.dentry->d_inode; return 0; fput_fail: @@ -1054,16 +1701,53 @@ out_fail: static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - int retval = path_init(dfd, name, flags, nd); - if (!retval) - retval = path_walk(name, nd); - if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->path.dentry->d_inode)) - audit_inode(name, nd->path.dentry); + int retval; + + /* + * Path walking is largely split up into 2 different synchronisation + * schemes, rcu-walk and ref-walk (explained in + * Documentation/filesystems/path-lookup.txt). These share much of the + * path walk code, but some things particularly setup, cleanup, and + * following mounts are sufficiently divergent that functions are + * duplicated. Typically there is a function foo(), and its RCU + * analogue, foo_rcu(). + * + * -ECHILD is the error number of choice (just to avoid clashes) that + * is returned if some aspect of an rcu-walk fails. Such an error must + * be handled by restarting a traditional ref-walk (which will always + * be able to complete). 
+ */ + retval = path_init_rcu(dfd, name, flags, nd); + if (unlikely(retval)) + return retval; + retval = path_walk_rcu(name, nd); + path_finish_rcu(nd); if (nd->root.mnt) { path_put(&nd->root); nd->root.mnt = NULL; } + + if (unlikely(retval == -ECHILD || retval == -ESTALE)) { + /* slower, locked walk */ + if (retval == -ESTALE) + flags |= LOOKUP_REVAL; + retval = path_init(dfd, name, flags, nd); + if (unlikely(retval)) + return retval; + retval = path_walk(name, nd); + if (nd->root.mnt) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + } + + if (likely(!retval)) { + if (unlikely(!audit_dummy_context())) { + if (nd->path.dentry && nd->inode) + audit_inode(name, nd->path.dentry); + } + } + return retval; } @@ -1106,10 +1790,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, path_get(&nd->path); nd->root = nd->path; path_get(&nd->root); + nd->inode = nd->path.dentry->d_inode; retval = path_walk(name, nd); if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->path.dentry->d_inode)) + nd->inode)) audit_inode(name, nd->path.dentry); path_put(&nd->root); @@ -1125,7 +1810,7 @@ static struct dentry *__lookup_hash(struct qstr *name, struct dentry *dentry; int err; - err = exec_permission(inode); + err = exec_permission(inode, 0); if (err) return ERR_PTR(err); @@ -1133,8 +1818,8 @@ static struct dentry *__lookup_hash(struct qstr *name, * See if the low-level filesystem might want * to use its own hash.. 
*/ - if (base->d_op && base->d_op->d_hash) { - err = base->d_op->d_hash(base, name); + if (base->d_flags & DCACHE_OP_HASH) { + err = base->d_op->d_hash(base, inode, name); dentry = ERR_PTR(err); if (err < 0) goto out; @@ -1147,7 +1832,7 @@ static struct dentry *__lookup_hash(struct qstr *name, */ dentry = d_lookup(base, name); - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) + if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) dentry = do_revalidate(dentry, nd); if (!dentry) @@ -1448,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag) return break_lease(inode, flag); } -static int handle_truncate(struct path *path) +static int handle_truncate(struct file *filp) { + struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) @@ -1463,7 +2149,7 @@ static int handle_truncate(struct path *path) if (!error) { error = do_truncate(path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, - NULL); + filp); } put_write_access(inode); return error; @@ -1490,6 +2176,7 @@ out_unlock: mutex_unlock(&dir->d_inode->i_mutex); dput(nd->path.dentry); nd->path.dentry = path->dentry; + if (error) return error; /* Don't check for write permission, don't truncate */ @@ -1560,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd, } if (!IS_ERR(filp)) { if (will_truncate) { - error = handle_truncate(&nd->path); + error = handle_truncate(filp); if (error) { fput(filp); filp = ERR_PTR(error); @@ -1584,6 +2271,9 @@ exit: return ERR_PTR(error); } +/* + * Handle O_CREAT case for do_filp_open + */ static struct file *do_last(struct nameidata *nd, struct path *path, int open_flag, int acc_mode, int mode, const char *pathname) @@ -1597,50 +2287,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path, follow_dotdot(nd); dir = nd->path.dentry; case LAST_DOT: - if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { - if (!dir->d_op->d_revalidate(dir, nd)) { - error = 
-ESTALE; + if (need_reval_dot(dir)) { + int status = d_revalidate(nd->path.dentry, nd); + if (!status) + status = -ESTALE; + if (status < 0) { + error = status; goto exit; } } /* fallthrough */ case LAST_ROOT: - if (open_flag & O_CREAT) - goto exit; - /* fallthrough */ + goto exit; case LAST_BIND: audit_inode(pathname, dir); goto ok; } /* trailing slashes? */ - if (nd->last.name[nd->last.len]) { - if (open_flag & O_CREAT) - goto exit; - nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW; - } - - /* just plain open? */ - if (!(open_flag & O_CREAT)) { - error = do_lookup(nd, &nd->last, path); - if (error) - goto exit; - error = -ENOENT; - if (!path->dentry->d_inode) - goto exit_dput; - if (path->dentry->d_inode->i_op->follow_link) - return NULL; - error = -ENOTDIR; - if (nd->flags & LOOKUP_DIRECTORY) { - if (!path->dentry->d_inode->i_op->lookup) - goto exit_dput; - } - path_to_nameidata(path, nd); - audit_inode(pathname, nd->path.dentry); - goto ok; - } + if (nd->last.name[nd->last.len]) + goto exit; - /* OK, it's O_CREAT */ mutex_lock(&dir->d_inode->i_mutex); path->dentry = lookup_hash(nd); @@ -1697,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (open_flag & O_EXCL) goto exit_dput; - if (__follow_mount(path)) { - error = -ELOOP; - if (open_flag & O_NOFOLLOW) - goto exit_dput; - } + error = follow_managed(path, nd->flags); + if (error < 0) + goto exit_dput; error = -ENOENT; if (!path->dentry->d_inode) @@ -1711,8 +2376,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, return NULL; path_to_nameidata(path, nd); + nd->inode = path->dentry->d_inode; error = -EISDIR; - if (S_ISDIR(path->dentry->d_inode->i_mode)) + if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: filp = finish_open(nd, open_flag, acc_mode); @@ -1743,7 +2409,7 @@ struct file *do_filp_open(int dfd, const char *pathname, struct path path; int count = 0; int flag = open_to_namei_flags(open_flag); - int force_reval = 0; + int flags; if (!(open_flag & 
O_CREAT)) mode = 0; @@ -1772,54 +2438,84 @@ struct file *do_filp_open(int dfd, const char *pathname, if (open_flag & O_APPEND) acc_mode |= MAY_APPEND; - /* find the parent */ -reval: - error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); + flags = LOOKUP_OPEN; + if (open_flag & O_CREAT) { + flags |= LOOKUP_CREATE; + if (open_flag & O_EXCL) + flags |= LOOKUP_EXCL; + } + if (open_flag & O_DIRECTORY) + flags |= LOOKUP_DIRECTORY; + if (!(open_flag & O_NOFOLLOW)) + flags |= LOOKUP_FOLLOW; + + filp = get_empty_filp(); + if (!filp) + return ERR_PTR(-ENFILE); + + filp->f_flags = open_flag; + nd.intent.open.file = filp; + nd.intent.open.flags = flag; + nd.intent.open.create_mode = mode; + + if (open_flag & O_CREAT) + goto creat; + + /* !O_CREAT, simple open */ + error = do_path_lookup(dfd, pathname, flags, &nd); + if (unlikely(error)) + goto out_filp; + error = -ELOOP; + if (!(nd.flags & LOOKUP_FOLLOW)) { + if (nd.inode->i_op->follow_link) + goto out_path; + } + error = -ENOTDIR; + if (nd.flags & LOOKUP_DIRECTORY) { + if (!nd.inode->i_op->lookup) + goto out_path; + } + audit_inode(pathname, nd.path.dentry); + filp = finish_open(&nd, open_flag, acc_mode); + return filp; + +creat: + /* OK, have to create the file. Find the parent. 
*/ + error = path_init_rcu(dfd, pathname, + LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); if (error) - return ERR_PTR(error); - if (force_reval) - nd.flags |= LOOKUP_REVAL; + goto out_filp; + error = path_walk_rcu(pathname, &nd); + path_finish_rcu(&nd); + if (unlikely(error == -ECHILD || error == -ESTALE)) { + /* slower, locked walk */ + if (error == -ESTALE) { +reval: + flags |= LOOKUP_REVAL; + } + error = path_init(dfd, pathname, + LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); + if (error) + goto out_filp; - current->total_link_count = 0; - error = link_path_walk(pathname, &nd); - if (error) { - filp = ERR_PTR(error); - goto out; + error = path_walk_simple(pathname, &nd); } - if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) + if (unlikely(error)) + goto out_filp; + if (unlikely(!audit_dummy_context())) audit_inode(pathname, nd.path.dentry); /* * We have the parent and last component. */ - - error = -ENFILE; - filp = get_empty_filp(); - if (filp == NULL) - goto exit_parent; - nd.intent.open.file = filp; - filp->f_flags = open_flag; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_OPEN; - if (open_flag & O_CREAT) { - nd.flags |= LOOKUP_CREATE; - if (open_flag & O_EXCL) - nd.flags |= LOOKUP_EXCL; - } - if (open_flag & O_DIRECTORY) - nd.flags |= LOOKUP_DIRECTORY; - if (!(open_flag & O_NOFOLLOW)) - nd.flags |= LOOKUP_FOLLOW; + nd.flags = flags; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); while (unlikely(!filp)) { /* trailing symlink */ - struct path holder; - struct inode *inode = path.dentry->d_inode; + struct path link = path; + struct inode *linki = link.dentry->d_inode; void *cookie; error = -ELOOP; - /* S_ISDIR part is a temporary automount kludge */ - if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) + if (!(nd.flags & LOOKUP_FOLLOW)) goto exit_dput; if (count++ == 32) goto exit_dput; @@ -1835,41 +2531,37 @@ reval: * just set LAST_BIND. 
*/ nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(path.dentry, &nd); + error = security_inode_follow_link(link.dentry, &nd); if (error) goto exit_dput; - error = __do_follow_link(&path, &nd, &cookie); + error = __do_follow_link(&link, &nd, &cookie); if (unlikely(error)) { + if (!IS_ERR(cookie) && linki->i_op->put_link) + linki->i_op->put_link(link.dentry, &nd, cookie); /* nd.path had been dropped */ - if (!IS_ERR(cookie) && inode->i_op->put_link) - inode->i_op->put_link(path.dentry, &nd, cookie); - path_put(&path); - release_open_intent(&nd); - filp = ERR_PTR(error); - goto out; + nd.path = link; + goto out_path; } - holder = path; nd.flags &= ~LOOKUP_PARENT; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (inode->i_op->put_link) - inode->i_op->put_link(holder.dentry, &nd, cookie); - path_put(&holder); + if (linki->i_op->put_link) + linki->i_op->put_link(link.dentry, &nd, cookie); + path_put(&link); } out: if (nd.root.mnt) path_put(&nd.root); - if (filp == ERR_PTR(-ESTALE) && !force_reval) { - force_reval = 1; + if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) goto reval; - } return filp; exit_dput: path_put_conditional(&path, &nd); +out_path: + path_put(&nd.path); +out_filp: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); -exit_parent: - path_put(&nd.path); filp = ERR_PTR(error); goto out; } @@ -2130,12 +2822,10 @@ void dentry_unhash(struct dentry *dentry) { dget(dentry); shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) == 2) + if (dentry->d_count == 2) __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } int vfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -2884,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = { }; EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(follow_down_one); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); EXPORT_SYMBOL(get_write_access); /* binfmt_aout 
*/ diff --git a/fs/namespace.c b/fs/namespace.c index 3dbfc072ec7..7b0b9537169 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -138,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt) mnt->mnt_group_id = 0; } +/* + * vfsmount lock must be held for read + */ +static inline void mnt_add_count(struct vfsmount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_add(mnt->mnt_pcp->mnt_count, n); +#else + preempt_disable(); + mnt->mnt_count += n; + preempt_enable(); +#endif +} + +static inline void mnt_set_count(struct vfsmount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_write(mnt->mnt_pcp->mnt_count, n); +#else + mnt->mnt_count = n; +#endif +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_inc_count(struct vfsmount *mnt) +{ + mnt_add_count(mnt, 1); +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_dec_count(struct vfsmount *mnt) +{ + mnt_add_count(mnt, -1); +} + +/* + * vfsmount lock must be held for write + */ +unsigned int mnt_get_count(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; + } + + return count; +#else + return mnt->mnt_count; +#endif +} + struct vfsmount *alloc_vfsmnt(const char *name) { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -154,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name) goto out_free_id; } - atomic_set(&mnt->mnt_count, 1); +#ifdef CONFIG_SMP + mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); + if (!mnt->mnt_pcp) + goto out_free_devname; + + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#else + mnt->mnt_count = 1; + mnt->mnt_writers = 0; +#endif + INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); @@ -166,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name) #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif -#ifdef CONFIG_SMP - mnt->mnt_writers = 
alloc_percpu(int); - if (!mnt->mnt_writers) - goto out_free_devname; -#else - mnt->mnt_writers = 0; -#endif } return mnt; @@ -216,32 +277,32 @@ int __mnt_is_readonly(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(__mnt_is_readonly); -static inline void inc_mnt_writers(struct vfsmount *mnt) +static inline void mnt_inc_writers(struct vfsmount *mnt) { #ifdef CONFIG_SMP - (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; + this_cpu_inc(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers++; #endif } -static inline void dec_mnt_writers(struct vfsmount *mnt) +static inline void mnt_dec_writers(struct vfsmount *mnt) { #ifdef CONFIG_SMP - (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; + this_cpu_dec(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers--; #endif } -static unsigned int count_mnt_writers(struct vfsmount *mnt) +static unsigned int mnt_get_writers(struct vfsmount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { - count += *per_cpu_ptr(mnt->mnt_writers, cpu); + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; } return count; @@ -273,9 +334,9 @@ int mnt_want_write(struct vfsmount *mnt) int ret = 0; preempt_disable(); - inc_mnt_writers(mnt); + mnt_inc_writers(mnt); /* - * The store to inc_mnt_writers must be visible before we pass + * The store to mnt_inc_writers must be visible before we pass * MNT_WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set MNT_WRITE_HOLD. 
*/ @@ -289,7 +350,7 @@ int mnt_want_write(struct vfsmount *mnt) */ smp_rmb(); if (__mnt_is_readonly(mnt)) { - dec_mnt_writers(mnt); + mnt_dec_writers(mnt); ret = -EROFS; goto out; } @@ -317,7 +378,7 @@ int mnt_clone_write(struct vfsmount *mnt) if (__mnt_is_readonly(mnt)) return -EROFS; preempt_disable(); - inc_mnt_writers(mnt); + mnt_inc_writers(mnt); preempt_enable(); return 0; } @@ -351,7 +412,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file); void mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); - dec_mnt_writers(mnt); + mnt_dec_writers(mnt); preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_drop_write); @@ -384,7 +445,7 @@ static int mnt_make_readonly(struct vfsmount *mnt) * MNT_WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ - if (count_mnt_writers(mnt) > 0) + if (mnt_get_writers(mnt) > 0) ret = -EBUSY; else mnt->mnt_flags |= MNT_READONLY; @@ -418,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt) kfree(mnt->mnt_devname); mnt_free_id(mnt); #ifdef CONFIG_SMP - free_percpu(mnt->mnt_writers); + free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } @@ -492,6 +553,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } /* + * Clear dentry's mounted state if it has no remaining mounts. + * vfsmount_lock must be held for write. 
+ */ +static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry) +{ + unsigned u; + + for (u = 0; u < HASH_SIZE; u++) { + struct vfsmount *p; + + list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { + if (p->mnt_mountpoint == dentry) + return; + } + } + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); +} + +/* * vfsmount lock must be held for write */ static void detach_mnt(struct vfsmount *mnt, struct path *old_path) @@ -502,7 +584,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path) mnt->mnt_mountpoint = mnt->mnt_root; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_hash); - old_path->dentry->d_mounted--; + dentry_reset_mounted(old_path->mnt, old_path->dentry); } /* @@ -513,7 +595,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, { child_mnt->mnt_parent = mntget(mnt); child_mnt->mnt_mountpoint = dget(dentry); - dentry->d_mounted++; + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); } /* @@ -527,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path) list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); } +static inline void __mnt_make_longterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_inc(&mnt->mnt_longterm); +#endif +} + +/* needs vfsmount lock for write */ +static inline void __mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_dec(&mnt->mnt_longterm); +#endif +} + /* * vfsmount lock must be held for write */ @@ -540,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) + list_for_each_entry(m, &head, mnt_list) { m->mnt_ns = n; + __mnt_make_longterm(m); + } + list_splice(&head, n->list.prev); list_add_tail(&mnt->mnt_hash, mount_hashtable + @@ -629,9 +731,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry 
*root, return NULL; } -static inline void __mntput(struct vfsmount *mnt) +static inline void mntfree(struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; + /* * This probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this @@ -639,38 +742,69 @@ static inline void __mntput(struct vfsmount *mnt) * to make r/w->r/o transitions. */ /* - * atomic_dec_and_lock() used to deal with ->mnt_count decrements - * provides barriers, so count_mnt_writers() below is safe. AV + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. */ - WARN_ON(count_mnt_writers(mnt)); + WARN_ON(mnt_get_writers(mnt)); fsnotify_vfsmount_delete(mnt); dput(mnt->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); } -void mntput_no_expire(struct vfsmount *mnt) +static void mntput_no_expire(struct vfsmount *mnt) { -repeat: - if (atomic_add_unless(&mnt->mnt_count, -1, 1)) +put_again: +#ifdef CONFIG_SMP + br_read_lock(vfsmount_lock); + if (likely(atomic_read(&mnt->mnt_longterm))) { + mnt_dec_count(mnt); + br_read_unlock(vfsmount_lock); return; + } + br_read_unlock(vfsmount_lock); + br_write_lock(vfsmount_lock); - if (!atomic_dec_and_test(&mnt->mnt_count)) { + mnt_dec_count(mnt); + if (mnt_get_count(mnt)) { br_write_unlock(vfsmount_lock); return; } - if (likely(!mnt->mnt_pinned)) { - br_write_unlock(vfsmount_lock); - __mntput(mnt); +#else + mnt_dec_count(mnt); + if (likely(mnt_get_count(mnt))) return; + br_write_lock(vfsmount_lock); +#endif + if (unlikely(mnt->mnt_pinned)) { + mnt_add_count(mnt, mnt->mnt_pinned + 1); + mnt->mnt_pinned = 0; + br_write_unlock(vfsmount_lock); + acct_auto_close_mnt(mnt); + goto put_again; } - atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); - mnt->mnt_pinned = 0; br_write_unlock(vfsmount_lock); - acct_auto_close_mnt(mnt); - goto repeat; + mntfree(mnt); } -EXPORT_SYMBOL(mntput_no_expire); + +void mntput(struct vfsmount *mnt) +{ + if (mnt) { + /* avoid cacheline pingpong, hope gcc doesn't 
get "smart" */ + if (unlikely(mnt->mnt_expiry_mark)) + mnt->mnt_expiry_mark = 0; + mntput_no_expire(mnt); + } +} +EXPORT_SYMBOL(mntput); + +struct vfsmount *mntget(struct vfsmount *mnt) +{ + if (mnt) + mnt_inc_count(mnt); + return mnt; +} +EXPORT_SYMBOL(mntget); void mnt_pin(struct vfsmount *mnt) { @@ -678,19 +812,17 @@ void mnt_pin(struct vfsmount *mnt) mnt->mnt_pinned++; br_write_unlock(vfsmount_lock); } - EXPORT_SYMBOL(mnt_pin); void mnt_unpin(struct vfsmount *mnt) { br_write_lock(vfsmount_lock); if (mnt->mnt_pinned) { - atomic_inc(&mnt->mnt_count); + mnt_inc_count(mnt); mnt->mnt_pinned--; } br_write_unlock(vfsmount_lock); } - EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) @@ -985,12 +1117,13 @@ int may_umount_tree(struct vfsmount *mnt) int minimum_refs = 0; struct vfsmount *p; - br_read_lock(vfsmount_lock); + /* write lock needed for mnt_get_count */ + br_write_lock(vfsmount_lock); for (p = mnt; p; p = next_mnt(p, mnt)) { - actual_refs += atomic_read(&p->mnt_count); + actual_refs += mnt_get_count(p); minimum_refs += 2; } - br_read_unlock(vfsmount_lock); + br_write_unlock(vfsmount_lock); if (actual_refs > minimum_refs) return 0; @@ -1017,10 +1150,10 @@ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); - br_read_lock(vfsmount_lock); + br_write_lock(vfsmount_lock); if (propagate_mount_busy(mnt, 2)) ret = 0; - br_read_unlock(vfsmount_lock); + br_write_unlock(vfsmount_lock); up_read(&namespace_sem); return ret; } @@ -1057,26 +1190,29 @@ void release_mounts(struct list_head *head) */ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { + LIST_HEAD(tmp_list); struct vfsmount *p; for (p = mnt; p; p = next_mnt(p, mnt)) - list_move(&p->mnt_hash, kill); + list_move(&p->mnt_hash, &tmp_list); if (propagate) - propagate_umount(kill); + propagate_umount(&tmp_list); - list_for_each_entry(p, kill, mnt_hash) { + list_for_each_entry(p, &tmp_list, mnt_hash) { 
list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; + __mnt_make_shortterm(p); list_del_init(&p->mnt_child); if (p->mnt_parent != p) { p->mnt_parent->mnt_ghosts++; - p->mnt_mountpoint->d_mounted--; + dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint); } change_mnt_propagation(p, MS_PRIVATE); } + list_splice(&tmp_list, kill); } static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); @@ -1102,8 +1238,16 @@ static int do_umount(struct vfsmount *mnt, int flags) flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; - if (atomic_read(&mnt->mnt_count) != 2) + /* + * probably don't strictly need the lock here if we examined + * all race cases, but it's a slowpath. + */ + br_write_lock(vfsmount_lock); + if (mnt_get_count(mnt) != 2) { + br_write_unlock(vfsmount_lock); return -EBUSY; + } + br_write_unlock(vfsmount_lock); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; @@ -1667,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name) return err; down_write(&namespace_sem); - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto out; + err = -EINVAL; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; @@ -1727,6 +1872,8 @@ out: return err; } +static int do_add_mount(struct vfsmount *, struct path *, int); + /* * create a new mount for userspace and request it to be added into the * namespace's tree @@ -1735,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags, int mnt_flags, char *name, void *data) { struct vfsmount *mnt; + int err; if (!type) return -EINVAL; @@ -1747,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); - return do_add_mount(mnt, path, mnt_flags, NULL); + err = do_add_mount(mnt, path, mnt_flags); + if (err) + mntput(mnt); + return err; +} + +int finish_automount(struct vfsmount *m, struct path 
*path) +{ + int err; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(m) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == path->dentry) { + err = -ELOOP; + goto fail; + } + + err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (!err) + return 0; +fail: + /* remove m from any expiration list it may be on */ + if (!list_empty(&m->mnt_expire)) { + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + list_del_init(&m->mnt_expire); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + } + mntput(m); + mntput(m); + return err; } /* * add a mount into a namespace's mount tree - * - provide the option of adding the new mount to an expiration list */ -int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist) +static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) { int err; @@ -1763,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, down_write(&namespace_sem); /* Something was mounted here while we slept */ - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto unlock; + err = -EINVAL; if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) goto unlock; @@ -1781,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, goto unlock; newmnt->mnt_flags = mnt_flags; - if ((err = graft_tree(newmnt, path))) - goto unlock; - - if (fslist) /* add to the specified expiration list */ - list_add_tail(&newmnt->mnt_expire, fslist); - - up_write(&namespace_sem); - return 0; + err = graft_tree(newmnt, path); unlock: up_write(&namespace_sem); - mntput(newmnt); return err; } -EXPORT_SYMBOL_GPL(do_add_mount); +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. 
+ */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + list_add_tail(&mnt->mnt_expire, expiry_list); + + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); +} +EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any @@ -2085,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void) return new_ns; } +void mnt_make_longterm(struct vfsmount *mnt) +{ + __mnt_make_longterm(mnt); +} + +void mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) + return; + br_write_lock(vfsmount_lock); + atomic_dec(&mnt->mnt_longterm); + br_write_unlock(vfsmount_lock); +#endif +} + /* * Allocate a new namespace structure and populate it with contents * copied from the namespace of the passed in task structure. @@ -2122,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, q = new_ns->root; while (p) { q->mnt_ns = new_ns; + __mnt_make_longterm(q); if (fs) { if (p == fs->root.mnt) { - rootmnt = p; fs->root.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); + rootmnt = p; } if (p == fs->pwd.mnt) { - pwdmnt = p; fs->pwd.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); + pwdmnt = p; } } p = next_mnt(p, mnt_ns->root); @@ -2173,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) new_ns = alloc_mnt_ns(); if (!IS_ERR(new_ns)) { mnt->mnt_ns = new_ns; + __mnt_make_longterm(mnt); new_ns->root = mnt; list_add(&new_ns->list, &new_ns->root->mnt_list); } @@ -2327,6 +2537,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, touch_mnt_namespace(current->nsproxy->mnt_ns); br_write_unlock(vfsmount_lock); chroot_fs_refs(&root, &new); + error = 0; path_put(&root_parent); path_put(&parent_path); @@ -2353,6 +2564,7 @@ static void __init init_mount_tree(void) mnt = do_kern_mount("rootfs", 0, "rootfs", 
NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); + ns = create_mnt_ns(mnt); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f22b12e7d33..f6946bb5cb5 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -17,12 +17,11 @@ #include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/mm.h> +#include <linux/namei.h> #include <asm/uaccess.h> #include <asm/byteorder.h> -#include <linux/ncp_fs.h> - -#include "ncplib_kernel.h" +#include "ncp_fs.h" static void ncp_read_volume_list(struct file *, void *, filldir_t, struct ncp_cache_control *); @@ -74,11 +73,14 @@ const struct inode_operations ncp_dir_inode_operations = * Dentry operations routines */ static int ncp_lookup_validate(struct dentry *, struct nameidata *); -static int ncp_hash_dentry(struct dentry *, struct qstr *); -static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); -static int ncp_delete_dentry(struct dentry *); - -static const struct dentry_operations ncp_dentry_operations = +static int ncp_hash_dentry(const struct dentry *, const struct inode *, + struct qstr *); +static int ncp_compare_dentry(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); +static int ncp_delete_dentry(const struct dentry *); + +const struct dentry_operations ncp_dentry_operations = { .d_revalidate = ncp_lookup_validate, .d_hash = ncp_hash_dentry, @@ -86,14 +88,6 @@ static const struct dentry_operations ncp_dentry_operations = .d_delete = ncp_delete_dentry, }; -const struct dentry_operations ncp_root_dentry_operations = -{ - .d_hash = ncp_hash_dentry, - .d_compare = ncp_compare_dentry, - .d_delete = ncp_delete_dentry, -}; - - #define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator) @@ -113,10 +107,10 @@ static inline int ncp_preserve_entry_case(struct 
inode *i, __u32 nscreator) #define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) -static inline int ncp_case_sensitive(struct dentry *dentry) +static inline int ncp_case_sensitive(const struct inode *i) { #ifdef CONFIG_NCPFS_NFS_NS - return ncp_namespace(dentry->d_inode) == NW_NS_NFS; + return ncp_namespace(i) == NW_NS_NFS; #else return 0; #endif /* CONFIG_NCPFS_NFS_NS */ @@ -127,14 +121,16 @@ static inline int ncp_case_sensitive(struct dentry *dentry) * is case-sensitive. */ static int -ncp_hash_dentry(struct dentry *dentry, struct qstr *this) +ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, + struct qstr *this) { - if (!ncp_case_sensitive(dentry)) { + if (!ncp_case_sensitive(inode)) { + struct super_block *sb = dentry->d_sb; struct nls_table *t; unsigned long hash; int i; - t = NCP_IO_TABLE(dentry); + t = NCP_IO_TABLE(sb); hash = init_name_hash(); for (i=0; i<this->len ; i++) hash = partial_name_hash(ncp_tolower(t, this->name[i]), @@ -145,15 +141,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this) } static int -ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) +ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - if (a->len != b->len) + if (len != name->len) return 1; - if (ncp_case_sensitive(dentry)) - return strncmp(a->name, b->name, a->len); + if (ncp_case_sensitive(pinode)) + return strncmp(str, name->name, len); - return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); + return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len); } /* @@ -162,7 +160,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) * Closing files can be safely postponed until iput() - it's done there anyway. 
*/ static int -ncp_delete_dentry(struct dentry * dentry) +ncp_delete_dentry(const struct dentry * dentry) { struct inode *inode = dentry->d_inode; @@ -301,6 +299,12 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) int res, val = 0, len; __u8 __name[NCP_MAXPATHLEN + 1]; + if (dentry == dentry->d_sb->s_root) + return 1; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + parent = dget_parent(dentry); dir = parent->d_inode; @@ -384,21 +388,21 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) } /* If a pointer is invalid, we search the dentry. */ - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { dent = list_entry(next, struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) - dget_locked(dent); + dget(dent); else dent = NULL; - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); goto out; } next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); return NULL; out: @@ -592,7 +596,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, qname.hash = full_name_hash(qname.name, qname.len); if (dentry->d_op && dentry->d_op->d_hash) - if (dentry->d_op->d_hash(dentry, &qname) != 0) + if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0) goto end_advance; newdent = d_lookup(dentry, &qname); @@ -611,35 +615,12 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, shrink_dcache_parent(newdent); /* - * It is not as dangerous as it looks. NetWare's OS2 namespace is - * case preserving yet case insensitive. So we update dentry's name - * as received from server. We found dentry via d_lookup with our - * hash, so we know that hash does not change, and so replacing name - * should be reasonably safe. + * NetWare's OS2 namespace is case preserving yet case + * insensitive. So we update dentry's name as received from + * server. 
Parent dir's i_mutex is locked because we're in + * readdir. */ - if (qname.len == newdent->d_name.len && - memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) { - struct inode *inode = newdent->d_inode; - - /* - * Inside ncpfs all uses of d_name are either for debugging, - * or on functions which acquire inode mutex (mknod, creat, - * lookup). So grab i_mutex here, to be sure. d_path - * uses dcache_lock when generating path, so we should too. - * And finally d_compare is protected by dentry's d_lock, so - * here we go. - */ - if (inode) - mutex_lock(&inode->i_mutex); - spin_lock(&dcache_lock); - spin_lock(&newdent->d_lock); - memcpy((char *) newdent->d_name.name, qname.name, - newdent->d_name.len); - spin_unlock(&newdent->d_lock); - spin_unlock(&dcache_lock); - if (inode) - mutex_unlock(&inode->i_mutex); - } + dentry_update_name_case(newdent, &qname); } if (!newdent->d_inode) { @@ -649,7 +630,6 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, entry->ino = iunique(dir->i_sb, 2); inode = ncp_iget(dir->i_sb, entry); if (inode) { - newdent->d_op = &ncp_dentry_operations; d_instantiate(newdent, inode); if (!hashed) d_rehash(newdent); @@ -657,7 +637,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, } else { struct inode *inode = newdent->d_inode; - mutex_lock(&inode->i_mutex); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); mutex_unlock(&inode->i_mutex); } @@ -905,7 +885,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc if (inode) { ncp_new_dentry(dentry); add_entry: - dentry->d_op = &ncp_dentry_operations; d_add(dentry, inode); error = 0; } diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index cb50aaf981d..0ed65e0c3df 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -18,8 +18,7 @@ #include <linux/vmalloc.h> #include <linux/sched.h> -#include <linux/ncp_fs.h> -#include "ncplib_kernel.h" +#include "ncp_fs.h" static int ncp_fsync(struct file 
*file, int datasync) { diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 8fb93b604e7..00a1d1c3d3a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -29,12 +29,11 @@ #include <linux/vfs.h> #include <linux/mount.h> #include <linux/seq_file.h> - -#include <linux/ncp_fs.h> +#include <linux/namei.h> #include <net/sock.h> -#include "ncplib_kernel.h" +#include "ncp_fs.h" #include "getopt.h" #define NCP_DEFAULT_FILE_MODE 0600 @@ -58,11 +57,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ncp_destroy_inode(struct inode *inode) +static void ncp_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); } +static void ncp_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ncp_i_callback); +} + static void init_once(void *foo) { struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; @@ -309,7 +315,12 @@ static void ncp_stop_tasks(struct ncp_server *server) { sk->sk_write_space = server->write_space; release_sock(sk); del_timer_sync(&server->timeout_tm); - flush_scheduled_work(); + + flush_work_sync(&server->rcv.tq); + if (sk->sk_socket->type == SOCK_STREAM) + flush_work_sync(&server->tx.tq); + else + flush_work_sync(&server->timeout_tq); } static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt) @@ -531,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_blocksize_bits = 10; sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; + sb->s_d_op = &ncp_dentry_operations; sb->s_bdi = &server->bdi; server = NCP_SBP(sb); @@ -710,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) goto out_no_root; - sb->s_root->d_op = &ncp_root_dentry_operations; return 0; out_no_root: diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 
d40a547e337..790e92a9ec6 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,11 +20,9 @@ #include <linux/vmalloc.h> #include <linux/sched.h> -#include <linux/ncp_fs.h> - #include <asm/uaccess.h> -#include "ncplib_kernel.h" +#include "ncp_fs.h" /* maximum limit for ncp_objectname_ioctl */ #define NCP_OBJECT_NAME_MAX_LEN 4096 diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 56f5b3a0e1e..a7c07b44b10 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -16,12 +16,12 @@ #include <linux/mman.h> #include <linux/string.h> #include <linux/fcntl.h> -#include <linux/ncp_fs.h> -#include "ncplib_kernel.h" #include <asm/uaccess.h> #include <asm/system.h> +#include "ncp_fs.h" + /* * Fill in the supplied page for mmap * XXX: how are we excluding truncate/invalidate here? Maybe need to lock diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h new file mode 100644 index 00000000000..31831afe1c3 --- /dev/null +++ b/fs/ncpfs/ncp_fs.h @@ -0,0 +1,98 @@ +#include <linux/ncp_fs.h> +#include "ncp_fs_i.h" +#include "ncp_fs_sb.h" + +/* define because it is easy to change PRINTK to {*}PRINTK */ +#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) + +#undef NCPFS_PARANOIA +#ifdef NCPFS_PARANOIA +#define PPRINTK(format, args...) PRINTK(format , ## args) +#else +#define PPRINTK(format, args...) +#endif + +#ifndef DEBUG_NCP +#define DEBUG_NCP 0 +#endif +#if DEBUG_NCP > 0 +#define DPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DPRINTK(format, args...) +#endif +#if DEBUG_NCP > 1 +#define DDPRINTK(format, args...) PRINTK(format , ## args) +#else +#define DDPRINTK(format, args...) 
+#endif + +#define NCP_MAX_RPC_TIMEOUT (6*HZ) + + +struct ncp_entry_info { + struct nw_info_struct i; + ino_t ino; + int opened; + int access; + unsigned int volume; + __u8 file_handle[6]; +}; + +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) +{ + return container_of(inode, struct ncp_inode_info, vfs_inode); +} + +/* linux/fs/ncpfs/inode.c */ +int ncp_notify_change(struct dentry *, struct iattr *); +struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *); +void ncp_update_inode(struct inode *, struct ncp_entry_info *); +void ncp_update_inode2(struct inode *, struct ncp_entry_info *); + +/* linux/fs/ncpfs/dir.c */ +extern const struct inode_operations ncp_dir_inode_operations; +extern const struct file_operations ncp_dir_operations; +extern const struct dentry_operations ncp_dentry_operations; +int ncp_conn_logged_in(struct super_block *); +int ncp_date_dos2unix(__le16 time, __le16 date); +void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date); + +/* linux/fs/ncpfs/ioctl.c */ +long ncp_ioctl(struct file *, unsigned int, unsigned long); +long ncp_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* linux/fs/ncpfs/sock.c */ +int ncp_request2(struct ncp_server *server, int function, + void* reply, int max_reply_size); +static inline int ncp_request(struct ncp_server *server, int function) { + return ncp_request2(server, function, server->packet, server->packet_size); +} +int ncp_connect(struct ncp_server *server); +int ncp_disconnect(struct ncp_server *server); +void ncp_lock_server(struct ncp_server *server); +void ncp_unlock_server(struct ncp_server *server); + +/* linux/fs/ncpfs/symlink.c */ +#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) +extern const struct address_space_operations ncp_symlink_aops; +int ncp_symlink(struct inode*, 
struct dentry*, const char*); +#endif + +/* linux/fs/ncpfs/file.c */ +extern const struct inode_operations ncp_file_inode_operations; +extern const struct file_operations ncp_file_operations; +int ncp_make_open(struct inode *, int); + +/* linux/fs/ncpfs/mmap.c */ +int ncp_mmap(struct file *, struct vm_area_struct *); + +/* linux/fs/ncpfs/ncplib_kernel.c */ +int ncp_make_closed(struct inode *); + +#include "ncplib_kernel.h" diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h new file mode 100644 index 00000000000..4b0bec47784 --- /dev/null +++ b/fs/ncpfs/ncp_fs_i.h @@ -0,0 +1,29 @@ +/* + * ncp_fs_i.h + * + * Copyright (C) 1995 Volker Lendecke + * + */ + +#ifndef _LINUX_NCP_FS_I +#define _LINUX_NCP_FS_I + +/* + * This is the ncpfs part of the inode structure. This must contain + * all the information we need to work with an inode after creation. + */ +struct ncp_inode_info { + __le32 dirEntNum; + __le32 DosDirNum; + __u8 volNumber; + __le32 nwattr; + struct mutex open_mutex; + atomic_t opened; + int access; + int flags; +#define NCPI_KLUDGE_SYMLINK 0x0001 + __u8 file_handle[6]; + struct inode vfs_inode; +}; + +#endif /* _LINUX_NCP_FS_I */ diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h new file mode 100644 index 00000000000..4af803f1351 --- /dev/null +++ b/fs/ncpfs/ncp_fs_sb.h @@ -0,0 +1,176 @@ +/* + * ncp_fs_sb.h + * + * Copyright (C) 1995, 1996 by Volker Lendecke + * + */ + +#ifndef _NCP_FS_SB +#define _NCP_FS_SB + +#include <linux/types.h> +#include <linux/ncp_mount.h> +#include <linux/net.h> +#include <linux/mutex.h> +#include <linux/backing-dev.h> +#include <linux/workqueue.h> + +#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ + +struct sock; + +struct ncp_mount_data_kernel { + unsigned long flags; /* NCP_MOUNT_* flags */ + unsigned int int_flags; /* internal flags */ +#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 + __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? 
*/ + struct pid *wdog_pid; /* Who cares for our watchdog packets? */ + unsigned int ncp_fd; /* The socket to the ncp port */ + unsigned int time_out; /* How long should I wait after + sending a NCP request? */ + unsigned int retry_count; /* And how often should I retry? */ + unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; + __kernel_uid32_t uid; + __kernel_gid32_t gid; + __kernel_mode_t file_mode; + __kernel_mode_t dir_mode; + int info_fd; +}; + +struct ncp_server { + + struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of + interest for us later, so we store + it completely. */ + + __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; + + struct file *ncp_filp; /* File pointer to ncp socket */ + struct socket *ncp_sock;/* ncp socket */ + struct file *info_filp; + struct socket *info_sock; + + u8 sequence; + u8 task; + u16 connection; /* Remote connection number */ + + u8 completion; /* Status message from server */ + u8 conn_status; /* Bit 4 = 1 ==> Server going down, no + requests allowed anymore. + Bit 0 = 1 ==> Server is down. */ + + int buffer_size; /* Negotiated bufsize */ + + int reply_size; /* Size of last reply */ + + int packet_size; + unsigned char *packet; /* Here we prepare requests and + receive replies */ + unsigned char *txbuf; /* Storage for current request */ + unsigned char *rxbuf; /* Storage for reply to current request */ + + int lock; /* To prevent mismatch in protocols. */ + struct mutex mutex; + + int current_size; /* for packet preparation */ + int has_subfunction; + int ncp_reply_size; + + int root_setuped; + struct mutex root_setup_lock; + + /* info for packet signing */ + int sign_wanted; /* 1=Server needs signed packets */ + int sign_active; /* 0=don't do signing, 1=do */ + char sign_root[8]; /* generated from password and encr. 
key */ + char sign_last[16]; + + /* Authentication info: NDS or BINDERY, username */ + struct { + int auth_type; + size_t object_name_len; + void* object_name; + int object_type; + } auth; + /* Password info */ + struct { + size_t len; + void* data; + } priv; + struct rw_semaphore auth_rwsem; + + /* nls info: codepage for volume and charset for I/O */ + struct nls_table *nls_vol; + struct nls_table *nls_io; + + /* maximum age in jiffies */ + atomic_t dentry_ttl; + + /* miscellaneous */ + unsigned int flags; + + spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ + + void (*data_ready)(struct sock* sk, int len); + void (*error_report)(struct sock* sk); + void (*write_space)(struct sock* sk); /* STREAM mode only */ + struct { + struct work_struct tq; /* STREAM/DGRAM: data/error ready */ + struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ + struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */ + + unsigned int state; /* STREAM only: receiver state */ + struct { + __u32 magic __packed; + __u32 len __packed; + __u16 type __packed; + __u16 p1 __packed; + __u16 p2 __packed; + __u16 p3 __packed; + __u16 type2 __packed; + } buf; /* STREAM only: temporary buffer */ + unsigned char* ptr; /* STREAM only: pointer to data */ + size_t len; /* STREAM only: length of data to receive */ + } rcv; + struct { + struct list_head requests; /* STREAM only: queued requests */ + struct work_struct tq; /* STREAM only: transmitter ready */ + struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ + } tx; + struct timer_list timeout_tm; /* DGRAM only: timeout timer */ + struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ + int timeout_last; /* DGRAM only: current timeout length */ + int timeout_retries; /* DGRAM only: retries left */ + struct { + size_t len; + __u8 data[128]; + } unexpected_packet; + struct backing_dev_info 
bdi; +}; + +extern void ncp_tcp_rcv_proc(struct work_struct *work); +extern void ncp_tcp_tx_proc(struct work_struct *work); +extern void ncpdgram_rcv_proc(struct work_struct *work); +extern void ncpdgram_timeout_proc(struct work_struct *work); +extern void ncpdgram_timeout_call(unsigned long server); +extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_write_space(struct sock* sk); +extern void ncp_tcp_error_report(struct sock* sk); + +#define NCP_FLAG_UTF8 1 + +#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag)) +#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag)) +#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag)) + +static inline int ncp_conn_valid(struct ncp_server *server) +{ + return ((server->conn_status & 0x11) == 0); +} + +static inline void ncp_invalidate_conn(struct ncp_server *server) +{ + server->conn_status |= 0x01; +} + +#endif diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index a95615a0b6a..981a95617fc 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -11,7 +11,7 @@ -#include "ncplib_kernel.h" +#include "ncp_fs.h" static inline void assert_server_locked(struct ncp_server *server) { diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 3c57eca634c..09881e6aa5a 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -32,8 +32,6 @@ #include <linux/ctype.h> #endif /* CONFIG_NCPFS_NLS */ -#include <linux/ncp_fs.h> - #define NCP_MIN_SYMLINK_SIZE 8 #define NCP_MAX_SYMLINK_SIZE 512 @@ -135,7 +133,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *, const unsigned char *, unsigned int, int); #define NCP_ESC ':' -#define NCP_IO_TABLE(dentry) (NCP_SERVER((dentry)->d_inode)->nls_io) +#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io) #define ncp_tolower(t, c) nls_tolower(t, c) #define ncp_toupper(t, c) nls_toupper(t, c) #define ncp_strnicmp(t, s1, s2, len) \ @@ -150,15 +148,15 @@ int ncp__io2vol(unsigned char *, 
unsigned int *, int ncp__vol2io(unsigned char *, unsigned int *, const unsigned char *, unsigned int, int); -#define NCP_IO_TABLE(dentry) NULL +#define NCP_IO_TABLE(sb) NULL #define ncp_tolower(t, c) tolower(c) #define ncp_toupper(t, c) toupper(c) #define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) #define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) -static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1, - const unsigned char *s2, int len) +static inline int ncp_strnicmp(const struct nls_table *t, + const unsigned char *s1, const unsigned char *s2, int len) { while (len--) { if (tolower(*s1++) != tolower(*s2++)) @@ -193,7 +191,7 @@ ncp_renew_dentries(struct dentry *parent) struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { dentry = list_entry(next, struct dentry, d_u.d_child); @@ -205,7 +203,7 @@ ncp_renew_dentries(struct dentry *parent) next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); } static inline void @@ -215,7 +213,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent) struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { dentry = list_entry(next, struct dentry, d_u.d_child); @@ -223,7 +221,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent) ncp_age_dentry(server, dentry); next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); } struct ncp_cache_head { diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c index d8b2d7e6910..08907599dcd 100644 --- a/fs/ncpfs/ncpsign_kernel.c +++ b/fs/ncpfs/ncpsign_kernel.c @@ -11,6 +11,7 @@ #include <linux/string.h> #include <linux/ncp.h> #include <linux/bitops.h> +#include "ncp_fs.h" #include "ncpsign_kernel.h" /* i386: 32-bit, little endian, handles mis-alignment */ diff --git 
a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h index 6451a68381c..d9a1438bb1f 100644 --- a/fs/ncpfs/ncpsign_kernel.h +++ b/fs/ncpfs/ncpsign_kernel.h @@ -8,8 +8,6 @@ #ifndef _NCPSIGN_KERNEL_H #define _NCPSIGN_KERNEL_H -#include <linux/ncp_fs.h> - #ifdef CONFIG_NCPFS_PACKET_SIGNING void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 668bd267346..3a1587222c8 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -28,7 +28,7 @@ #include <linux/poll.h> #include <linux/file.h> -#include <linux/ncp_fs.h> +#include "ncp_fs.h" #include "ncpsign_kernel.h" diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index c634fd17b33..661f861d80c 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -25,13 +25,11 @@ #include <linux/errno.h> #include <linux/fs.h> -#include <linux/ncp_fs.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/stat.h> -#include "ncplib_kernel.h" - +#include "ncp_fs.h" /* these magic numbers must appear in the symlink file -- this makes it a bit more resilient against the magic attributes being set on random files. */ diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 93a8b3bd69e..199016528fc 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -16,9 +16,7 @@ #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/sunrpc/svcauth_gss.h> -#if defined(CONFIG_NFS_V4_1) #include <linux/sunrpc/bc_xprt.h> -#endif #include <net/inet_sock.h> @@ -137,6 +135,33 @@ out_err: #if defined(CONFIG_NFS_V4_1) /* + * * CB_SEQUENCE operations will fail until the callback sessionid is set. 
+ * */ +int nfs4_set_callback_sessionid(struct nfs_client *clp) +{ + struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv; + struct nfs4_sessionid *bc_sid; + + if (!serv->sv_bc_xprt) + return -EINVAL; + + /* on success freed in xprt_free */ + bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL); + if (!bc_sid) + return -ENOMEM; + memcpy(bc_sid->data, &clp->cl_session->sess_id.data, + NFS4_MAX_SESSIONID_LEN); + spin_lock_bh(&serv->sv_cb_lock); + serv->sv_bc_xprt->xpt_bc_sid = bc_sid; + spin_unlock_bh(&serv->sv_cb_lock); + dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__, + ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1], + ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3], + serv->sv_bc_xprt); + return 0; +} + +/* * The callback service for NFSv4.1 callbacks */ static int @@ -177,30 +202,38 @@ nfs41_callback_svc(void *vrqstp) struct svc_rqst * nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) { - struct svc_xprt *bc_xprt; - struct svc_rqst *rqstp = ERR_PTR(-ENOMEM); + struct svc_rqst *rqstp; + int ret; - dprintk("--> %s\n", __func__); - /* Create a svc_sock for the service */ - bc_xprt = svc_sock_create(serv, xprt->prot); - if (!bc_xprt) + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. 
+ * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); + if (ret < 0) { + rqstp = ERR_PTR(ret); goto out; + } /* * Save the svc_serv in the transport so that it can * be referenced when the session backchannel is initialized */ - serv->bc_xprt = bc_xprt; xprt->bc_serv = serv; INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); - if (IS_ERR(rqstp)) - svc_sock_destroy(bc_xprt); + if (IS_ERR(rqstp)) { + svc_xprt_put(serv->sv_bc_xprt); + serv->sv_bc_xprt = NULL; + } out: - dprintk("--> %s return %p\n", __func__, rqstp); + dprintk("--> %s return %ld\n", __func__, + IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); return rqstp; } @@ -233,6 +266,10 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct nfs_callback_data *cb_info) { } +int nfs4_set_callback_sessionid(struct nfs_client *clp) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -328,6 +365,9 @@ static int check_gss_callback_principal(struct nfs_client *clp, struct rpc_clnt *r = clp->cl_rpcclient; char *p = svc_gss_principal(rqstp); + /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ + if (clp->cl_minorversion != 0) + return SVC_DROP; /* * It might just be a normal user principal, in which case * userspace won't bother to tell us the name at all. @@ -345,6 +385,23 @@ static int check_gss_callback_principal(struct nfs_client *clp, return SVC_OK; } +/* pg_authenticate method helper */ +static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp) +{ + struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp); + int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 
1 : 0; + + dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc); + if (svc_is_backchannel(rqstp)) + /* Sessionid (usually) set after CB_NULL ping */ + return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid, + is_cb_compound); + else + /* No callback identifier in pg_authenticate */ + return nfs4_find_client_no_ident(svc_addr(rqstp)); +} + +/* pg_authenticate method for nfsv4 callback threads. */ static int nfs_callback_authenticate(struct svc_rqst *rqstp) { struct nfs_client *clp; @@ -352,7 +409,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) int ret = SVC_OK; /* Don't talk to strangers */ - clp = nfs_find_client(svc_addr(rqstp), 4); + clp = nfs_cb_find_client(rqstp); if (clp == NULL) return SVC_DROP; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 85a7cfd1b8d..d3b44f9bd74 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -34,10 +34,17 @@ enum nfs4_callback_opnum { OP_CB_ILLEGAL = 10044, }; +struct cb_process_state { + __be32 drc_status; + struct nfs_client *clp; + struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */ +}; + struct cb_compound_hdr_arg { unsigned int taglen; const char *tag; unsigned int minorversion; + unsigned int cb_ident; /* v4.0 callback identifier */ unsigned nops; }; @@ -103,14 +110,23 @@ struct cb_sequenceres { uint32_t csr_target_highestslotid; }; -extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, - struct cb_sequenceres *res); +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps); extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid); #define RCA4_TYPE_MASK_RDATA_DLG 0 #define RCA4_TYPE_MASK_WDATA_DLG 1 +#define RCA4_TYPE_MASK_DIR_DLG 2 +#define RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 +#define RCA4_TYPE_MASK_ALL 0xf31f struct cb_recallanyargs { struct sockaddr *craa_addr; @@ -118,25 +134,52 @@ struct cb_recallanyargs { uint32_t craa_type_mask; }; -extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, + void *dummy, + struct cb_process_state *cps); struct cb_recallslotargs { struct sockaddr *crsa_addr; uint32_t crsa_target_max_slots; }; -extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, - void *dummy); +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_layoutrecallargs { + struct sockaddr *cbl_addr; + uint32_t cbl_recall_type; + uint32_t cbl_layout_type; + uint32_t cbl_layoutchanged; + union { + struct { + struct nfs_fh cbl_fh; + struct pnfs_layout_range cbl_range; + nfs4_stateid cbl_stateid; + }; + struct nfs_fsid cbl_fsid; + }; +}; -#endif /* CONFIG_NFS_V4_1 */ +extern unsigned nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps); -extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); -extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); +extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); +extern void nfs4_cb_take_slot(struct nfs_client *clp); +#endif /* CONFIG_NFS_V4_1 */ +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps); +extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps); #ifdef CONFIG_NFS_V4 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); extern void nfs_callback_down(int minorversion); extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid); +extern int 
nfs4_set_callback_sessionid(struct nfs_client *clp); #endif /* CONFIG_NFS_V4 */ /* * nfs41: Callbacks are expected to not cause substantial latency, diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2950fca0c61..4bb91cb2620 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -12,30 +12,33 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "pnfs.h" #ifdef NFS_DEBUG #define NFSDBG_FACILITY NFSDBG_CALLBACK #endif - -__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps) { - struct nfs_client *clp; struct nfs_delegation *delegation; struct nfs_inode *nfsi; struct inode *inode; + res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ + goto out; + res->bitmap[0] = res->bitmap[1] = 0; res->status = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); - if (clp == NULL) - goto out; dprintk("NFS: GETATTR callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - inode = nfs_delegation_find_inode(clp, &args->fh); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); if (inode == NULL) - goto out_putclient; + goto out; nfsi = NFS_I(inode); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); @@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * out_iput: rcu_read_unlock(); iput(inode); -out_putclient: - nfs_put_client(clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); return res->status; } -__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; struct inode 
*inode; __be32 res; - res = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); - if (clp == NULL) + res = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ goto out; dprintk("NFS: RECALL callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - - do { - struct nfs_client *prev = clp; - - inode = nfs_delegation_find_inode(clp, &args->fh); - if (inode != NULL) { - /* Set up a helper thread to actually return the delegation */ - switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { - case 0: - res = 0; - break; - case -ENOENT: - if (res != 0) - res = htonl(NFS4ERR_BAD_STATEID); - break; - default: - res = htonl(NFS4ERR_RESOURCE); - } - iput(inode); - } - clp = nfs_find_client_next(prev); - nfs_put_client(prev); - } while (clp != NULL); + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + res = htonl(NFS4ERR_BADHANDLE); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + /* Set up a helper thread to actually return the delegation */ + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; + case -ENOENT: + if (res != 0) + res = htonl(NFS4ERR_BAD_STATEID); + break; + default: + res = htonl(NFS4ERR_RESOURCE); + } + iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); return res; @@ -113,6 +108,139 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf #if defined(CONFIG_NFS_V4_1) +static u32 initiate_file_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct pnfs_layout_hdr *lo; + struct inode *ino; + bool found = false; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + LIST_HEAD(free_me_list); + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + if (nfs_compare_fh(&args->cbl_fh, + &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + 
if (!ino) + continue; + found = true; + /* Without this, layout can be freed as soon + * as we release cl_lock. + */ + get_layout_hdr(lo); + break; + } + spin_unlock(&clp->cl_lock); + if (!found) + return NFS4ERR_NOMATCHING_LAYOUT; + + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + mark_matching_lsegs_invalid(lo, &free_me_list, + args->cbl_range.iomode)) + rv = NFS4ERR_DELAY; + else + rv = NFS4ERR_NOMATCHING_LAYOUT; + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + return rv; +} + +static u32 initiate_bulk_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct pnfs_layout_hdr *lo; + struct inode *ino; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + struct pnfs_layout_hdr *tmp; + LIST_HEAD(recall_list); + LIST_HEAD(free_me_list); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + if ((args->cbl_recall_type == RETURN_FSID) && + memcmp(&NFS_SERVER(lo->plh_inode)->fsid, + &args->cbl_fsid, sizeof(struct nfs_fsid))) + continue; + if (!igrab(lo->plh_inode)) + continue; + get_layout_hdr(lo); + BUG_ON(!list_empty(&lo->plh_bulk_recall)); + list_add(&lo->plh_bulk_recall, &recall_list); + } + spin_unlock(&clp->cl_lock); + list_for_each_entry_safe(lo, tmp, + &recall_list, plh_bulk_recall) { + ino = lo->plh_inode; + spin_lock(&ino->i_lock); + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) + rv = NFS4ERR_DELAY; + list_del_init(&lo->plh_bulk_recall); + spin_unlock(&ino->i_lock); + put_layout_hdr(lo); + iput(ino); + } + pnfs_free_lseg_list(&free_me_list); + return rv; +} + +static u32 do_callback_layoutrecall(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + u32 res = NFS4ERR_DELAY; + + 
dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); + if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) + goto out; + if (args->cbl_recall_type == RETURN_FILE) + res = initiate_file_draining(clp, args); + else + res = initiate_bulk_draining(clp, args); + clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); +out: + dprintk("%s returning %i\n", __func__, res); + return res; + +} + +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps) +{ + u32 res; + + dprintk("%s: -->\n", __func__); + + if (cps->clp) + res = do_callback_layoutrecall(cps->clp, args); + else + res = NFS4ERR_OP_NOT_IN_SESSION; + + dprintk("%s: exit with status = %d\n", __func__, res); + return cpu_to_be32(res); +} + +static void pnfs_recall_all_layouts(struct nfs_client *clp) +{ + struct cb_layoutrecallargs args; + + /* Pretend we got a CB_LAYOUTRECALL(ALL) */ + memset(&args, 0, sizeof(args)); + args.cbl_recall_type = RETURN_ALL; + /* FIXME we ignore errors, what should we do? */ + do_callback_layoutrecall(clp, &args); +} + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) { if (delegation == NULL) @@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) } /* - * Returns a pointer to a held 'struct nfs_client' that matches the server's - * address, major version number, and session ID. It is the caller's - * responsibility to release the returned reference. - * - * Returns NULL if there are no connections with sessions, or if no session - * matches the one of interest. 
- */ - static struct nfs_client *find_client_with_session( - const struct sockaddr *addr, u32 nfsversion, - struct nfs4_sessionid *sessionid) -{ - struct nfs_client *clp; - - clp = nfs_find_client(addr, 4); - if (clp == NULL) - return NULL; - - do { - struct nfs_client *prev = clp; - - if (clp->cl_session != NULL) { - if (memcmp(clp->cl_session->sess_id.data, - sessionid->data, - NFS4_MAX_SESSIONID_LEN) == 0) { - /* Returns a held reference to clp */ - return clp; - } - } - clp = nfs_find_client_next(prev); - nfs_put_client(prev); - } while (clp != NULL); - - return NULL; -} - -/* * For each referring call triple, check the session's slot table for * a match. If the slot is in use and the sequence numbers match, the * client is still waiting for a response to the original request. @@ -276,20 +368,34 @@ out: } __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, - struct cb_sequenceres *res) + struct cb_sequenceres *res, + struct cb_process_state *cps) { struct nfs_client *clp; int i; __be32 status; + cps->clp = NULL; + status = htonl(NFS4ERR_BADSESSION); - clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); + /* Incoming session must match the callback session */ + if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN)) + goto out; + + clp = nfs4_find_client_sessionid(args->csa_addr, + &args->csa_sessionid, 1); if (clp == NULL) goto out; + /* state manager is resetting the session */ + if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + status = NFS4ERR_DELAY; + goto out; + } + status = validate_seqid(&clp->cl_session->bc_slot_table, args); if (status) - goto out_putclient; + goto out; /* * Check for pending referring calls. 
If a match is found, a @@ -298,7 +404,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, */ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { status = htonl(NFS4ERR_DELAY); - goto out_putclient; + goto out; } memcpy(&res->csr_sessionid, &args->csa_sessionid, @@ -307,83 +413,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_slotid = args->csa_slotid; res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + nfs4_cb_take_slot(clp); + cps->clp = clp; /* put in nfs4_callback_compound */ -out_putclient: - nfs_put_client(clp); out: for (i = 0; i < args->csa_nrclists; i++) kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); - if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) - res->csr_status = 0; - else + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + cps->drc_status = status; + status = 0; + } else res->csr_status = status; + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, ntohl(status), ntohl(res->csr_status)); return status; } -__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) +static bool +validate_bitmap_values(unsigned long mask) +{ + return (mask & ~RCA4_TYPE_MASK_ALL) == 0; +} + +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; __be32 status; fmode_t flags = 0; - status = htonl(NFS4ERR_OP_NOT_IN_SESSION); - clp = nfs_find_client(args->craa_addr, 4); - if (clp == NULL) + status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ goto out; dprintk("NFS: RECALL_ANY callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + status = cpu_to_be32(NFS4ERR_INVAL); + if (!validate_bitmap_values(args->craa_type_mask)) + goto out; + status = cpu_to_be32(NFS4_OK); if 
(test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) &args->craa_type_mask)) flags = FMODE_READ; if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) &args->craa_type_mask)) flags |= FMODE_WRITE; - + if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) + &args->craa_type_mask)) + pnfs_recall_all_layouts(cps->clp); if (flags) - nfs_expire_all_delegation_types(clp, flags); - status = htonl(NFS4_OK); + nfs_expire_all_delegation_types(cps->clp, flags); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } /* Reduce the fore channel's max_slots to the target value */ -__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; struct nfs4_slot_table *fc_tbl; __be32 status; status = htonl(NFS4ERR_OP_NOT_IN_SESSION); - clp = nfs_find_client(args->crsa_addr, 4); - if (clp == NULL) + if (!cps->clp) /* set in cb_sequence */ goto out; dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), args->crsa_target_max_slots); - fc_tbl = &clp->cl_session->fc_slot_table; + fc_tbl = &cps->clp->cl_session->fc_slot_table; status = htonl(NFS4ERR_BAD_HIGH_SLOT); if (args->crsa_target_max_slots > fc_tbl->max_slots || args->crsa_target_max_slots < 1) - goto out_putclient; + goto out; status = htonl(NFS4_OK); if (args->crsa_target_max_slots == fc_tbl->max_slots) - goto out_putclient; + goto out; fc_tbl->target_max_slots = args->crsa_target_max_slots; - nfs41_handle_recall_slot(clp); -out_putclient: - nfs_put_client(clp); /* balance nfs_find_client */ + nfs41_handle_recall_slot(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 
05af212f0ed..23112c263f8 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -10,8 +10,10 @@ #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/slab.h> +#include <linux/sunrpc/bc_xprt.h> #include "nfs4_fs.h" #include "callback.h" +#include "internal.h" #define CB_OP_TAGLEN_MAXSZ (512) #define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) @@ -22,6 +24,7 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #if defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 4 + 1 + 3) #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -33,7 +36,8 @@ /* Internal error code */ #define NFS4ERR_RESOURCE_HDR 11050 -typedef __be32 (*callback_process_op_t)(void *, void *); +typedef __be32 (*callback_process_op_t)(void *, void *, + struct cb_process_state *); typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); @@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound hdr->minorversion = ntohl(*p++); /* Check minor version is zero or one. 
*/ if (hdr->minorversion <= 1) { - p++; /* skip callback_ident */ + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ } else { printk(KERN_WARNING "%s: NFSv4 server callback with " "illegal minor version %u!\n", @@ -220,6 +224,66 @@ out: #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_layoutrecallargs *args) +{ + __be32 *p; + __be32 status = 0; + uint32_t iomode; + + args->cbl_addr = svc_addr(rqstp); + p = read_buf(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->cbl_layout_type = ntohl(*p++); + /* Depite the spec's xdr, iomode really belongs in the FILE switch, + * as it is unuseable and ignored with the other types. + */ + iomode = ntohl(*p++); + args->cbl_layoutchanged = ntohl(*p++); + args->cbl_recall_type = ntohl(*p++); + + if (args->cbl_recall_type == RETURN_FILE) { + args->cbl_range.iomode = iomode; + status = decode_fh(xdr, &args->cbl_fh); + if (unlikely(status != 0)) + goto out; + + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_range.offset); + p = xdr_decode_hyper(p, &args->cbl_range.length); + status = decode_stateid(xdr, &args->cbl_stateid); + if (unlikely(status != 0)) + goto out; + } else if (args->cbl_recall_type == RETURN_FSID) { + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_fsid.major); + p = xdr_decode_hyper(p, &args->cbl_fsid.minor); + } else if (args->cbl_recall_type != RETURN_ALL) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", + __func__, + args->cbl_layout_type, iomode, + args->cbl_layoutchanged, args->cbl_recall_type); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return 
status; +} + static __be32 decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) { @@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_SEQUENCE: case OP_CB_RECALL_ANY: case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: *op = &callback_ops[op_nr]; break; - case OP_CB_LAYOUTRECALL: case OP_CB_NOTIFY_DEVICEID: case OP_CB_NOTIFY: case OP_CB_PUSH_DELEG: @@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS_OK); } +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* + * Let the state manager know callback processing done. + * A single slot, so highest used slotid is either 0 or -1 + */ + tbl->highest_used_slotid--; + nfs4_check_drain_bc_complete(session); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct nfs_client *clp) +{ + if (clp && clp->cl_session) + nfs4_callback_free_slot(clp->cl_session); +} + +/* A single slot, so highest used slotid is either 0 or -1 */ +void nfs4_cb_take_slot(struct nfs_client *clp) +{ + struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + tbl->highest_used_slotid++; + BUG_ON(tbl->highest_used_slotid != 0); + spin_unlock(&tbl->slot_tbl_lock); +} + #else /* CONFIG_NFS_V4_1 */ static __be32 @@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } +static void nfs4_cb_free_slot(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_V4_1 */ static __be32 @@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) static __be32 process_op(uint32_t minorversion, int nop, struct svc_rqst *rqstp, struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp, int* drc_status) + struct xdr_stream *xdr_out, void *resp, + 
struct cb_process_state *cps) { struct callback_op *op = &callback_ops[0]; unsigned int op_nr; @@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop, if (status) goto encode_hdr; - if (*drc_status) { - status = *drc_status; + if (cps->drc_status) { + status = cps->drc_status; goto encode_hdr; } @@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop, if (maxlen > 0 && maxlen < PAGE_SIZE) { status = op->decode_args(rqstp, xdr_in, argp); if (likely(status == 0)) - status = op->process_op(argp, resp); + status = op->process_op(argp, resp, cps); } else status = htonl(NFS4ERR_RESOURCE); - /* Only set by OP_CB_SEQUENCE processing */ - if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { - *drc_status = status; - status = 0; - } - encode_hdr: res = encode_op_hdr(xdr_out, op_nr, status); if (unlikely(res)) @@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_compound_hdr_arg hdr_arg = { 0 }; struct cb_compound_hdr_res hdr_res = { NULL }; struct xdr_stream xdr_in, xdr_out; - __be32 *p; - __be32 status, drc_status = 0; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, + .clp = NULL, + }; unsigned int nops = 0; dprintk("%s: start\n", __func__); @@ -696,6 +792,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (status == __constant_htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; + if (hdr_arg.minorversion == 0) { + cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); + if (!cps.clp) + return rpc_drop_reply; + } else + cps.svc_sid = bc_xprt_sid(rqstp); + hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) @@ -703,7 +806,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r while (status == 0 && nops != hdr_arg.nops) { status = process_op(hdr_arg.minorversion, nops, rqstp, - &xdr_in, argp, &xdr_out, resp, &drc_status); + 
&xdr_in, argp, &xdr_out, resp, &cps); nops++; } @@ -716,6 +819,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); + nfs4_cb_free_slot(cps.clp); + nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; } @@ -739,6 +844,12 @@ static struct callback_op callback_ops[] = { .res_maxsize = CB_OP_RECALL_RES_MAXSZ, }, #if defined(CONFIG_NFS_V4_1) + [OP_CB_LAYOUTRECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, + .decode_args = + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, [OP_CB_SEQUENCE] = { .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0870d0d4efc..192f2f86026 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -56,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock); static LIST_HEAD(nfs_client_list); static LIST_HEAD(nfs_volume_list); static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +#ifdef CONFIG_NFS_V4 +static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */ + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; +retry: + if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) + return -ENOMEM; + spin_lock(&nfs_client_lock); + ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); + spin_unlock(&nfs_client_lock); + if (ret == -EAGAIN) + goto retry; + return ret; +} +#endif /* CONFIG_NFS_V4 */ /* * RPC cruft for NFS @@ -144,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_proto = cl_init->proto; #ifdef 
CONFIG_NFS_V4 - INIT_LIST_HEAD(&clp->cl_delegations); + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error_cleanup; + spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); @@ -170,21 +197,17 @@ error_0: } #ifdef CONFIG_NFS_V4 -/* - * Clears/puts all minor version specific parts from an nfs_client struct - * reverting it to minorversion 0. - */ -static void nfs4_clear_client_minor_version(struct nfs_client *clp) -{ #ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(clp)) { +static void nfs4_shutdown_session(struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) nfs4_destroy_session(clp->cl_session); - clp->cl_session = NULL; - } - - clp->cl_mvops = nfs_v4_minor_ops[0]; -#endif /* CONFIG_NFS_V4_1 */ } +#else /* CONFIG_NFS_V4_1 */ +static void nfs4_shutdown_session(struct nfs_client *clp) +{ +} +#endif /* CONFIG_NFS_V4_1 */ /* * Destroy the NFS4 callback service @@ -199,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) nfs4_kill_renewd(clp); - nfs4_clear_client_minor_version(clp); + nfs4_shutdown_session(clp); nfs4_destroy_callback(clp); if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) nfs_idmap_delete(clp); rpc_destroy_wait_queue(&clp->cl_rpcwaitq); } + +/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ +void nfs_cleanup_cb_ident_idr(void) +{ + idr_destroy(&cb_ident_idr); +} + +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ + if (clp->cl_cb_ident) + idr_remove(&cb_ident_idr, clp->cl_cb_ident); +} + +static void pnfs_init_server(struct nfs_server *server) +{ + rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +} + #else static void nfs4_shutdown_client(struct nfs_client *clp) { } + +void nfs_cleanup_cb_ident_idr(void) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ 
+} + +static void pnfs_init_server(struct nfs_server *server) +{ +} + #endif /* CONFIG_NFS_V4 */ /* @@ -248,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp) if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { list_del(&clp->cl_share_link); + nfs_cb_idr_remove_locked(clp); spin_unlock(&nfs_client_lock); BUG_ON(!list_empty(&clp->cl_superblocks)); @@ -363,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, return 0; } -/* - * Find a client by IP address and protocol version - * - returns NULL if no such client - */ -struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) -{ - struct nfs_client *clp; - - spin_lock(&nfs_client_lock); - list_for_each_entry(clp, &nfs_client_list, cl_share_link) { - struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - - /* Don't match clients that failed to initialise properly */ - if (!(clp->cl_cons_state == NFS_CS_READY || - clp->cl_cons_state == NFS_CS_SESSION_INITING)) - continue; - - /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops->version != nfsversion) - continue; - - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(addr, clap)) - continue; - - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; - } - spin_unlock(&nfs_client_lock); - return NULL; -} - -/* - * Find a client by IP address and protocol version - * - returns NULL if no such client - */ -struct nfs_client *nfs_find_client_next(struct nfs_client *clp) +/* Common match routine for v4.0 and v4.1 callback services */ +bool +nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, + u32 minorversion) { - struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; - u32 nfsvers = clp->rpc_ops->version; + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - spin_lock(&nfs_client_lock); - list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { - struct sockaddr *clap = (struct sockaddr 
*)&clp->cl_addr; + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; - /* Don't match clients that failed to initialise properly */ - if (clp->cl_cons_state != NFS_CS_READY) - continue; + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; - /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops->version != nfsvers) - continue; + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(sap, clap)) - continue; - - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; - } - spin_unlock(&nfs_client_lock); - return NULL; + return true; } /* @@ -988,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve target->options = source->options; } +static void nfs_server_insert_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&nfs_client_lock); + list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + spin_unlock(&nfs_client_lock); + +} + +static void nfs_server_remove_lists(struct nfs_server *server) +{ + spin_lock(&nfs_client_lock); + list_del_rcu(&server->client_link); + list_del(&server->master_link); + spin_unlock(&nfs_client_lock); + + synchronize_rcu(); +} + /* * Allocate and initialise a server record */ @@ -1004,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void) /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); + INIT_LIST_HEAD(&server->delegations); atomic_set(&server->active, 0); @@ -1019,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void) return NULL; } + 
pnfs_init_server(server); + return server; } @@ -1029,11 +1067,8 @@ void nfs_free_server(struct nfs_server *server) { dprintk("--> nfs_free_server()\n"); + nfs_server_remove_lists(server); unset_pnfs_layoutdriver(server); - spin_lock(&nfs_client_lock); - list_del(&server->client_link); - list_del(&server->master_link); - spin_unlock(&nfs_client_lock); if (server->destroy != NULL) server->destroy(server); @@ -1108,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - + nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr); return server; @@ -1125,6 +1156,101 @@ error: #ifdef CONFIG_NFS_V4 /* + * NFSv4.0 callback thread helper + * + * Find a client by IP address, protocol version, and minorversion + * + * Called from the pg_authenticate method. The callback identifier + * is not used as it has not been decoded. 
+ * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_no_ident(const struct sockaddr *addr) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 0) == false) + continue; + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(int cb_ident) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + clp = idr_find(&cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service + * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL + * can arrive before the callback sessionid is set. For CB_NULL calls, + * find a client by IP address protocol version, and minorversion. 
+ * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid, int is_cb_compound) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 1) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid unless cb_null call*/ + if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0)) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid, int is_cb_compound) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* * Initialize the NFS4 callback service */ static int nfs4_init_callback(struct nfs_client *clp) @@ -1342,11 +1468,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - + nfs_server_insert_lists(server); server->mount_time = jiffies; out: nfs_free_fattr(fattr); @@ -1551,11 +1673,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (error < 0) goto out_free_server; - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - + nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr_fsinfo); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 1fd62fc49be..364e4328f39 100644 
--- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -40,11 +40,23 @@ static void nfs_free_delegation(struct nfs_delegation *delegation) call_rcu(&delegation->rcu, nfs_free_delegation_callback); } +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation: delegation to process + * + */ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) { set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +/** + * nfs_have_delegation - check if inode has a delegation + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. + */ int nfs_have_delegation(struct inode *inode, fmode_t flags) { struct nfs_delegation *delegation; @@ -119,10 +131,15 @@ again: return 0; } -/* - * Set up a delegation on an inode +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + * */ -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + struct nfs_openres *res) { struct nfs_delegation *delegation; struct rpc_cred *oldcred = NULL; @@ -175,38 +192,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation return inode; } -static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, - const nfs4_stateid *stateid, - struct nfs_client *clp) +static struct nfs_delegation * +nfs_detach_delegation_locked(struct nfs_inode *nfsi, + struct nfs_server *server) { struct nfs_delegation *delegation = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&clp->cl_lock)); + lockdep_is_held(&server->nfs_client->cl_lock)); if (delegation == NULL) goto nomatch; + spin_lock(&delegation->lock); - if (stateid != NULL && memcmp(delegation->stateid.data, 
stateid->data, - sizeof(delegation->stateid.data)) != 0) - goto nomatch_unlock; list_del_rcu(&delegation->super_list); delegation->inode = NULL; nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; -nomatch_unlock: - spin_unlock(&delegation->lock); nomatch: return NULL; } -/* - * Set up a delegation on an inode +static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, + struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(nfsi, server); + spin_unlock(&clp->cl_lock); + return delegation; +} + +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new delegation state from server + * + * Returns zero on success, or a negative errno value. */ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation, *old_delegation; struct nfs_delegation *freeme = NULL; @@ -227,7 +258,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct spin_lock(&clp->cl_lock); old_delegation = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&clp->cl_lock)); + lockdep_is_held(&clp->cl_lock)); if (old_delegation != NULL) { if (memcmp(&delegation->stateid, &old_delegation->stateid, sizeof(old_delegation->stateid)) == 0 && @@ -246,9 +277,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); + freeme = 
nfs_detach_delegation_locked(nfsi, server); } - list_add_rcu(&delegation->super_list, &clp->cl_delegations); + list_add_rcu(&delegation->super_list, &server->delegations); nfsi->delegation_state = delegation->type; rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -290,73 +321,85 @@ out: return err; } -/* - * Return all delegations that have been marked for return +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Returns zero on success, or a negative errno value. */ int nfs_client_return_marked_delegations(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct nfs_server *server; struct inode *inode; int err = 0; restart: rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) - continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation != NULL) { - filemap_flush(inode->i_mapping); - err = __nfs_inode_return_delegation(inode, delegation, 0); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (!test_and_clear_bit(NFS_DELEGATION_RETURN, + &delegation->flags)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) { + filemap_flush(inode->i_mapping); + err = __nfs_inode_return_delegation(inode, + delegation, 0); + } + iput(inode); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; } - iput(inode); - if (!err) - goto restart; - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 
- return err; } rcu_read_unlock(); return 0; } -/* - * This function returns the delegation without reclaiming opens - * or protecting against delegation reclaims. - * It is therefore really only safe to be called from - * nfs4_clear_inode() +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode(). */ void nfs_inode_return_delegation_noreclaim(struct inode *inode) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; if (rcu_access_pointer(nfsi->delegation) != NULL) { - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); - spin_unlock(&clp->cl_lock); + delegation = nfs_detach_delegation(nfsi, server); if (delegation != NULL) nfs_do_return_delegation(inode, delegation, 0); } } +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * Returns zero on success, or a negative errno value. 
+ */ int nfs_inode_return_delegation(struct inode *inode) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int err = 0; if (rcu_access_pointer(nfsi->delegation) != NULL) { - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); - spin_unlock(&clp->cl_lock); + delegation = nfs_detach_delegation(nfsi, server); if (delegation != NULL) { nfs_wb_all(inode); err = __nfs_inode_return_delegation(inode, delegation, 1); @@ -365,46 +408,61 @@ int nfs_inode_return_delegation(struct inode *inode) return err; } -static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) +static void nfs_mark_return_delegation(struct nfs_delegation *delegation) { + struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client; + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } -/* - * Return all delegations associated to a super block +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * */ void nfs_super_return_all_delegations(struct super_block *sb) { - struct nfs_client *clp = NFS_SB(sb)->nfs_client; + struct nfs_server *server = NFS_SB(sb); + struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; if (clp == NULL) return; + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); - if (delegation->inode != NULL && delegation->inode->i_sb == sb) - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); spin_unlock(&delegation->lock); } rcu_read_unlock(); + if (nfs_client_return_marked_delegations(clp) != 0) nfs4_schedule_state_manager(clp); } -static -void 
nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) +static void nfs_mark_return_all_delegation_types(struct nfs_server *server, + fmode_t flags) { struct nfs_delegation *delegation; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); } +} + +static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, + fmode_t flags) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_all_delegation_types(server, flags); rcu_read_unlock(); } @@ -419,19 +477,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } +/** + * nfs_expire_all_delegation_types + * @clp: client to process + * @flags: delegation types to expire + * + */ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) { nfs_client_mark_return_all_delegation_types(clp, flags); nfs_delegation_run_state_manager(clp); } +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ void nfs_expire_all_delegations(struct nfs_client *clp) { nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); } -/* - * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 
+/** + * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * */ void nfs_handle_cb_pathdown(struct nfs_client *clp) { @@ -440,29 +511,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp) nfs_client_mark_return_all_delegations(clp); } -static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); } - rcu_read_unlock(); } +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) { - nfs_client_mark_return_unreferenced_delegations(clp); + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unreferenced_delegations(server); + rcu_read_unlock(); + nfs_delegation_run_state_manager(clp); } -/* - * Asynchronous delegation recall! +/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information from CB_RECALL arguments + * + * Returns zero on success, or a negative errno value. 
*/ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) +int nfs_async_inode_return_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; @@ -474,22 +559,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s rcu_read_unlock(); return -ENOENT; } - - nfs_mark_return_delegation(clp, delegation); + nfs_mark_return_delegation(delegation); rcu_read_unlock(); + nfs_delegation_run_state_manager(clp); return 0; } -/* - * Retrieve the inode associated with a delegation - */ -struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle) +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, + const struct nfs_fh *fhandle) { struct nfs_delegation *delegation; struct inode *res = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); if (delegation->inode != NULL && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { @@ -499,49 +583,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs if (res != NULL) break; } + return res; +} + +/** + * nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. 
+ */ +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, + const struct nfs_fh *fhandle) +{ + struct nfs_server *server; + struct inode *res = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + res = nfs_delegation_find_inode_server(server, fhandle); + if (res != NULL) + break; + } rcu_read_unlock(); return res; } -/* - * Mark all delegations as needing to be reclaimed +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + * */ void nfs_delegation_mark_reclaim(struct nfs_client *clp) { - struct nfs_delegation *delegation; + struct nfs_server *server; + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) - set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_reclaim_server(server); rcu_read_unlock(); } -/* - * Reap all unclaimed delegations after reboot recovery is done +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * */ void nfs_delegation_reap_unclaimed(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct nfs_server *server; struct inode *inode; + restart: rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) - continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if 
(delegation != NULL) - nfs_free_delegation(delegation); - iput(inode); - goto restart; + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) + nfs_free_delegation(delegation); + iput(inode); + goto restart; + } } rcu_read_unlock(); } +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ + struct nfs_server *server; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (!list_empty(&server->delegations)) { + ret = 1; + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * + * Returns one and fills in "dst->data" * if inode had a delegation, + * otherwise zero is returned. 
+ */ int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 2026304bda1..d9322e490c5 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); void nfs_handle_cb_pathdown(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 996dd8989a9..2c3eb33b904 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,8 +33,8 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/sched.h> -#include <linux/vmalloc.h> #include <linux/kmemleak.h> +#include <linux/xattr.h> #include "delegation.h" #include "iostat.h" @@ -125,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .getxattr = nfs4_getxattr, - .setxattr = nfs4_setxattr, - .listxattr = nfs4_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, }; #endif /* CONFIG_NFS_V4 */ @@ -172,7 +173,7 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; -typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); typedef struct { struct file *file; struct page *page; @@ -378,14 +379,14 @@ error: return error; } -/* Fill in an entry based on the xdr code stored in desc->page */ -static -int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct 
xdr_stream *stream) +static int xdr_decode(nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct xdr_stream *xdr) { - __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus); - if (IS_ERR(p)) - return PTR_ERR(p); + int error; + error = desc->decode(xdr, entry, desc->plus); + if (error) + return error; entry->fattr->time_start = desc->timestamp; entry->fattr->gencount = desc->gencount; return 0; @@ -438,7 +439,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) if (dentry == NULL) return; - dentry->d_op = NFS_PROTO(dir)->dentry_ops; inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); if (IS_ERR(inode)) goto out; @@ -459,25 +459,26 @@ out: /* Perform conversion from xdr to cache array */ static int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - void *xdr_page, struct page *page, unsigned int buflen) + struct page **xdr_pages, struct page *page, unsigned int buflen) { struct xdr_stream stream; - struct xdr_buf buf; - __be32 *ptr = xdr_page; + struct xdr_buf buf = { + .pages = xdr_pages, + .page_len = buflen, + .buflen = buflen, + .len = buflen, + }; + struct page *scratch; struct nfs_cache_array *array; unsigned int count = 0; int status; - buf.head->iov_base = xdr_page; - buf.head->iov_len = buflen; - buf.tail->iov_len = 0; - buf.page_base = 0; - buf.page_len = 0; - buf.buflen = buf.head->iov_len; - buf.len = buf.head->iov_len; - - xdr_init_decode(&stream, &buf, ptr); + scratch = alloc_page(GFP_KERNEL); + if (scratch == NULL) + return -ENOMEM; + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); do { status = xdr_decode(desc, entry, &stream); @@ -506,6 +507,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en } else status = PTR_ERR(array); } + + put_page(scratch); return status; } @@ -521,7 +524,6 @@ static void nfs_readdir_free_large_page(void *ptr, struct 
page **pages, unsigned int npages) { - vm_unmap_ram(ptr, npages); nfs_readdir_free_pagearray(pages, npages); } @@ -530,9 +532,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages, * to nfs_readdir_free_large_page */ static -void *nfs_readdir_large_page(struct page **pages, unsigned int npages) +int nfs_readdir_large_page(struct page **pages, unsigned int npages) { - void *ptr; unsigned int i; for (i = 0; i < npages; i++) { @@ -541,13 +542,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages) goto out_freepages; pages[i] = page; } + return 0; - ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL); - if (!IS_ERR_OR_NULL(ptr)) - return ptr; out_freepages: nfs_readdir_free_pagearray(pages, i); - return NULL; + return -ENOMEM; } static @@ -566,6 +565,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, entry.eof = 0; entry.fh = nfs_alloc_fhandle(); entry.fattr = nfs_alloc_fattr(); + entry.server = NFS_SERVER(inode); if (entry.fh == NULL || entry.fattr == NULL) goto out; @@ -577,8 +577,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; - pages_ptr = nfs_readdir_large_page(pages, array_size); - if (!pages_ptr) + status = nfs_readdir_large_page(pages, array_size); + if (status < 0) goto out_release_array; do { unsigned int pglen; @@ -587,7 +587,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (status < 0) break; pglen = status; - status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); if (status < 0) { if (status == -ENOSPC) status = 0; @@ -938,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) * component of the path. * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. 
*/ -static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) +static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, + unsigned int mask) { if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) return 0; @@ -969,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) { struct nfs_server *server = NFS_SERVER(inode); - if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) + if (IS_AUTOMOUNT(inode)) return 0; if (nd != NULL) { /* VFS wants an on-the-wire revalidation */ @@ -1018,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. */ -static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) +static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir; struct inode *inode; @@ -1027,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) struct nfs_fattr *fattr = NULL; int error; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + parent = dget_parent(dentry); dir = parent->d_inode; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); @@ -1117,7 +1121,7 @@ out_error: /* * This is called from dput() when d_count is going to 0. 
*/ -static int nfs_dentry_delete(struct dentry *dentry) +static int nfs_dentry_delete(const struct dentry *dentry) { dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -1169,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, }; static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) @@ -1188,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru if (dentry->d_name.len > NFS_SERVER(dir)->namelen) goto out; - dentry->d_op = NFS_PROTO(dir)->dentry_ops; - /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. @@ -1217,7 +1220,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru goto out_unblock_sillyrename; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr); - res = (struct dentry *)inode; + res = ERR_CAST(inode); if (IS_ERR(res)) goto out_unblock_sillyrename; @@ -1244,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs_open_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, }; /* @@ -1333,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = ERR_PTR(-ENAMETOOLONG); goto out; } - dentry->d_op = NFS_PROTO(dir)->dentry_ops; /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash * the dentry. 
*/ @@ -1351,8 +1354,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; attr.ia_valid = ATTR_MODE; - if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current_umask(); + attr.ia_mode &= ~current_umask(); } else { open_flags &= ~(O_EXCL | O_CREAT); attr.ia_valid = 0; @@ -1406,11 +1408,15 @@ no_open: static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) { struct dentry *parent = NULL; - struct inode *inode = dentry->d_inode; + struct inode *inode; struct inode *dir; struct nfs_open_context *ctx; int openflags, ret = 0; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; if (!is_atomic_open(nd) || d_mountpoint(dentry)) goto no_open; @@ -1579,6 +1585,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, { struct iattr attr; int error; + int open_flags = 0; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1586,7 +1593,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); + if ((nd->flags & LOOKUP_CREATE) != 0) + open_flags = nd->intent.open.flags; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); if (error != 0) goto out_err; return 0; @@ -1718,11 +1728,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) > 1) { + if (dentry->d_count > 1) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); error = nfs_sillyrename(dir, dentry); @@ -1733,7 +1741,6 @@ static int nfs_unlink(struct inode *dir, struct dentry 
*dentry) need_rehash = 1; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); error = nfs_safe_remove(dentry); if (!error || error == -ENOENT) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); @@ -1868,7 +1875,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name, - atomic_read(&new_dentry->d_count)); + new_dentry->d_count); /* * For non-directories, check whether the target is busy and if so, @@ -1886,7 +1893,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, rehash = new_dentry; } - if (atomic_read(&new_dentry->d_count) > 2) { + if (new_dentry->d_count > 2) { int err; /* copy the target dentry's name */ @@ -2188,11 +2195,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } -int nfs_permission(struct inode *inode, int mask) +int nfs_permission(struct inode *inode, int mask, unsigned int flags) { struct rpc_cred *cred; int res = 0; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + nfs_inc_stats(inode, NFSIOS_VFSACCESS); if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) @@ -2240,7 +2250,7 @@ out: out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) - res = generic_permission(inode, mask, NULL); + res = generic_permission(inode, mask, flags, NULL); goto out; } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index ac7b814ce16..b5ffe8fa291 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -63,9 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. 
*/ - spin_lock(&dcache_lock); + spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&sb->s_root->d_lock); list_del_init(&sb->s_root->d_alias); - spin_unlock(&dcache_lock); + spin_unlock(&sb->s_root->d_lock); + spin_unlock(&sb->s_root->d_inode->i_lock); } return 0; } @@ -117,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) } security_d_instantiate(ret, inode); - - if (ret->d_op == NULL) - ret->d_op = server->nfs_client->rpc_ops->dentry_ops; out: nfs_free_fattr(fsinfo.fattr); return ret; @@ -225,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) security_d_instantiate(ret, inode); - if (ret->d_op == NULL) - ret->d_op = server->nfs_client->rpc_ops->dentry_ops; - out: nfs_free_fattr(fattr); dprintk("<-- nfs4_get_root()\n"); diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 4e2d9b6b138..18696882f1c 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -238,7 +238,7 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu return nfs_idmap_lookup_name(gid, "group", buf, buflen); } -#else /* CONFIG_NFS_USE_IDMAPPER not defined */ +#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ #include <linux/module.h> #include <linux/mutex.h> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e67e31c7341..d8512423ba7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else inode->i_op = &nfs_mountpoint_inode_operations; inode->i_fop = NULL; - set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); + inode->i_flags |= S_AUTOMOUNT; } } else if (S_ISLNK(inode->i_mode)) inode->i_op = &nfs_symlink_inode_operations; @@ -1208,7 +1208,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Update the fsid? 
*/ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && - !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) + !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; /* @@ -1410,9 +1410,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ void nfs4_evict_inode(struct inode *inode) { + pnfs_destroy_layout(NFS_I(inode)); truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); - pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ nfs_inode_return_delegation_noreclaim(inode); /* First call standard NFS clear_inode() code */ @@ -1438,11 +1438,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return &nfsi->vfs_inode; } -void nfs_destroy_inode(struct inode *inode) +static void nfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} + static inline void nfs4_init_once(struct nfs_inode *nfsi) { #ifdef CONFIG_NFS_V4 @@ -1612,6 +1619,7 @@ static void __exit exit_nfs_fs(void) #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif + nfs_cleanup_cb_ident_idr(); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e6356b750b7..4644f04b4b4 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -128,9 +128,13 @@ extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern struct rpc_program nfs_program; +extern void nfs_cleanup_cb_ident_idr(void); extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); -extern struct nfs_client *nfs_find_client_next(struct nfs_client *); +extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); +extern struct nfs_client 
*nfs4_find_client_ident(int); +extern struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *, + int); extern struct nfs_server *nfs_create_server( const struct nfs_parsed_mount_data *, struct nfs_fh *); @@ -185,17 +189,20 @@ extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); /* nfs2xdr.c */ -extern int nfs_stat_to_errno(int); +extern int nfs_stat_to_errno(enum nfs_stat); extern struct rpc_procinfo nfs_procedures[]; -extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs2_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); /* nfs3xdr.c */ extern struct rpc_procinfo nfs3_procedures[]; -extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs3_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); /* nfs4xdr.c */ #ifdef CONFIG_NFS_V4 -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs4_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; @@ -245,6 +252,7 @@ extern char *nfs_path(const char *base, const struct dentry *droot, const struct dentry *dentry, char *buffer, ssize_t buflen); +extern struct vfsmount *nfs_d_automount(struct path *path); /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 4f981f1f668..d4c2d6b7507 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -236,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info) .authflavor = RPC_AUTH_UNIX, .flags = RPC_CLNT_CREATE_NOPING, }; - struct mountres result; struct rpc_message msg = { .rpc_argp = info->dirpath, - .rpc_resp = &result, }; struct rpc_clnt *clnt; int status; @@ -248,7 +246,7 @@ void nfs_umount(const struct 
nfs_mount_request *info) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; clnt = rpc_create(&args); - if (unlikely(IS_ERR(clnt))) + if (IS_ERR(clnt)) goto out_clnt_err; dprintk("NFS: sending UMNT request for %s:%s\n", @@ -280,29 +278,20 @@ out_call_err: * XDR encode/decode functions for MOUNT */ -static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) +static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) { const u32 pathname_len = strlen(pathname); __be32 *p; - if (unlikely(pathname_len > MNTPATHLEN)) - return -EIO; - - p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len); - if (unlikely(p == NULL)) - return -EIO; + BUG_ON(pathname_len > MNTPATHLEN); + p = xdr_reserve_space(xdr, 4 + pathname_len); xdr_encode_opaque(p, pathname, pathname_len); - - return 0; } -static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p, - const char *dirpath) +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, + const char *dirpath) { - struct xdr_stream xdr; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - return encode_mntdirpath(&xdr, dirpath); + encode_mntdirpath(xdr, dirpath); } /* @@ -320,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res) u32 status; __be32 *p; - p = xdr_inline_decode(xdr, sizeof(status)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - status = ntohl(*p); + status = be32_to_cpup(p); for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { if (mnt_errtbl[i].status == status) { @@ -351,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res) return 0; } -static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p, - struct mountres *res) +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) { - struct xdr_stream xdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - - status = decode_status(&xdr, res); + status = decode_status(xdr, res); if (unlikely(status != 
0 || res->errno != 0)) return status; - return decode_fhandle(&xdr, res); + return decode_fhandle(xdr, res); } static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) @@ -371,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) u32 status; __be32 *p; - p = xdr_inline_decode(xdr, sizeof(status)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - status = ntohl(*p); + status = be32_to_cpup(p); for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { if (mnt3_errtbl[i].status == status) { @@ -394,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res) u32 size; __be32 *p; - p = xdr_inline_decode(xdr, sizeof(size)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - size = ntohl(*p++); + size = be32_to_cpup(p); if (size > NFS3_FHSIZE || size == 0) return -EIO; @@ -421,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) if (*count == 0) return 0; - p = xdr_inline_decode(xdr, sizeof(entries)); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return -EIO; - entries = ntohl(*p); + entries = be32_to_cpup(p); dprintk("NFS: received %u auth flavors\n", entries); if (entries > NFS_MAX_SECFLAVORS) entries = NFS_MAX_SECFLAVORS; - p = xdr_inline_decode(xdr, sizeof(u32) * entries); + p = xdr_inline_decode(xdr, 4 * entries); if (unlikely(p == NULL)) return -EIO; @@ -437,7 +424,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) entries = *count; for (i = 0; i < entries; i++) { - flavors[i] = ntohl(*p++); + flavors[i] = be32_to_cpup(p++); dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); } *count = i; @@ -445,30 +432,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) return 0; } -static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p, - struct mountres *res) +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, + struct xdr_stream *xdr, + 
struct mountres *res) { - struct xdr_stream xdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - - status = decode_fhs_status(&xdr, res); + status = decode_fhs_status(xdr, res); if (unlikely(status != 0 || res->errno != 0)) return status; - status = decode_fhandle3(&xdr, res); + status = decode_fhandle3(xdr, res); if (unlikely(status != 0)) { res->errno = -EBADHANDLE; return 0; } - return decode_auth_flavors(&xdr, res); + return decode_auth_flavors(xdr, res); } static struct rpc_procinfo mnt_procedures[] = { [MOUNTPROC_MNT] = { .p_proc = MOUNTPROC_MNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, - .p_decode = (kxdrproc_t)mnt_dec_mountres, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres, .p_arglen = MNT_enc_dirpath_sz, .p_replen = MNT_dec_mountres_sz, .p_statidx = MOUNTPROC_MNT, @@ -476,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = { }, [MOUNTPROC_UMNT] = { .p_proc = MOUNTPROC_UMNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, .p_arglen = MNT_enc_dirpath_sz, .p_statidx = MOUNTPROC_UMNT, .p_name = "UMOUNT", @@ -486,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = { static struct rpc_procinfo mnt3_procedures[] = { [MOUNTPROC3_MNT] = { .p_proc = MOUNTPROC3_MNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, - .p_decode = (kxdrproc_t)mnt_dec_mountres3, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3, .p_arglen = MNT_enc_dirpath_sz, .p_replen = MNT_dec_mountres3_sz, .p_statidx = MOUNTPROC3_MNT, @@ -495,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = { }, [MOUNTPROC3_UMNT] = { .p_proc = MOUNTPROC3_UMNT, - .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, .p_arglen = MNT_enc_dirpath_sz, .p_statidx = MOUNTPROC3_UMNT, .p_name = "UMOUNT", diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index db6aa3673cf..f32b8603dca 100644 --- a/fs/nfs/namespace.c +++ 
b/fs/nfs/namespace.c @@ -49,12 +49,17 @@ char *nfs_path(const char *base, const struct dentry *dentry, char *buffer, ssize_t buflen) { - char *end = buffer+buflen; + char *end; int namelen; + unsigned seq; +rename_retry: + end = buffer+buflen; *--end = '\0'; buflen--; - spin_lock(&dcache_lock); + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); while (!IS_ROOT(dentry) && dentry != droot) { namelen = dentry->d_name.len; buflen -= namelen + 1; @@ -65,7 +70,9 @@ char *nfs_path(const char *base, *--end = '/'; dentry = dentry->d_parent; } - spin_unlock(&dcache_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; if (*end != '/') { if (--buflen < 0) goto Elong; @@ -82,15 +89,16 @@ char *nfs_path(const char *base, memcpy(end, base, namelen); return end; Elong_unlock: - spin_unlock(&dcache_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; Elong: return ERR_PTR(-ENAMETOOLONG); } /* - * nfs_follow_mountpoint - handle crossing a mountpoint on the server - * @dentry - dentry of mountpoint - * @nd - nameidata info + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint * * When we encounter a mountpoint on the server, we want to set up * a mountpoint on the client too, to prevent inode numbers from @@ -100,87 +108,65 @@ Elong: * situation, and that different filesystems may want to use * different security flavours. 
*/ -static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); struct dentry *parent; struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; int err; - dprintk("--> nfs_follow_mountpoint()\n"); + dprintk("--> nfs_d_automount()\n"); - err = -ESTALE; - if (IS_ROOT(dentry)) - goto out_err; + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; - err = -ENOMEM; + mnt = ERR_PTR(-ENOMEM); fh = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr(); if (fh == NULL || fattr == NULL) - goto out_err; + goto out; dprintk("%s: enter\n", __func__); - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); - /* Look it up again */ - parent = dget_parent(nd->path.dentry); + /* Look it up again to get its attributes */ + parent = dget_parent(path->dentry); err = server->nfs_client->rpc_ops->lookup(parent->d_inode, - &nd->path.dentry->d_name, + &path->dentry->d_name, fh, fattr); dput(parent); - if (err != 0) - goto out_err; + if (err != 0) { + mnt = ERR_PTR(err); + goto out; + } if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) - mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); + mnt = nfs_do_refmount(path->mnt, path->dentry); else - mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, - fattr); - err = PTR_ERR(mnt); + mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); if (IS_ERR(mnt)) - goto out_err; + goto out; - mntget(mnt); - err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, - &nfs_automount_list); - if (err < 0) { - mntput(mnt); - if (err == -EBUSY) - goto out_follow; - goto out_err; - } - path_put(&nd->path); - nd->path.mnt = mnt; - nd->path.dentry = dget(mnt->mnt_root); + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, 
&nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); - dprintk("%s: done, returned %d\n", __func__, err); - - dprintk("<-- nfs_follow_mountpoint() = %d\n", err); - return ERR_PTR(err); -out_err: - path_put(&nd->path); - goto out; -out_follow: - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - goto out; +out_nofree: + dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); + return mnt; } const struct inode_operations nfs_mountpoint_inode_operations = { - .follow_link = nfs_follow_mountpoint, .getattr = nfs_getattr, }; const struct inode_operations nfs_referral_inode_operations = { - .follow_link = nfs_follow_mountpoint, }; static void nfs_expire_automounts(struct work_struct *work) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 5914a1911c9..792cb13a430 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -61,584 +61,1008 @@ #define NFS_readdirres_sz (1) #define NFS_statfsres_sz (1+NFS_info_sz) + /* - * Common NFS XDR functions as inlines + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. */ -static inline __be32 * -xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle) +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) { - memcpy(p, fhandle->data, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); } -static inline __be32 * -xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle) +/* + * Handle decode buffer overflows out-of-line. 
+ */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) { - /* NFSv2 handles have a fixed length */ - fhandle->size = NFS2_FHSIZE; - memcpy(fhandle->data, p, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv2 basic data types + * + * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: + * "NFS: Network File System Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +/* + * typedef opaque nfsdata<>; + */ +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, count); + result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. 
*/ + result->count = count; + return count; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * enum stat { + * NFS_OK = 0, + * NFSERR_PERM = 1, + * NFSERR_NOENT = 2, + * NFSERR_IO = 5, + * NFSERR_NXIO = 6, + * NFSERR_ACCES = 13, + * NFSERR_EXIST = 17, + * NFSERR_NODEV = 19, + * NFSERR_NOTDIR = 20, + * NFSERR_ISDIR = 21, + * NFSERR_FBIG = 27, + * NFSERR_NOSPC = 28, + * NFSERR_ROFS = 30, + * NFSERR_NAMETOOLONG = 63, + * NFSERR_NOTEMPTY = 66, + * NFSERR_DQUOT = 69, + * NFSERR_STALE = 70, + * NFSERR_WFLUSH = 99 + * }; + */ +static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline __be32* -xdr_encode_time(__be32 *p, struct timespec *timep) +/* + * 2.3.2. ftype + * + * enum ftype { + * NFNON = 0, + * NFREG = 1, + * NFDIR = 2, + * NFBLK = 3, + * NFCHR = 4, + * NFLNK = 5 + * }; + * + */ +static __be32 *xdr_decode_ftype(__be32 *p, u32 *type) { - *p++ = htonl(timep->tv_sec); - /* Convert nanoseconds into microseconds */ - *p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0); + *type = be32_to_cpup(p++); + if (unlikely(*type > NF2FIFO)) + *type = NFBAD; return p; } -static inline __be32* -xdr_encode_current_server_time(__be32 *p, struct timespec *timep) +/* + * 2.3.3. fhandle + * + * typedef opaque fhandle[FHSIZE]; + */ +static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh) { - /* - * Passing the invalid value useconds=1000000 is a - * Sun convention for "set to current server time". - * It's needed to make permissions checks for the - * "touch" program across v2 mounts to Solaris and - * Irix boxes work correctly. 
See description of - * sattr in section 6.1 of "NFS Illustrated" by - * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 - */ - *p++ = htonl(timep->tv_sec); - *p++ = htonl(1000000); + __be32 *p; + + BUG_ON(fh->size != NFS2_FHSIZE); + p = xdr_reserve_space(xdr, NFS2_FHSIZE); + memcpy(p, fh->data, NFS2_FHSIZE); +} + +static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.4. timeval + * + * struct timeval { + * unsigned int seconds; + * unsigned int useconds; + * }; + */ +static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + if (timep->tv_nsec != 0) + *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); + else + *p++ = cpu_to_be32(0); return p; } -static inline __be32* -xdr_decode_time(__be32 *p, struct timespec *timep) +/* + * Passing the invalid value useconds=1000000 is a Sun convention for + * "set to current server time". It's needed to make permissions checks + * for the "touch" program across v2 mounts to Solaris and Irix servers + * work correctly. See description of sattr in section 6.1 of "NFS + * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. + */ +static __be32 *xdr_encode_current_server_time(__be32 *p, + const struct timespec *timep) { - timep->tv_sec = ntohl(*p++); - /* Convert microseconds into nanoseconds */ - timep->tv_nsec = ntohl(*p++) * 1000; + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(1000000); return p; } -static __be32 * -xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) +static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; + return p; +} + +/* + * 2.3.5. 
fattr + * + * struct fattr { + * ftype type; + * unsigned int mode; + * unsigned int nlink; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * unsigned int blocksize; + * unsigned int rdev; + * unsigned int blocks; + * unsigned int fsid; + * unsigned int fileid; + * timeval atime; + * timeval mtime; + * timeval ctime; + * }; + * + */ +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) { u32 rdev, type; - type = ntohl(*p++); - fattr->mode = ntohl(*p++); - fattr->nlink = ntohl(*p++); - fattr->uid = ntohl(*p++); - fattr->gid = ntohl(*p++); - fattr->size = ntohl(*p++); - fattr->du.nfs2.blocksize = ntohl(*p++); - rdev = ntohl(*p++); - fattr->du.nfs2.blocks = ntohl(*p++); - fattr->fsid.major = ntohl(*p++); - fattr->fsid.minor = 0; - fattr->fileid = ntohl(*p++); - p = xdr_decode_time(p, &fattr->atime); - p = xdr_decode_time(p, &fattr->mtime); - p = xdr_decode_time(p, &fattr->ctime); + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + fattr->valid |= NFS_ATTR_FATTR_V2; + + p = xdr_decode_ftype(p, &type); + + fattr->mode = be32_to_cpup(p++); + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + fattr->size = be32_to_cpup(p++); + fattr->du.nfs2.blocksize = be32_to_cpup(p++); + + rdev = be32_to_cpup(p++); fattr->rdev = new_decode_dev(rdev); - if (type == NFCHR && rdev == NFS2_FIFO_DEV) { + if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) { fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } + + fattr->du.nfs2.blocks = be32_to_cpup(p++); + fattr->fsid.major = be32_to_cpup(p++); + fattr->fsid.minor = 0; + fattr->fileid = be32_to_cpup(p++); + + p = xdr_decode_time(p, &fattr->atime); + p = xdr_decode_time(p, &fattr->mtime); + xdr_decode_time(p, &fattr->ctime); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.6. 
sattr + * + * struct sattr { + * unsigned int mode; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * timeval atime; + * timeval mtime; + * }; + */ + +#define NFS2_SATTR_NOT_SET (0xffffffff) + +static __be32 *xdr_time_not_set(__be32 *p) +{ + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); return p; } -static inline __be32 * -xdr_encode_sattr(__be32 *p, struct iattr *attr) +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) { - const __be32 not_set = __constant_htonl(0xFFFFFFFF); + __be32 *p; - *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; - *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; - *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set; - *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set; + p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); - if (attr->ia_valid & ATTR_ATIME_SET) { + if (attr->ia_valid & ATTR_MODE) + *p++ = cpu_to_be32(attr->ia_mode); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_UID) + *p++ = cpu_to_be32(attr->ia_uid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_GID) + *p++ = cpu_to_be32(attr->ia_gid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_SIZE) + *p++ = cpu_to_be32((u32)attr->ia_size); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + + if (attr->ia_valid & ATTR_ATIME_SET) p = xdr_encode_time(p, &attr->ia_atime); - } else if (attr->ia_valid & ATTR_ATIME) { + else if (attr->ia_valid & ATTR_ATIME) p = xdr_encode_current_server_time(p, &attr->ia_atime); - } else { - *p++ = not_set; - *p++ = not_set; - } - - if (attr->ia_valid & ATTR_MTIME_SET) { - p = xdr_encode_time(p, &attr->ia_mtime); - } else if (attr->ia_valid & ATTR_MTIME) { - p = xdr_encode_current_server_time(p, &attr->ia_mtime); - } else { - *p++ = not_set; - *p++ = not_set; - } - return p; + else + p = xdr_time_not_set(p); + if 
(attr->ia_valid & ATTR_MTIME_SET) + xdr_encode_time(p, &attr->ia_mtime); + else if (attr->ia_valid & ATTR_MTIME) + xdr_encode_current_server_time(p, &attr->ia_mtime); + else + xdr_time_not_set(p); } /* - * NFS encode functions + * 2.3.7. filename + * + * typedef string filename<MAXNAMLEN>; */ +static void encode_filename(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS2_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_filename_inline(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + /* - * Encode file handle argument - * GETATTR, READLINK, STATFS + * 2.3.8. 
path + * + * typedef string path<MAXPATHLEN>; */ -static int -nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) +static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) { - p = xdr_encode_fhandle(p, fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + __be32 *p; + + BUG_ON(length > NFS2_MAXPATHLEN); + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_path(struct xdr_stream *xdr) +{ + u32 length, recvd; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p); + if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) + goto out_size; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(length > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, length); + xdr_terminate_string(xdr->buf, length); return 0; +out_size: + dprintk("NFS: returned pathname too long: %u\n", length); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "length %u > received %u\n", length, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode SETATTR arguments + * 2.3.9. 
attrstat + * + * union attrstat switch (stat status) { + * case NFS_OK: + * fattr attributes; + * default: + * void; + * }; */ -static int -nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args) +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Encode directory ops argument - * LOOKUP, RMDIR + * 2.3.10. diropargs + * + * struct diropargs { + * fhandle dir; + * filename name; + * }; */ -static int -nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args) +static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_fhandle(xdr, fh); + encode_filename(xdr, name, length); } /* - * Encode REMOVE argument + * 2.3.11. 
diropres + * + * union diropres switch (stat status) { + * case NFS_OK: + * struct { + * fhandle file; + * fattr attributes; + * } diropok; + * default: + * void; + * }; */ -static int -nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name.name, args->name.len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + int error; + + error = decode_fhandle(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_fattr(xdr, result->fattr); +out: + return error; +} + +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_diropok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } + /* - * Arguments to a READ call. Since we read data directly into the page - * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page we want to fetch. + * NFSv2 XDR encode functions + * + * NFSv2 argument types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". */ -static int -nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) + +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - u32 offset = (u32)args->offset; + encode_fhandle(xdr, fh); +} + +/* + * 2.2.3. 
sattrargs + * + * struct sattrargs { + * fhandle file; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_sattrargs *args) +{ + encode_fhandle(xdr, args->fh); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_diropargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); +} + +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readlinkargs *args) +{ + encode_fhandle(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); +} + +/* + * 2.2.7. readargs + * + * struct readargs { + * fhandle file; + * unsigned offset; + * unsigned count; + * unsigned totalcount; + * }; + */ +static void encode_readargs(struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + u32 offset = args->offset; u32 count = args->count; + __be32 *p; - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(offset); - *p++ = htonl(count); - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + encode_fhandle(xdr, args->fh); - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, - args->pages, args->pgbase, count); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + *p = cpu_to_be32(count); +} + +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_readargs(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; - return 0; } /* - * Decode READ reply + * 2.2.9. 
writeargs + * + * struct writeargs { + * fhandle file; + * unsigned beginoffset; + * unsigned offset; + * unsigned totalcount; + * nfsdata data; + * }; */ -static int -nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) +static void encode_writeargs(struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct kvec *iov = req->rq_rcv_buf.head; - size_t hdrlen; - u32 count, recvd; - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - p = xdr_decode_fattr(p, res->fattr); - - count = ntohl(*p++); - res->eof = 0; - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READ reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READ header is short. iovec will be shifted.\n"); - xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); - } + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; - recvd = req->rq_rcv_buf.len - hdrlen; - if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - count = recvd; - } + encode_fhandle(xdr, args->fh); - dprintk("RPC: readres OK count %u\n", count); - if (count < res->count) - res->count = count; + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); - return count; + /* nfsdata */ + *p = cpu_to_be32(count); + xdr_write_pages(xdr, args->pages, args->pgbase, count); } +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_writeargs(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} /* - * Write arguments. Splice the buffer to be written into the iovec. + * 2.2.10. 
createargs + * + * struct createargs { + * diropargs where; + * sattr attributes; + * }; */ -static int -nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_createargs *args) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - u32 offset = (u32)args->offset; - u32 count = args->count; - - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(offset); - *p++ = htonl(offset); - *p++ = htonl(count); - *p++ = htonl(count); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); + encode_diropargs(xdr, args->fh, args->name, args->len); + encode_sattr(xdr, args->sattr); +} - /* Copy the page array */ - xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); - sndbuf->flags |= XDRBUF_WRITE; - return 0; +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs(xdr, args->fh, args->name.name, args->name.len); } /* - * Encode create arguments - * CREATE, MKDIR + * 2.2.12. renameargs + * + * struct renameargs { + * diropargs from; + * diropargs to; + * }; */ -static int -nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args) +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs(xdr, args->old_dir, old->name, old->len); + encode_diropargs(xdr, args->new_dir, new->name, new->len); } /* - * Encode RENAME arguments + * 2.2.13. 
linkargs + * + * struct linkargs { + * fhandle from; + * diropargs to; + * }; */ -static int -nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_linkargs *args) { - p = xdr_encode_fhandle(p, args->old_dir); - p = xdr_encode_array(p, args->old_name->name, args->old_name->len); - p = xdr_encode_fhandle(p, args->new_dir); - p = xdr_encode_array(p, args->new_name->name, args->new_name->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_fhandle(xdr, args->fromfh); + encode_diropargs(xdr, args->tofh, args->toname, args->tolen); } /* - * Encode LINK arguments + * 2.2.14. symlinkargs + * + * struct symlinkargs { + * diropargs from; + * path to; + * sattr attributes; + * }; */ -static int -nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args) +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_symlinkargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); + encode_path(xdr, args->pages, args->pathlen); + encode_sattr(xdr, args->sattr); } /* - * Encode SYMLINK arguments + * 2.2.17. 
readdirargs + * + * struct readdirargs { + * fhandle dir; + * nfscookie cookie; + * unsigned count; + * }; */ -static int -nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args) +static void encode_readdirargs(struct xdr_stream *xdr, + const struct nfs_readdirargs *args) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - size_t pad; + __be32 *p; - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - *p++ = htonl(args->pathlen); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); + encode_fhandle(xdr, args->fh); - xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen); + p = xdr_reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(args->cookie); + *p = cpu_to_be32(args->count); +} - /* - * xdr_encode_pages may have added a few bytes to ensure the - * pathname ends on a 4-byte boundary. Start encoding the - * attributes after the pad bytes. - */ - pad = sndbuf->tail->iov_len; - if (pad > 0) - p++; - p = xdr_encode_sattr(p, args->sattr); - sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad; - return 0; +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + encode_readdirargs(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS_readdirres_sz); } /* - * Encode arguments to readdir call + * NFSv2 XDR decode functions + * + * NFSv2 result types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". 
*/ -static int -nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) + +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - u32 count = args->count; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->cookie); - *p++ = htonl(count); /* see above */ - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + return decode_attrstat(xdr, result); +} - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); - return 0; +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_diropok *result) +{ + return decode_diropres(xdr, result); } /* - * Decode the result of a readdir call. - * We're not really decoding anymore, we just leave the buffer untouched - * and only check that it is syntactically correct. - * The real decoding happens in nfs_decode_entry below, called directly - * from nfs_readdir for each entry. + * 2.2.6. 
readlinkres + * + * union readlinkres switch (stat status) { + * case NFS_OK: + * path data; + * default: + * void; + * }; */ -static int -nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - struct page **page; - size_t hdrlen; - unsigned int pglen, recvd; - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READDIR reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READDIR header is short. iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_path(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} - pglen = rcvbuf->page_len; - recvd = rcvbuf->len - hdrlen; - if (pglen > recvd) - pglen = recvd; - page = rcvbuf->pages; - return pglen; +/* + * 2.2.7. 
readres + * + * union readres switch (stat status) { + * case NFS_OK: + * fattr attributes; + * nfsdata data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_nfsdata(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) { - dprintk("nfs: %s: prematurely hit end of receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); + /* All NFSv2 writes are "file sync" writes */ + result->verf->committed = NFS_FILE_SYNC; + return decode_attrstat(xdr, result->fattr); } -__be32 * -nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) +/** + * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 2.2.17. 
entry + * + * struct entry { + * unsigned fileid; + * filename name; + * nfscookie cookie; + * entry *nextentry; + * }; + */ +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) { __be32 *p; + int error; + p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (unlikely(p == NULL)) goto out_overflow; - if (!ntohl(*p++)) { + if (*p++ == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (unlikely(p == NULL)) goto out_overflow; - if (!ntohl(*p++)) - return ERR_PTR(-EAGAIN); + if (*p++ == xdr_zero) + return -EAGAIN; entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); + return -EBADCOOKIE; } - p = xdr_inline_decode(xdr, 8); - if (unlikely(!p)) + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) goto out_overflow; + entry->ino = be32_to_cpup(p); - entry->ino = ntohl(*p++); - entry->len = ntohl(*p++); + error = decode_filename_inline(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; - p = xdr_inline_decode(xdr, entry->len + 4); - if (unlikely(!p)) + /* + * The type (size and byte order) of nfscookie isn't defined in + * RFC 1094. This implementation assumes that it's an XDR uint32. 
+ */ + entry->prev_cookie = entry->cookie; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) goto out_overflow; - entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); - entry->prev_cookie = entry->cookie; - entry->cookie = ntohl(*p++); + entry->cookie = be32_to_cpup(p); entry->d_type = DT_UNKNOWN; - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; - - return p; + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EAGAIN); -} - -/* - * NFS XDR decode functions - */ -/* - * Decode simple status reply - */ -static int -nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy) -{ - int status; - - if ((status = ntohl(*p++)) != 0) - status = nfs_stat_to_errno(status); - return status; + return -EAGAIN; } /* - * Decode attrstat reply - * GETATTR, SETATTR, WRITE - */ -static int -nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) -{ - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - xdr_decode_fattr(p, fattr); - return 0; -} - -/* - * Decode diropres reply - * LOOKUP, CREATE, MKDIR + * 2.2.17. readdirres + * + * union readdirres switch (stat status) { + * case NFS_OK: + * struct { + * entry *entries; + * bool eof; + * } readdirok; + * default: + * void; + * }; + * + * Read the directory contents into the page cache, but don't + * touch them. The actual decoding is done by nfs2_decode_dirent() + * during subsequent nfs_readdir() calls. 
*/ -static int -nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) +static int decode_readdirok(struct xdr_stream *xdr) { - int status; + u32 recvd, pglen; + size_t hdrlen; - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - p = xdr_decode_fhandle(p, res->fh); - xdr_decode_fattr(p, res->fattr); - return 0; + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; } -/* - * Encode READLINK args - */ -static int -nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - p = xdr_encode_fhandle(p, args->fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); - return 0; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_readdirok(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Decode READLINK reply + * 2.2.18. 
statfsres + * + * union statfsres (stat status) { + * case NFS_OK: + * struct { + * unsigned tsize; + * unsigned bsize; + * unsigned blocks; + * unsigned bfree; + * unsigned bavail; + * } info; + * default: + * void; + * }; */ -static int -nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) +static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; - u32 len, recvd; - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - /* Convert length of symlink */ - len = ntohl(*p++); - if (len >= rcvbuf->page_len) { - dprintk("nfs: server returned giant symlink!\n"); - return -ENAMETOOLONG; - } - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READLINK reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READLINK header is short. 
iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } - recvd = req->rq_rcv_buf.len - hdrlen; - if (recvd < len) { - dprintk("NFS: server cheating in readlink reply: " - "count %u > recvd %u\n", len, recvd); - return -EIO; - } + __be32 *p; - xdr_terminate_string(rcvbuf, len); + p = xdr_inline_decode(xdr, NFS_info_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + result->tsize = be32_to_cpup(p++); + result->bsize = be32_to_cpup(p++); + result->blocks = be32_to_cpup(p++); + result->bfree = be32_to_cpup(p++); + result->bavail = be32_to_cpup(p); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -/* - * Decode WRITE reply - */ -static int -nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs2_fsstat *result) { - res->verf->committed = NFS_FILE_SYNC; - return nfs_xdr_attrstat(req, p, res->fattr); + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_info(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } -/* - * Decode STATFS reply - */ -static int -nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res) -{ - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - - res->tsize = ntohl(*p++); - res->bsize = ntohl(*p++); - res->blocks = ntohl(*p++); - res->bfree = ntohl(*p++); - res->bavail = ntohl(*p++); - return 0; -} /* * We need to translate between nfs status return values and * the local errno values which may not be the same. */ -static struct { +static const struct { int stat; int errno; } nfs_errtbl[] = { @@ -678,28 +1102,30 @@ static struct { { -1, -EIO } }; -/* - * Convert an NFS error code to a local one. - * This one is used jointly by NFSv2 and NFSv3. 
+/** + * nfs_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. */ -int -nfs_stat_to_errno(int stat) +int nfs_stat_to_errno(enum nfs_stat status) { int i; for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == stat) + if (nfs_errtbl[i].stat == (int)status) return nfs_errtbl[i].errno; } - dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); + dprintk("NFS: Unrecognized nfs status value: %u\n", status); return nfs_errtbl[i].errno; } #define PROC(proc, argtype, restype, timer) \ [NFSPROC_##proc] = { \ .p_proc = NFSPROC_##proc, \ - .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ + .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \ .p_arglen = NFS_##argtype##_sz, \ .p_replen = NFS_##restype##_sz, \ .p_timer = timer, \ @@ -707,21 +1133,21 @@ nfs_stat_to_errno(int stat) .p_name = #proc, \ } struct rpc_procinfo nfs_procedures[] = { - PROC(GETATTR, fhandle, attrstat, 1), - PROC(SETATTR, sattrargs, attrstat, 0), - PROC(LOOKUP, diropargs, diropres, 2), - PROC(READLINK, readlinkargs, readlinkres, 3), - PROC(READ, readargs, readres, 3), - PROC(WRITE, writeargs, writeres, 4), - PROC(CREATE, createargs, diropres, 0), - PROC(REMOVE, removeargs, stat, 0), - PROC(RENAME, renameargs, stat, 0), - PROC(LINK, linkargs, stat, 0), - PROC(SYMLINK, symlinkargs, stat, 0), - PROC(MKDIR, createargs, diropres, 0), - PROC(RMDIR, diropargs, stat, 0), - PROC(READDIR, readdirargs, readdirres, 3), - PROC(STATFS, fhandle, statfsres, 0), + PROC(GETATTR, fhandle, attrstat, 1), + PROC(SETATTR, sattrargs, attrstat, 0), + PROC(LOOKUP, diropargs, diropres, 2), + PROC(READLINK, readlinkargs, readlinkres, 3), + PROC(READ, readargs, readres, 3), + PROC(WRITE, writeargs, writeres, 4), + 
PROC(CREATE, createargs, diropres, 0), + PROC(REMOVE, removeargs, stat, 0), + PROC(RENAME, renameargs, stat, 0), + PROC(LINK, linkargs, stat, 0), + PROC(SYMLINK, symlinkargs, stat, 0), + PROC(MKDIR, createargs, diropres, 0), + PROC(RMDIR, diropargs, stat, 0), + PROC(READDIR, readdirargs, readdirres, 3), + PROC(STATFS, fhandle, statfsres, 0), }; struct rpc_version nfs_version2 = { diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index f6cc60f06da..01c5e8b1941 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -37,18 +37,16 @@ #define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) #define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) #define NFS3_fattr_sz (21) -#define NFS3_wcc_attr_sz (6) +#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2) +#define NFS3_wcc_attr_sz (6) #define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) #define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) -#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) -#define NFS3_fsstat_sz -#define NFS3_fsinfo_sz -#define NFS3_pathconf_sz -#define NFS3_entry_sz (NFS3_filename_sz+3) - -#define NFS3_sattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) #define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) -#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) + +#define NFS3_getattrargs_sz (NFS3_fh_sz) +#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz) #define NFS3_accessargs_sz (NFS3_fh_sz+1) #define NFS3_readlinkargs_sz (NFS3_fh_sz) #define NFS3_readargs_sz (NFS3_fh_sz+3) @@ -57,14 +55,16 @@ #define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) #define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) #define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) +#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) #define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) #define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) -#define NFS3_readdirargs_sz 
(NFS3_fh_sz+2) +#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3) +#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4) #define NFS3_commitargs_sz (NFS3_fh_sz+3) -#define NFS3_attrstat_sz (1+NFS3_fattr_sz) -#define NFS3_wccstat_sz (1+NFS3_wcc_data_sz) -#define NFS3_removeres_sz (NFS3_wccstat_sz) +#define NFS3_getattrres_sz (1+NFS3_fattr_sz) +#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz) +#define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) #define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) @@ -100,1079 +100,2362 @@ static const umode_t nfs_type2fmt[] = { [NF3FIFO] = S_IFIFO, }; +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) { - dprintk("nfs: %s: prematurely hit end of receive buffer. " + dprintk("NFS: %s prematurely hit the end of our receive buffer. " "Remaining buffer length is %tu words.\n", func, xdr->end - xdr->p); } + /* - * Common NFS XDR functions as inlines + * Encode/decode NFSv3 basic data types + * + * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: + * "NFS Version 3 Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. 
*/ -static inline __be32 * -xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh) + +static void encode_uint32(struct xdr_stream *xdr, u32 value) { - return xdr_encode_array(p, fh->data, fh->size); + __be32 *p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); } -static inline __be32 * -xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh) +static int decode_uint32(struct xdr_stream *xdr, u32 *value) { - if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { - memcpy(fh->data, p, fh->size); - return p + XDR_QUADLEN(fh->size); - } - return NULL; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *value = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_uint64(struct xdr_stream *xdr, u64 *value) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + goto out_overflow; + xdr_decode_hyper(p, value); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * fileid3 + * + * typedef uint64 fileid3; + */ +static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid) +{ + return xdr_decode_hyper(p, fileid); +} + +static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid) +{ + return decode_uint64(xdr, fileid); +} + +/* + * filename3 + * + * typedef string filename3<>; + */ +static void encode_filename3(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS3_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); } -static inline __be32 * -xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh) +static int decode_inline_filename3(struct xdr_stream *xdr, + const char **name, u32 *length) { __be32 *p; + u32 count; + p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, 
count); + if (unlikely(p == NULL)) goto out_overflow; - fh->size = ntohl(*p++); + *name = (const char *)p; + *length = count; + return 0; - if (fh->size <= NFS3_FHSIZE) { - p = xdr_inline_decode(xdr, fh->size); - if (unlikely(!p)) - goto out_overflow; - memcpy(fh->data, p, fh->size); - return p + XDR_QUADLEN(fh->size); - } - return NULL; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * nfspath3 + * + * typedef string nfspath3<>; + */ +static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, + const u32 length) +{ + BUG_ON(length > NFS3_MAXPATHLEN); + encode_uint32(xdr, length); + xdr_write_pages(xdr, pages, 0, length); +} +static int decode_nfspath3(struct xdr_stream *xdr) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) + goto out_nametoolong; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, count); + xdr_terminate_string(xdr->buf, count); + return 0; + +out_nametoolong: + dprintk("NFS: returned pathname too long: %u\n", count); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "count %u > recvd %u\n", count, recvd); + return -EIO; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return -EIO; } /* - * Encode/decode time. 
+ * cookie3 + * + * typedef uint64 cookie3 */ -static inline __be32 * -xdr_encode_time3(__be32 *p, struct timespec *timep) +static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie) { - *p++ = htonl(timep->tv_sec); - *p++ = htonl(timep->tv_nsec); - return p; + return xdr_encode_hyper(p, cookie); } -static inline __be32 * -xdr_decode_time3(__be32 *p, struct timespec *timep) +static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie) { - timep->tv_sec = ntohl(*p++); - timep->tv_nsec = ntohl(*p++); - return p; + return decode_uint64(xdr, cookie); +} + +/* + * cookieverf3 + * + * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE]; + */ +static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) +{ + memcpy(p, verifier, NFS3_COOKIEVERFSIZE); + return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); +} + +static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_COOKIEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * createverf3 + * + * typedef opaque createverf3[NFS3_CREATEVERFSIZE]; + */ +static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE); + memcpy(p, verifier, NFS3_CREATEVERFSIZE); +} + +static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_WRITEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * size3 + * + * typedef uint64 size3; + */ +static __be32 *xdr_decode_size3(__be32 *p, u64 *size) +{ + return xdr_decode_hyper(p, size); +} + +/* + * nfsstat3 + * + * enum nfsstat3 { + * NFS3_OK = 0, + * ... 
+ * } + */ +#define NFS3_OK NFS_OK + +static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * ftype3 + * + * enum ftype3 { + * NF3REG = 1, + * NF3DIR = 2, + * NF3BLK = 3, + * NF3CHR = 4, + * NF3LNK = 5, + * NF3SOCK = 6, + * NF3FIFO = 7 + * }; + */ +static void encode_ftype3(struct xdr_stream *xdr, const u32 type) +{ + BUG_ON(type > NF3FIFO); + encode_uint32(xdr, type); } -static __be32 * -xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) +static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode) { - unsigned int type, major, minor; - umode_t fmode; + u32 type; - type = ntohl(*p++); + type = be32_to_cpup(p++); if (type > NF3FIFO) type = NF3NON; - fmode = nfs_type2fmt[type]; - fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; - fattr->nlink = ntohl(*p++); - fattr->uid = ntohl(*p++); - fattr->gid = ntohl(*p++); - p = xdr_decode_hyper(p, &fattr->size); - p = xdr_decode_hyper(p, &fattr->du.nfs3.used); - - /* Turn remote device info into Linux-specific dev_t */ - major = ntohl(*p++); - minor = ntohl(*p++); - fattr->rdev = MKDEV(major, minor); - if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor) - fattr->rdev = 0; + *mode = nfs_type2fmt[type]; + return p; +} - p = xdr_decode_hyper(p, &fattr->fsid.major); - fattr->fsid.minor = 0; - p = xdr_decode_hyper(p, &fattr->fileid); - p = xdr_decode_time3(p, &fattr->atime); - p = xdr_decode_time3(p, &fattr->mtime); - p = xdr_decode_time3(p, &fattr->ctime); +/* + * specdata3 + * + * struct specdata3 { + * uint32 specdata1; + * uint32 specdata2; + * }; + */ +static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev) +{ + __be32 *p; - /* Update the mode bits */ - fattr->valid |= NFS_ATTR_FATTR_V3; + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(MAJOR(rdev)); + *p = 
cpu_to_be32(MINOR(rdev)); +} + +static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev) +{ + unsigned int major, minor; + + major = be32_to_cpup(p++); + minor = be32_to_cpup(p++); + *rdev = MKDEV(major, minor); + if (MAJOR(*rdev) != major || MINOR(*rdev) != minor) + *rdev = 0; + return p; +} + +/* + * nfs_fh3 + * + * struct nfs_fh3 { + * opaque data<NFS3_FHSIZE>; + * }; + */ +static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + BUG_ON(fh->size > NFS3_FHSIZE); + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); +} + +static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > NFS3_FHSIZE)) + goto out_toobig; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = length; + memcpy(fh->data, p, length); + return 0; +out_toobig: + dprintk("NFS: file handle size (%u) too big\n", length); + return -E2BIG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static void zero_nfs_fh3(struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(*fh)); +} + +/* + * nfstime3 + * + * struct nfstime3 { + * uint32 seconds; + * uint32 nseconds; + * }; + */ +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(timep->tv_nsec); return p; } -static inline __be32 * -xdr_encode_sattr(__be32 *p, struct iattr *attr) +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) { + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++); + return p; +} + +/* + * sattr3 + * + * enum time_how { + * DONT_CHANGE = 0, + * SET_TO_SERVER_TIME = 1, + * SET_TO_CLIENT_TIME = 2 + * }; + * + * union set_mode3 switch (bool set_it) { + * case TRUE: + * mode3 mode; + * default: + * void; + * }; 
+ * + * union set_uid3 switch (bool set_it) { + * case TRUE: + * uid3 uid; + * default: + * void; + * }; + * + * union set_gid3 switch (bool set_it) { + * case TRUE: + * gid3 gid; + * default: + * void; + * }; + * + * union set_size3 switch (bool set_it) { + * case TRUE: + * size3 size; + * default: + * void; + * }; + * + * union set_atime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 atime; + * default: + * void; + * }; + * + * union set_mtime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 mtime; + * default: + * void; + * }; + * + * struct sattr3 { + * set_mode3 mode; + * set_uid3 uid; + * set_gid3 gid; + * set_size3 size; + * set_atime atime; + * set_mtime mtime; + * }; + */ +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +{ + u32 nbytes; + __be32 *p; + + /* + * In order to make only a single xdr_reserve_space() call, + * pre-compute the total number of bytes to be reserved. + * Six boolean values, one for each set_foo field, are always + * present in the encoded result, so start there. 
+ */ + nbytes = 6 * 4; + if (attr->ia_valid & ATTR_MODE) + nbytes += 4; + if (attr->ia_valid & ATTR_UID) + nbytes += 4; + if (attr->ia_valid & ATTR_GID) + nbytes += 4; + if (attr->ia_valid & ATTR_SIZE) + nbytes += 8; + if (attr->ia_valid & ATTR_ATIME_SET) + nbytes += 8; + if (attr->ia_valid & ATTR_MTIME_SET) + nbytes += 8; + p = xdr_reserve_space(xdr, nbytes); + if (attr->ia_valid & ATTR_MODE) { *p++ = xdr_one; - *p++ = htonl(attr->ia_mode & S_IALLUGO); - } else { + *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO); + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_UID) { *p++ = xdr_one; - *p++ = htonl(attr->ia_uid); - } else { + *p++ = cpu_to_be32(attr->ia_uid); + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_GID) { *p++ = xdr_one; - *p++ = htonl(attr->ia_gid); - } else { + *p++ = cpu_to_be32(attr->ia_gid); + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_SIZE) { *p++ = xdr_one; - p = xdr_encode_hyper(p, (__u64) attr->ia_size); - } else { + p = xdr_encode_hyper(p, (u64)attr->ia_size); + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_ATIME_SET) { *p++ = xdr_two; - p = xdr_encode_time3(p, &attr->ia_atime); + p = xdr_encode_nfstime3(p, &attr->ia_atime); } else if (attr->ia_valid & ATTR_ATIME) { *p++ = xdr_one; - } else { + } else *p++ = xdr_zero; - } + if (attr->ia_valid & ATTR_MTIME_SET) { *p++ = xdr_two; - p = xdr_encode_time3(p, &attr->ia_mtime); + xdr_encode_nfstime3(p, &attr->ia_mtime); } else if (attr->ia_valid & ATTR_MTIME) { - *p++ = xdr_one; - } else { - *p++ = xdr_zero; - } - return p; + *p = xdr_one; + } else + *p = xdr_zero; +} + +/* + * fattr3 + * + * struct fattr3 { + * ftype3 type; + * mode3 mode; + * uint32 nlink; + * uid3 uid; + * gid3 gid; + * size3 size; + * size3 used; + * specdata3 rdev; + * uint64 fsid; + * fileid3 fileid; + * nfstime3 atime; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + umode_t fmode; + __be32 *p; + + p = 
xdr_inline_decode(xdr, NFS3_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + p = xdr_decode_ftype3(p, &fmode); + + fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + + p = xdr_decode_size3(p, &fattr->size); + p = xdr_decode_size3(p, &fattr->du.nfs3.used); + p = xdr_decode_specdata3(p, &fattr->rdev); + + p = xdr_decode_hyper(p, &fattr->fsid.major); + fattr->fsid.minor = 0; + + p = xdr_decode_fileid3(p, &fattr->fileid); + p = xdr_decode_nfstime3(p, &fattr->atime); + p = xdr_decode_nfstime3(p, &fattr->mtime); + xdr_decode_nfstime3(p, &fattr->ctime); + + fattr->valid |= NFS_ATTR_FATTR_V3; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline __be32 * -xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) +/* + * post_op_attr + * + * union post_op_attr switch (bool attributes_follow) { + * case TRUE: + * fattr3 attributes; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) { - p = xdr_decode_hyper(p, &fattr->pre_size); - p = xdr_decode_time3(p, &fattr->pre_mtime); - p = xdr_decode_time3(p, &fattr->pre_ctime); + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_fattr3(xdr, fattr); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * wcc_attr + * struct wcc_attr { + * size3 size; + * nfstime3 mtime; + * nfstime3 ctime; + * }; + */ +static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + fattr->valid |= NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME | NFS_ATTR_FATTR_PRECTIME; - return p; -} -static inline __be32 * -xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) -{ - 
if (*p++) - p = xdr_decode_fattr(p, fattr); - return p; + p = xdr_decode_size3(p, &fattr->pre_size); + p = xdr_decode_nfstime3(p, &fattr->pre_mtime); + xdr_decode_nfstime3(p, &fattr->pre_ctime); + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline __be32 * -xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr) +/* + * pre_op_attr + * union pre_op_attr switch (bool attributes_follow) { + * case TRUE: + * wcc_attr attributes; + * case FALSE: + * void; + * }; + * + * wcc_data + * + * struct wcc_data { + * pre_op_attr before; + * post_op_attr after; + * }; + */ +static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) { __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + if (unlikely(p == NULL)) goto out_overflow; - if (ntohl(*p++)) { - p = xdr_inline_decode(xdr, 84); - if (unlikely(!p)) - goto out_overflow; - p = xdr_decode_fattr(p, fattr); - } - return p; + if (*p != xdr_zero) + return decode_wcc_attr(xdr, fattr); + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return -EIO; } -static inline __be32 * -xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) { - if (*p++) - return xdr_decode_wcc_attr(p, fattr); - return p; + int error; + + error = decode_pre_op_attr(xdr, fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, fattr); +out: + return error; } +/* + * post_op_fh3 + * + * union post_op_fh3 switch (bool handle_follows) { + * case TRUE: + * nfs_fh3 handle; + * case FALSE: + * void; + * }; + */ +static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_nfs_fh3(xdr, fh); + zero_nfs_fh3(fh); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} 
-static inline __be32 * -xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr) +/* + * diropargs3 + * + * struct diropargs3 { + * nfs_fh3 dir; + * filename3 name; + * }; + */ +static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) { - p = xdr_decode_pre_op_attr(p, fattr); - return xdr_decode_post_op_attr(p, fattr); + encode_nfs_fh3(xdr, fh); + encode_filename3(xdr, name, length); } + /* - * NFS encode functions + * NFSv3 XDR encode functions + * + * NFSv3 argument types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". */ /* - * Encode file handle argument + * 3.3.1 GETATTR3args + * + * struct GETATTR3args { + * nfs_fh3 object; + * }; */ -static int -nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) +static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) { - p = xdr_encode_fhandle(p, fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_nfs_fh3(xdr, fh); } /* - * Encode SETATTR arguments + * 3.3.2 SETATTR3args + * + * union sattrguard3 switch (bool check) { + * case TRUE: + * nfstime3 obj_ctime; + * case FALSE: + * void; + * }; + * + * struct SETATTR3args { + * nfs_fh3 object; + * sattr3 new_attributes; + * sattrguard3 guard; + * }; */ -static int -nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args) -{ - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_sattr(p, args->sattr); - *p++ = htonl(args->guard); - if (args->guard) - p = xdr_encode_time3(p, &args->guardtime); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static void encode_sattrguard3(struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + __be32 *p; + + if (args->guard) { + p = xdr_reserve_space(xdr, 4 + 8); + *p++ = xdr_one; + xdr_encode_nfstime3(p, &args->guardtime); + } else { + p = xdr_reserve_space(xdr, 4); + *p = xdr_zero; + } +} + +static void 
nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_sattr3(xdr, args->sattr); + encode_sattrguard3(xdr, args); } /* - * Encode directory ops argument + * 3.3.3 LOOKUP3args + * + * struct LOOKUP3args { + * diropargs3 what; + * }; */ -static int -nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args) +static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_diropargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_diropargs3(xdr, args->fh, args->name, args->len); } /* - * Encode REMOVE argument + * 3.3.4 ACCESS3args + * + * struct ACCESS3args { + * nfs_fh3 object; + * uint32 access; + * }; */ -static int -nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void encode_access3args(struct xdr_stream *xdr, + const struct nfs3_accessargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name.name, args->name.len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->access); +} + +static void nfs3_xdr_enc_access3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_access3args(xdr, args); } /* - * Encode access() argument + * 3.3.5 READLINK3args + * + * struct READLINK3args { + * nfs_fh3 symlink; + * }; */ -static int -nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args) +static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readlinkargs *args) { - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->access); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_nfs_fh3(xdr, 
args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); } /* - * Arguments to a READ call. Since we read data directly into the page - * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page we want to fetch. + * 3.3.6 READ3args + * + * struct READ3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; */ -static int -nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static void encode_read3args(struct xdr_stream *xdr, + const struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - u32 count = args->count; + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); - p = xdr_encode_fhandle(p, args->fh); + p = xdr_reserve_space(xdr, 8 + 4); p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + *p = cpu_to_be32(args->count); +} - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, - args->pages, args->pgbase, count); +static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_read3args(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS3_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; - return 0; } /* - * Write arguments. Splice the buffer to be written into the iovec. 
+ * 3.3.7 WRITE3args + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * stable_how stable; + * opaque data<>; + * }; */ -static int -nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_write3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - u32 count = args->count; + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); - p = xdr_encode_fhandle(p, args->fh); + p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4); p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(count); - *p++ = htonl(args->stable); - *p++ = htonl(count); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); - - /* Copy the page array */ - xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); - sndbuf->flags |= XDRBUF_WRITE; - return 0; + *p++ = cpu_to_be32(args->count); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_write3args(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* - * Encode CREATE arguments + * 3.3.8 CREATE3args + * + * enum createmode3 { + * UNCHECKED = 0, + * GUARDED = 1, + * EXCLUSIVE = 2 + * }; + * + * union createhow3 switch (createmode3 mode) { + * case UNCHECKED: + * case GUARDED: + * sattr3 obj_attributes; + * case EXCLUSIVE: + * createverf3 verf; + * }; + * + * struct CREATE3args { + * diropargs3 where; + * createhow3 how; + * }; */ -static int -nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args) +static void encode_createhow3(struct xdr_stream *xdr, + const struct nfs3_createargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - - *p++ = 
htonl(args->createmode); - if (args->createmode == NFS3_CREATE_EXCLUSIVE) { - *p++ = args->verifier[0]; - *p++ = args->verifier[1]; - } else - p = xdr_encode_sattr(p, args->sattr); + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + encode_sattr3(xdr, args->sattr); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); + break; + default: + BUG(); + } +} - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_createhow3(xdr, args); } /* - * Encode MKDIR arguments + * 3.3.9 MKDIR3args + * + * struct MKDIR3args { + * diropargs3 where; + * sattr3 attributes; + * }; */ -static int -nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args) +static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mkdirargs *args) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_sattr3(xdr, args->sattr); } /* - * Encode SYMLINK arguments + * 3.3.10 SYMLINK3args + * + * struct symlinkdata3 { + * sattr3 symlink_attributes; + * nfspath3 symlink_data; + * }; + * + * struct SYMLINK3args { + * diropargs3 where; + * symlinkdata3 symlink; + * }; */ -static int -nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args) +static void encode_symlinkdata3(struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_sattr(p, args->sattr); - *p++ = htonl(args->pathlen); - 
req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + encode_sattr3(xdr, args->sattr); + encode_nfspath3(xdr, args->pages, args->pathlen); +} - /* Copy the page */ - xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen); - return 0; +static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); + encode_symlinkdata3(xdr, args); } /* - * Encode MKNOD arguments + * 3.3.11 MKNOD3args + * + * struct devicedata3 { + * sattr3 dev_attributes; + * specdata3 spec; + * }; + * + * union mknoddata3 switch (ftype3 type) { + * case NF3CHR: + * case NF3BLK: + * devicedata3 device; + * case NF3SOCK: + * case NF3FIFO: + * sattr3 pipe_attributes; + * default: + * void; + * }; + * + * struct MKNOD3args { + * diropargs3 where; + * mknoddata3 what; + * }; */ -static int -nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) -{ - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - *p++ = htonl(args->type); - p = xdr_encode_sattr(p, args->sattr); - if (args->type == NF3CHR || args->type == NF3BLK) { - *p++ = htonl(MAJOR(args->rdev)); - *p++ = htonl(MINOR(args->rdev)); +static void encode_devicedata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_sattr3(xdr, args->sattr); + encode_specdata3(xdr, args->rdev); +} + +static void encode_mknoddata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: + encode_devicedata3(xdr, args); + break; + case NF3SOCK: + case NF3FIFO: + encode_sattr3(xdr, args->sattr); + break; + case NF3REG: + case NF3DIR: + break; + default: + BUG(); } +} - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ 
+ encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_mknoddata3(xdr, args); } /* - * Encode RENAME arguments + * 3.3.12 REMOVE3args + * + * struct REMOVE3args { + * diropargs3 object; + * }; */ -static int -nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) -{ - p = xdr_encode_fhandle(p, args->old_dir); - p = xdr_encode_array(p, args->old_name->name, args->old_name->len); - p = xdr_encode_fhandle(p, args->new_dir); - p = xdr_encode_array(p, args->new_name->name, args->new_name->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name.name, args->name.len); } /* - * Encode LINK arguments + * 3.3.14 RENAME3args + * + * struct RENAME3args { + * diropargs3 from; + * diropargs3 to; + * }; */ -static int -nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) +static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs3(xdr, args->old_dir, old->name, old->len); + encode_diropargs3(xdr, args->new_dir, new->name, new->len); } /* - * Encode arguments to readdir call + * 3.3.15 LINK3args + * + * struct LINK3args { + * nfs_fh3 file; + * diropargs3 link; + * }; */ -static int -nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) +static void nfs3_xdr_enc_link3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_linkargs *args) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - u32 
count = args->count; - - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->cookie); - *p++ = args->verf[0]; - *p++ = args->verf[1]; - if (args->plus) { - /* readdirplus: need dircount + buffer size. - * We just make sure we make dircount big enough */ - *p++ = htonl(count >> 3); - } - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); - return 0; + encode_nfs_fh3(xdr, args->fromfh); + encode_diropargs3(xdr, args->tofh, args->toname, args->tolen); } /* - * Decode the result of a readdir call. - * We just check for syntactical correctness. + * 3.3.16 READDIR3args + * + * struct READDIR3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 count; + * }; */ -static int -nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) +static void encode_readdir3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - struct page **page; - size_t hdrlen; - u32 recvd, pglen; - int status; - - status = ntohl(*p++); - /* Decode post_op_attrs */ - p = xdr_decode_post_op_attr(p, res->dir_attr); - if (status) - return nfs_stat_to_errno(status); - /* Decode verifier cookie */ - if (res->verf) { - res->verf[0] = *p++; - res->verf[1] = *p++; - } else { - p += 2; - } + __be32 *p; - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READDIR reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READDIR header is short. 
iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } + encode_nfs_fh3(xdr, args->fh); - pglen = rcvbuf->page_len; - recvd = rcvbuf->len - hdrlen; - if (pglen > recvd) - pglen = recvd; - page = rcvbuf->pages; + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + *p = cpu_to_be32(args->count); +} - return pglen; +static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdir3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); } -__be32 * -nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) +/* + * 3.3.17 READDIRPLUS3args + * + * struct READDIRPLUS3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 dircount; + * count3 maxcount; + * }; + */ +static void encode_readdirplus3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) { __be32 *p; - struct nfs_entry old = *entry; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (!ntohl(*p++)) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (!ntohl(*p++)) - return ERR_PTR(-EAGAIN); - entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); - } - p = xdr_inline_decode(xdr, 12); - if (unlikely(!p)) - goto out_overflow; - p = xdr_decode_hyper(p, &entry->ino); - entry->len = ntohl(*p++); + encode_nfs_fh3(xdr, args->fh); - p = xdr_inline_decode(xdr, entry->len + 8); - if (unlikely(!p)) - goto out_overflow; - entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); - entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); - - entry->d_type = DT_UNKNOWN; - if (plus) { - entry->fattr->valid = 0; - p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); - if (IS_ERR(p)) - goto out_overflow_exit; - entry->d_type = 
nfs_umode_to_dtype(entry->fattr->mode); - /* In fact, a post_op_fh3: */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - if (*p++) { - p = xdr_decode_fhandle_stream(xdr, entry->fh); - if (IS_ERR(p)) - goto out_overflow_exit; - /* Ugh -- server reply was truncated */ - if (p == NULL) { - dprintk("NFS: FH truncated\n"); - *entry = old; - return ERR_PTR(-EAGAIN); - } - } else - memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); - } + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; + /* + * readdirplus: need dircount + buffer size. + * We just make sure we make dircount big enough + */ + *p++ = cpu_to_be32(args->count >> 3); - return p; + *p = cpu_to_be32(args->count); +} -out_overflow: - print_overflow_msg(__func__, xdr); -out_overflow_exit: - return ERR_PTR(-EAGAIN); +static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdirplus3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); } /* - * Encode COMMIT arguments + * 3.3.21 COMMIT3args + * + * struct COMMIT3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; */ -static int -nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_commit3args(struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - p = xdr_encode_fhandle(p, args->fh); + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(args->count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + *p = cpu_to_be32(args->count); } -#ifdef CONFIG_NFS_V3_ACL -/* - * Encode GETACL arguments - */ -static int -nfs3_xdr_getaclargs(struct rpc_rqst 
*req, __be32 *p, - struct nfs3_getaclargs *args) +static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; + encode_commit3args(xdr, args); +} - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->mask); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +#ifdef CONFIG_NFS_V3_ACL - if (args->mask & (NFS_ACL | NFS_DFACL)) { - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + - ACL3_getaclres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, - NFSACL_MAXPAGES << PAGE_SHIFT); - } - return 0; +static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_getaclargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->mask); + if (args->mask & (NFS_ACL | NFS_DFACL)) + prepare_reply_buffer(req, args->pages, 0, + NFSACL_MAXPAGES << PAGE_SHIFT, + ACL3_getaclres_sz); } -/* - * Encode SETACL arguments - */ -static int -nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, - struct nfs3_setaclargs *args) +static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_setaclargs *args) { - struct xdr_buf *buf = &req->rq_snd_buf; unsigned int base; - int err; - - p = xdr_encode_fhandle(p, NFS_FH(args->inode)); - *p++ = htonl(args->mask); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - base = req->rq_slen; + int error; + encode_nfs_fh3(xdr, NFS_FH(args->inode)); + encode_uint32(xdr, args->mask); if (args->npages != 0) - xdr_encode_pages(buf, args->pages, 0, args->len); - else - req->rq_slen = xdr_adjust_iovec(req->rq_svec, - p + XDR_QUADLEN(args->len)); + xdr_write_pages(xdr, args->pages, 0, args->len); - err = nfsacl_encode(buf, base, args->inode, + base = req->rq_slen; + error = nfsacl_encode(xdr->buf, base, args->inode, (args->mask & NFS_ACL) ? 
args->acl_access : NULL, 1, 0); - if (err > 0) - err = nfsacl_encode(buf, base + err, args->inode, - (args->mask & NFS_DFACL) ? - args->acl_default : NULL, 1, - NFS_ACL_DEFAULT); - return (err > 0) ? 0 : err; + BUG_ON(error < 0); + error = nfsacl_encode(xdr->buf, base + error, args->inode, + (args->mask & NFS_DFACL) ? + args->acl_default : NULL, 1, + NFS_ACL_DEFAULT); + BUG_ON(error < 0); } + #endif /* CONFIG_NFS_V3_ACL */ /* - * NFS XDR decode functions + * NFSv3 XDR decode functions + * + * NFSv3 result types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". */ /* - * Decode attrstat reply. + * 3.3.1 GETATTR3res + * + * struct GETATTR3resok { + * fattr3 obj_attributes; + * }; + * + * union GETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * GETATTR3resok resok; + * default: + * void; + * }; */ -static int -nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) { - int status; - - if ((status = ntohl(*p++))) - return nfs_stat_to_errno(status); - xdr_decode_fattr(p, fattr); - return 0; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_fattr3(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Decode status+wcc_data reply - * SATTR, REMOVE, RMDIR + * 3.3.2 SETATTR3res + * + * struct SETATTR3resok { + * wcc_data obj_wcc; + * }; + * + * struct SETATTR3resfail { + * wcc_data obj_wcc; + * }; + * + * union SETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * SETATTR3resok resok; + * default: + * SETATTR3resfail resfail; + * }; */ -static int -nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr 
*result) { - int status; - - if ((status = ntohl(*p++))) - status = nfs_stat_to_errno(status); - xdr_decode_wcc_data(p, fattr); - return status; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); } -static int -nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) +/* + * 3.3.3 LOOKUP3res + * + * struct LOOKUP3resok { + * nfs_fh3 object; + * post_op_attr obj_attributes; + * post_op_attr dir_attributes; + * }; + * + * struct LOOKUP3resfail { + * post_op_attr dir_attributes; + * }; + * + * union LOOKUP3res switch (nfsstat3 status) { + * case NFS3_OK: + * LOOKUP3resok resok; + * default: + * LOOKUP3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) { - return nfs3_xdr_wccstat(req, p, res->dir_attr); + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfs_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->dir_attr); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); } /* - * Decode LOOKUP reply + * 3.3.4 ACCESS3res + * + * struct ACCESS3resok { + * post_op_attr obj_attributes; + * uint32 access; + * }; + * + * struct ACCESS3resfail { + * post_op_attr obj_attributes; + * }; + * + * union ACCESS3res switch (nfsstat3 status) { + * case NFS3_OK: + * ACCESS3resok resok; + * default: + * ACCESS3resfail resfail; + * }; */ -static int 
-nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_accessres *result) { - int status; - - if ((status = ntohl(*p++))) { - status = nfs_stat_to_errno(status); - } else { - if (!(p = xdr_decode_fhandle(p, res->fh))) - return -errno_NFSERR_IO; - p = xdr_decode_post_op_attr(p, res->fattr); - } - xdr_decode_post_op_attr(p, res->dir_attr); - return status; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_uint32(xdr, &result->access); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Decode ACCESS reply + * 3.3.5 READLINK3res + * + * struct READLINK3resok { + * post_op_attr symlink_attributes; + * nfspath3 data; + * }; + * + * struct READLINK3resfail { + * post_op_attr symlink_attributes; + * }; + * + * union READLINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * READLINK3resok resok; + * default: + * READLINK3resfail resfail; + * }; */ -static int -nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) +static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) { - int status = ntohl(*p++); - - p = xdr_decode_post_op_attr(p, res->fattr); - if (status) - return nfs_stat_to_errno(status); - res->access = ntohl(*p++); - return 0; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfspath3(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } -static int -nfs3_xdr_readlinkargs(struct rpc_rqst *req, 
__be32 *p, struct nfs3_readlinkargs *args) +/* + * 3.3.6 READ3res + * + * struct READ3resok { + * post_op_attr file_attributes; + * count3 count; + * bool eof; + * opaque data<>; + * }; + * + * struct READ3resfail { + * post_op_attr file_attributes; + * }; + * + * union READ3res switch (nfsstat3 status) { + * case NFS3_OK: + * READ3resok resok; + * default: + * READ3resfail resfail; + * }; + */ +static int decode_read3resok(struct xdr_stream *xdr, + struct nfs_readres *result) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; + u32 eof, count, ocount, recvd; + size_t hdrlen; + __be32 *p; - p = xdr_encode_fhandle(p, args->fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + p = xdr_inline_decode(xdr, 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p++); + eof = be32_to_cpup(p++); + ocount = be32_to_cpup(p++); + if (unlikely(ocount != count)) + goto out_mismatch; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; + +out: + xdr_read_pages(xdr, count); + result->eof = eof; + result->count = count; + return count; +out_mismatch: + dprintk("NFS: READ count doesn't match length of opaque: " + "count %u != ocount %u\n", count, ocount); + return -EIO; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); - return 0; +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, 
result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_read3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode READLINK reply + * 3.3.7 WRITE3res + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3resok { + * wcc_data file_wcc; + * count3 count; + * stable_how committed; + * writeverf3 verf; + * }; + * + * struct WRITE3resfail { + * wcc_data file_wcc; + * }; + * + * union WRITE3res switch (nfsstat3 status) { + * case NFS3_OK: + * WRITE3resok resok; + * default: + * WRITE3resfail resfail; + * }; */ -static int -nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int decode_write3resok(struct xdr_stream *xdr, + struct nfs_writeres *result) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; - u32 len, recvd; - int status; - - status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, fattr); - - if (status != 0) - return nfs_stat_to_errno(status); - - /* Convert length of symlink */ - len = ntohl(*p++); - if (len >= rcvbuf->page_len) { - dprintk("nfs: server returned giant symlink!\n"); - return -ENAMETOOLONG; - } + __be32 *p; - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READLINK reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READLINK header is short. 
" - "iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } - recvd = req->rq_rcv_buf.len - hdrlen; - if (recvd < len) { - dprintk("NFS: server cheating in readlink reply: " - "count %u > recvd %u\n", len, recvd); - return -EIO; - } + p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + result->count = be32_to_cpup(p++); + result->verf->committed = be32_to_cpup(p++); + if (unlikely(result->verf->committed > NFS_FILE_SYNC)) + goto out_badvalue; + memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE); + return result->count; +out_badvalue: + dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} - xdr_terminate_string(rcvbuf, len); - return 0; +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_write3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode READ reply + * 3.3.8 CREATE3res + * + * struct CREATE3resok { + * post_op_fh3 obj; + * post_op_attr obj_attributes; + * wcc_data dir_wcc; + * }; + * + * struct CREATE3resfail { + * wcc_data dir_wcc; + * }; + * + * union CREATE3res switch (nfsstat3 status) { + * case NFS3_OK: + * CREATE3resok resok; + * default: + * CREATE3resfail resfail; + * }; */ -static int -nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) +static int decode_create3resok(struct xdr_stream *xdr, + struct nfs3_diropres *result) { - struct kvec *iov = req->rq_rcv_buf.head; - size_t hdrlen; - u32 count, ocount, recvd; - int status; + int error; + + error = decode_post_op_fh3(xdr, 
result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + /* The server isn't required to return a file handle. + * If it didn't, force the client to perform a LOOKUP + * to determine the correct file handle and attribute + * values for the new object. */ + if (result->fh->size == 0) + result->fattr->valid = 0; + error = decode_wcc_data(xdr, result->dir_attr); +out: + return error; +} - status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, res->fattr); +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_create3resok(xdr, result); +out: + return error; +out_default: + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); +} - if (status != 0) - return nfs_stat_to_errno(status); +/* + * 3.3.12 REMOVE3res + * + * struct REMOVE3resok { + * wcc_data dir_wcc; + * }; + * + * struct REMOVE3resfail { + * wcc_data dir_wcc; + * }; + * + * union REMOVE3res switch (nfsstat3 status) { + * case NFS3_OK: + * REMOVE3resok resok; + * default: + * REMOVE3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_removeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} - /* Decode reply count and EOF flag. NFSv3 is somewhat redundant - * in that it puts the count both in the res struct and in the - * opaque data count. 
*/ - count = ntohl(*p++); - res->eof = ntohl(*p++); - ocount = ntohl(*p++); +/* + * 3.3.14 RENAME3res + * + * struct RENAME3resok { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * struct RENAME3resfail { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * union RENAME3res switch (nfsstat3 status) { + * case NFS3_OK: + * RENAME3resok resok; + * default: + * RENAME3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_renameres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->old_fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->new_fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} - if (ocount != count) { - dprintk("NFS: READ count doesn't match RPC opaque count.\n"); - return -errno_NFSERR_IO; - } +/* + * 3.3.15 LINK3res + * + * struct LINK3resok { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * struct LINK3resfail { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * union LINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * LINK3resok resok; + * default: + * LINK3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs3_linkres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs_stat_to_errno(status); +} - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if 
(iov->iov_len < hdrlen) { - dprintk("NFS: READ reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READ header is short. iovec will be shifted.\n"); - xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); - } +/** + * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in + * the local page cache + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 3.3.16 entry3 + * + * struct entry3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * fhandle3 filehandle; + * post_op_attr3 attributes; + * entry3 *nextentry; + * }; + * + * 3.3.17 entryplus3 + * struct entryplus3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * post_op_attr name_attributes; + * post_op_fh3 name_handle; + * entryplus3 *nextentry; + * }; + */ +int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + struct nfs_entry old = *entry; + __be32 *p; + int error; - recvd = req->rq_rcv_buf.len - hdrlen; - if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - count = recvd; - res->eof = 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; } - if (count < res->count) - res->count = count; + error = decode_fileid3(xdr, &entry->ino); + if (unlikely(error)) + return error; - return count; 
-} + error = decode_inline_filename3(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; -/* - * Decode WRITE response - */ -static int -nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) -{ - int status; + entry->prev_cookie = entry->cookie; + error = decode_cookie3(xdr, &entry->cookie); + if (unlikely(error)) + return error; - status = ntohl(*p++); - p = xdr_decode_wcc_data(p, res->fattr); + entry->d_type = DT_UNKNOWN; - if (status != 0) - return nfs_stat_to_errno(status); + if (plus) { + entry->fattr->valid = 0; + error = decode_post_op_attr(xdr, entry->fattr); + if (unlikely(error)) + return error; + if (entry->fattr->valid & NFS_ATTR_FATTR_V3) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - res->count = ntohl(*p++); - res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); - res->verf->verifier[0] = *p++; - res->verf->verifier[1] = *p++; + /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) { + error = decode_nfs_fh3(xdr, entry->fh); + if (unlikely(error)) { + if (error == -E2BIG) + goto out_truncated; + return error; + } + } else + zero_nfs_fh3(entry->fh); + } - return res->count; -} + return 0; -/* - * Decode a CREATE response - */ -static int -nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) -{ - int status; - - status = ntohl(*p++); - if (status == 0) { - if (*p++) { - if (!(p = xdr_decode_fhandle(p, res->fh))) - return -errno_NFSERR_IO; - p = xdr_decode_post_op_attr(p, res->fattr); - } else { - memset(res->fh, 0, sizeof(*res->fh)); - /* Do decode post_op_attr but set it to NULL */ - p = xdr_decode_post_op_attr(p, res->fattr); - res->fattr->valid = 0; - } - } else { - status = nfs_stat_to_errno(status); - } - p = xdr_decode_wcc_data(p, res->dir_attr); - return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +out_truncated: + dprintk("NFS: directory entry 
contains invalid file handle\n"); + *entry = old; + return -EAGAIN; } /* - * Decode RENAME reply + * 3.3.16 READDIR3res + * + * struct dirlist3 { + * entry3 *entries; + * bool eof; + * }; + * + * struct READDIR3resok { + * post_op_attr dir_attributes; + * cookieverf3 cookieverf; + * dirlist3 reply; + * }; + * + * struct READDIR3resfail { + * post_op_attr dir_attributes; + * }; + * + * union READDIR3res switch (nfsstat3 status) { + * case NFS3_OK: + * READDIR3resok resok; + * default: + * READDIR3resfail resfail; + * }; + * + * Read the directory contents into the page cache, but otherwise + * don't touch them. The actual decoding is done by nfs3_decode_entry() + * during subsequent nfs_readdir() calls. */ -static int -nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res) +static int decode_dirlist3(struct xdr_stream *xdr) { - int status; + u32 recvd, pglen; + size_t hdrlen; - if ((status = ntohl(*p++)) != 0) - status = nfs_stat_to_errno(status); - p = xdr_decode_wcc_data(p, res->old_fattr); - p = xdr_decode_wcc_data(p, res->new_fattr); - return status; + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; } -/* - * Decode LINK reply - */ -static int -nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) +static int decode_readdir3resok(struct xdr_stream *xdr, + struct nfs3_readdirres *result) { - int status; + int error; + + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + /* XXX: do we need to check if result->verf != NULL ? 
*/ + error = decode_cookieverf3(xdr, result->verf); + if (unlikely(error)) + goto out; + error = decode_dirlist3(xdr); +out: + return error; +} - if ((status = ntohl(*p++)) != 0) - status = nfs_stat_to_errno(status); - p = xdr_decode_post_op_attr(p, res->fattr); - p = xdr_decode_wcc_data(p, res->dir_attr); - return status; +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_readdir3resok(xdr, result); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs_stat_to_errno(status); } /* - * Decode FSSTAT reply + * 3.3.18 FSSTAT3res + * + * struct FSSTAT3resok { + * post_op_attr obj_attributes; + * size3 tbytes; + * size3 fbytes; + * size3 abytes; + * size3 tfiles; + * size3 ffiles; + * size3 afiles; + * uint32 invarsec; + * }; + * + * struct FSSTAT3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSSTAT3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSSTAT3resok resok; + * default: + * FSSTAT3resfail resfail; + * }; */ -static int -nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) +static int decode_fsstat3resok(struct xdr_stream *xdr, + struct nfs_fsstat *result) { - int status; - - status = ntohl(*p++); - - p = xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return nfs_stat_to_errno(status); - - p = xdr_decode_hyper(p, &res->tbytes); - p = xdr_decode_hyper(p, &res->fbytes); - p = xdr_decode_hyper(p, &res->abytes); - p = xdr_decode_hyper(p, &res->tfiles); - p = xdr_decode_hyper(p, &res->ffiles); - p = xdr_decode_hyper(p, &res->afiles); + __be32 *p; + p = xdr_inline_decode(xdr, 8 * 6 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + p = xdr_decode_size3(p, &result->tbytes); + p = 
xdr_decode_size3(p, &result->fbytes); + p = xdr_decode_size3(p, &result->abytes); + p = xdr_decode_size3(p, &result->tfiles); + p = xdr_decode_size3(p, &result->ffiles); + xdr_decode_size3(p, &result->afiles); /* ignore invarsec */ return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsstat3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode FSINFO reply + * 3.3.19 FSINFO3res + * + * struct FSINFO3resok { + * post_op_attr obj_attributes; + * uint32 rtmax; + * uint32 rtpref; + * uint32 rtmult; + * uint32 wtmax; + * uint32 wtpref; + * uint32 wtmult; + * uint32 dtpref; + * size3 maxfilesize; + * nfstime3 time_delta; + * uint32 properties; + * }; + * + * struct FSINFO3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSINFO3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSINFO3resok resok; + * default: + * FSINFO3resfail resfail; + * }; */ -static int -nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) +static int decode_fsinfo3resok(struct xdr_stream *xdr, + struct nfs_fsinfo *result) { - int status; - - status = ntohl(*p++); - - p = xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return nfs_stat_to_errno(status); + __be32 *p; - res->rtmax = ntohl(*p++); - res->rtpref = ntohl(*p++); - res->rtmult = ntohl(*p++); - res->wtmax = ntohl(*p++); - res->wtpref = ntohl(*p++); - res->wtmult = ntohl(*p++); - res->dtpref = ntohl(*p++); - p = xdr_decode_hyper(p, &res->maxfilesize); - p = xdr_decode_time3(p, &res->time_delta); + p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 
4); + if (unlikely(p == NULL)) + goto out_overflow; + result->rtmax = be32_to_cpup(p++); + result->rtpref = be32_to_cpup(p++); + result->rtmult = be32_to_cpup(p++); + result->wtmax = be32_to_cpup(p++); + result->wtpref = be32_to_cpup(p++); + result->wtmult = be32_to_cpup(p++); + result->dtpref = be32_to_cpup(p++); + p = xdr_decode_size3(p, &result->maxfilesize); + xdr_decode_nfstime3(p, &result->time_delta); /* ignore properties */ - res->lease_time = 0; + result->lease_time = 0; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsinfo3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode PATHCONF reply + * 3.3.20 PATHCONF3res + * + * struct PATHCONF3resok { + * post_op_attr obj_attributes; + * uint32 linkmax; + * uint32 name_max; + * bool no_trunc; + * bool chown_restricted; + * bool case_insensitive; + * bool case_preserving; + * }; + * + * struct PATHCONF3resfail { + * post_op_attr obj_attributes; + * }; + * + * union PATHCONF3res switch (nfsstat3 status) { + * case NFS3_OK: + * PATHCONF3resok resok; + * default: + * PATHCONF3resfail resfail; + * }; */ -static int -nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) +static int decode_pathconf3resok(struct xdr_stream *xdr, + struct nfs_pathconf *result) { - int status; - - status = ntohl(*p++); - - p = xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return nfs_stat_to_errno(status); - res->max_link = ntohl(*p++); - res->max_namelen = ntohl(*p++); + __be32 *p; + p = xdr_inline_decode(xdr, 4 * 6); + if (unlikely(p == 
NULL)) + goto out_overflow; + result->max_link = be32_to_cpup(p++); + result->max_namelen = be32_to_cpup(p); /* ignore remaining fields */ return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_pathconf3resok(xdr, result); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } /* - * Decode COMMIT reply + * 3.3.21 COMMIT3res + * + * struct COMMIT3resok { + * wcc_data file_wcc; + * writeverf3 verf; + * }; + * + * struct COMMIT3resfail { + * wcc_data file_wcc; + * }; + * + * union COMMIT3res switch (nfsstat3 status) { + * case NFS3_OK: + * COMMIT3resok resok; + * default: + * COMMIT3resfail resfail; + * }; */ -static int -nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_writeres *result) { - int status; - - status = ntohl(*p++); - p = xdr_decode_wcc_data(p, res->fattr); - if (status != 0) - return nfs_stat_to_errno(status); - - res->verf->verifier[0] = *p++; - res->verf->verifier[1] = *p++; - return 0; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_writeverf3(xdr, result->verf->verifier); +out: + return error; +out_status: + return nfs_stat_to_errno(status); } #ifdef CONFIG_NFS_V3_ACL -/* - * Decode GETACL reply - */ -static int -nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, - struct nfs3_getaclres *res) + 
+static inline int decode_getacl3resok(struct xdr_stream *xdr, + struct nfs3_getaclres *result) { - struct xdr_buf *buf = &req->rq_rcv_buf; - int status = ntohl(*p++); struct posix_acl **acl; unsigned int *aclcnt; - int err, base; - - if (status != 0) - return nfs_stat_to_errno(status); - p = xdr_decode_post_op_attr(p, res->fattr); - res->mask = ntohl(*p++); - if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) - return -EINVAL; - base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; - - acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; - aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; - err = nfsacl_decode(buf, base, aclcnt, acl); - - acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; - aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; - if (err > 0) - err = nfsacl_decode(buf, base + err, aclcnt, acl); - return (err > 0) ? 0 : err; + size_t hdrlen; + int error; + + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_uint32(xdr, &result->mask); + if (unlikely(error)) + goto out; + error = -EINVAL; + if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + goto out; + + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + + acl = NULL; + if (result->mask & NFS_ACL) + acl = &result->acl_access; + aclcnt = NULL; + if (result->mask & NFS_ACLCNT) + aclcnt = &result->acl_access_count; + error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl); + if (unlikely(error <= 0)) + goto out; + + acl = NULL; + if (result->mask & NFS_DFACL) + acl = &result->acl_default; + aclcnt = NULL; + if (result->mask & NFS_DFACLCNT) + aclcnt = &result->acl_default_count; + error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl); + if (unlikely(error <= 0)) + return error; + error = 0; +out: + return error; } -/* - * Decode setacl reply. 
- */ -static int -nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_getaclres *result) { - int status = ntohl(*p++); + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_getacl3resok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} - if (status) - return nfs_stat_to_errno(status); - xdr_decode_post_op_attr(p, fattr); - return 0; +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_post_op_attr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } + #endif /* CONFIG_NFS_V3_ACL */ #define PROC(proc, argtype, restype, timer) \ [NFS3PROC_##proc] = { \ .p_proc = NFS3PROC_##proc, \ - .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ - .p_arglen = NFS3_##argtype##_sz, \ - .p_replen = NFS3_##restype##_sz, \ + .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \ + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \ + .p_arglen = NFS3_##argtype##args_sz, \ + .p_replen = NFS3_##restype##res_sz, \ .p_timer = timer, \ .p_statidx = NFS3PROC_##proc, \ .p_name = #proc, \ } struct rpc_procinfo nfs3_procedures[] = { - PROC(GETATTR, fhandle, attrstat, 1), - PROC(SETATTR, sattrargs, wccstat, 0), - PROC(LOOKUP, diropargs, lookupres, 2), - PROC(ACCESS, accessargs, accessres, 1), - PROC(READLINK, readlinkargs, readlinkres, 3), - PROC(READ, readargs, readres, 3), - PROC(WRITE, writeargs, writeres, 4), - PROC(CREATE, createargs, createres, 0), - PROC(MKDIR, mkdirargs, createres, 0), 
- PROC(SYMLINK, symlinkargs, createres, 0), - PROC(MKNOD, mknodargs, createres, 0), - PROC(REMOVE, removeargs, removeres, 0), - PROC(RMDIR, diropargs, wccstat, 0), - PROC(RENAME, renameargs, renameres, 0), - PROC(LINK, linkargs, linkres, 0), - PROC(READDIR, readdirargs, readdirres, 3), - PROC(READDIRPLUS, readdirargs, readdirres, 3), - PROC(FSSTAT, fhandle, fsstatres, 0), - PROC(FSINFO, fhandle, fsinfores, 0), - PROC(PATHCONF, fhandle, pathconfres, 0), - PROC(COMMIT, commitargs, commitres, 5), + PROC(GETATTR, getattr, getattr, 1), + PROC(SETATTR, setattr, setattr, 0), + PROC(LOOKUP, lookup, lookup, 2), + PROC(ACCESS, access, access, 1), + PROC(READLINK, readlink, readlink, 3), + PROC(READ, read, read, 3), + PROC(WRITE, write, write, 4), + PROC(CREATE, create, create, 0), + PROC(MKDIR, mkdir, create, 0), + PROC(SYMLINK, symlink, create, 0), + PROC(MKNOD, mknod, create, 0), + PROC(REMOVE, remove, remove, 0), + PROC(RMDIR, lookup, setattr, 0), + PROC(RENAME, rename, rename, 0), + PROC(LINK, link, link, 0), + PROC(READDIR, readdir, readdir, 3), + PROC(READDIRPLUS, readdirplus, readdir, 3), + PROC(FSSTAT, getattr, fsstat, 0), + PROC(FSINFO, getattr, fsinfo, 0), + PROC(PATHCONF, getattr, pathconf, 0), + PROC(COMMIT, commit, commit, 5), }; struct rpc_version nfs_version3 = { @@ -1185,8 +2468,8 @@ struct rpc_version nfs_version3 = { static struct rpc_procinfo nfs3_acl_procedures[] = { [ACLPROC3_GETACL] = { .p_proc = ACLPROC3_GETACL, - .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, - .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res, .p_arglen = ACL3_getaclargs_sz, .p_replen = ACL3_getaclres_sz, .p_timer = 1, @@ -1194,8 +2477,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = { }, [ACLPROC3_SETACL] = { .p_proc = ACLPROC3_SETACL, - .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, - .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, + .p_encode = 
(kxdreproc_t)nfs3_xdr_enc_setacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res, .p_arglen = ACL3_setaclargs_sz, .p_replen = ACL3_setaclres_sz, .p_timer = 0, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9fa496387fd..7a747407314 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -44,6 +44,7 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_REBOOT, NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, + NFS4CLNT_LAYOUTRECALL, NFS4CLNT_SESSION_RESET, NFS4CLNT_RECALL_SLOT, }; @@ -109,7 +110,7 @@ struct nfs_unique_id { struct nfs4_state_owner { struct nfs_unique_id so_owner_id; struct nfs_server *so_server; - struct rb_node so_client_node; + struct rb_node so_server_node; struct rpc_cred *so_cred; /* Associated cred */ @@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops { extern const struct dentry_operations nfs4_dentry_operations; extern const struct inode_operations nfs4_dir_inode_operations; -/* inode.c */ -extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t); -extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int); -extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); - - /* nfs4proc.c */ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); @@ -241,11 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); +extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); extern int nfs4_server_capabilities(struct nfs_server 
*server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); extern void nfs4_release_lockowner(const struct nfs4_lock_state *); +extern const struct xattr_handler *nfs4_xattr_handlers[]; #if defined(CONFIG_NFS_V4_1) static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) @@ -331,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid); extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); extern struct rpc_procinfo nfs4_procedures[]; struct nfs4_mount_data; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 2e92f0d8d65..23f930caf1e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, { struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; - struct nfs_server *nfss = NFS_SERVER(lo->inode); + struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); dprintk("--> %s\n", __func__); @@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, /* find and reference the deviceid */ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); if (dsaddr == NULL) { - dsaddr = get_device_info(lo->inode, id); + dsaddr = get_device_info(lo->plh_inode, id); if (dsaddr == NULL) goto out; } @@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, static void filelayout_free_lseg(struct pnfs_layout_segment *lseg) { - struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); + struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode); struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4435e5e1f90..9d992b0346e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -49,6 +49,7 @@ #include 
<linux/mount.h> #include <linux/module.h> #include <linux/sunrpc/bc_xprt.h> +#include <linux/xattr.h> #include "nfs4_fs.h" #include "delegation.h" @@ -355,9 +356,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) } /* - * Signal state manager thread if session is drained + * Signal state manager thread if session fore channel is drained */ -static void nfs41_check_drain_session_complete(struct nfs4_session *ses) +static void nfs4_check_drain_fc_complete(struct nfs4_session *ses) { struct rpc_task *task; @@ -371,8 +372,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses) if (ses->fc_slot_table.highest_used_slotid != -1) return; - dprintk("%s COMPLETE: Session Drained\n", __func__); - complete(&ses->complete); + dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); + complete(&ses->fc_slot_table.complete); +} + +/* + * Signal state manager thread if session back channel is drained + */ +void nfs4_check_drain_bc_complete(struct nfs4_session *ses) +{ + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || + ses->bc_slot_table.highest_used_slotid != -1) + return; + dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__); + complete(&ses->bc_slot_table.complete); } static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) @@ -389,7 +402,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slot); - nfs41_check_drain_session_complete(res->sr_session); + nfs4_check_drain_fc_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; } @@ -1826,6 +1839,8 @@ struct nfs4_closedata { struct nfs_closeres res; struct nfs_fattr fattr; unsigned long timestamp; + bool roc; + u32 roc_barrier; }; static void nfs4_free_closedata(void *data) @@ -1833,6 +1848,8 @@ static void nfs4_free_closedata(void *data) struct nfs4_closedata *calldata = data; struct nfs4_state_owner *sp = 
calldata->state->owner; + if (calldata->roc) + pnfs_roc_release(calldata->state->inode); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); @@ -1865,6 +1882,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) */ switch (task->tk_status) { case 0: + if (calldata->roc) + pnfs_roc_set_barrier(state->inode, + calldata->roc_barrier); nfs_set_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); nfs4_close_clear_stateid_flags(state, @@ -1917,8 +1937,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) return; } - if (calldata->arg.fmode == 0) + if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + if (calldata->roc && + pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { + rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, + task, NULL); + return; + } + } nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; @@ -1946,7 +1973,7 @@ static const struct rpc_call_ops nfs4_close_ops = { * * NOTE: Caller must be holding the sp->so_owner semaphore! 
*/ -int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) +int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; @@ -1981,11 +2008,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; + calldata->roc = roc; path_get(path); calldata->path = *path; - msg.rpc_argp = &calldata->arg, - msg.rpc_resp = &calldata->res, + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -1998,6 +2026,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i out_free_calldata: kfree(calldata); out: + if (roc) + pnfs_roc_release(state->inode); nfs4_put_open_state(state); nfs4_put_state_owner(sp); return status; @@ -2486,6 +2516,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, path = &ctx->path; fmode = ctx->mode; } + sattr->ia_mode &= ~current_umask(); state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); d_drop(dentry); if (IS_ERR(state)) { @@ -2816,6 +2847,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, { struct nfs4_exception exception = { }; int err; + + sattr->ia_mode &= ~current_umask(); do { err = nfs4_handle_exception(NFS_SERVER(dir), _nfs4_proc_mkdir(dir, dentry, sattr), @@ -2916,6 +2949,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, { struct nfs4_exception exception = { }; int err; + + sattr->ia_mode &= ~current_umask(); do { err = nfs4_handle_exception(NFS_SERVER(dir), _nfs4_proc_mknod(dir, dentry, sattr, rdev), @@ -3478,6 +3513,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, struct nfs4_setclientid setclientid = { 
.sc_verifier = &sc_verifier, .sc_prog = program, + .sc_cb_ident = clp->cl_cb_ident, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], @@ -3517,7 +3553,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, if (signalled()) break; if (loop++ & 1) - ssleep(clp->cl_lease_time + 1); + ssleep(clp->cl_lease_time / HZ + 1); else if (++clp->cl_id_uniquifier == 0) break; @@ -3663,8 +3699,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co data->rpc_status = 0; task_setup_data.callback_data = data; - msg.rpc_argp = &data->args, - msg.rpc_resp = &data->res, + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -3743,6 +3779,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock goto out; lsp = request->fl_u.nfs4_fl.owner; arg.lock_owner.id = lsp->ls_id.id; + arg.lock_owner.s_dev = server->s_dev; status = nfs4_call_sync(server, &msg, &arg, &res, 1); switch (status) { case 0: @@ -3908,8 +3945,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, return ERR_PTR(-ENOMEM); } - msg.rpc_argp = &data->arg, - msg.rpc_resp = &data->res, + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; task_setup_data.callback_data = data; return rpc_run_task(&task_setup_data); } @@ -3988,6 +4025,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id.id; + p->arg.lock_owner.s_dev = server->s_dev; p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; @@ -4145,8 +4183,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f data->arg.reclaim = NFS_LOCK_RECLAIM; task_setup_data.callback_ops = &nfs4_recover_lock_ops; } - msg.rpc_argp = &data->arg, - msg.rpc_resp = &data->res, + 
msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; task_setup_data.callback_data = data; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -4392,48 +4430,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) return; args->lock_owner.clientid = server->nfs_client->cl_clientid; args->lock_owner.id = lsp->ls_id.id; + args->lock_owner.s_dev = server->s_dev; msg.rpc_argp = args; rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); } #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" -int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, - size_t buflen, int flags) +static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) { - struct inode *inode = dentry->d_inode; - - if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) - return -EOPNOTSUPP; + if (strcmp(key, "") != 0) + return -EINVAL; - return nfs4_proc_set_acl(inode, buf, buflen); + return nfs4_proc_set_acl(dentry->d_inode, buf, buflen); } -/* The getxattr man page suggests returning -ENODATA for unknown attributes, - * and that's what we'll do for e.g. user attributes that haven't been set. - * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported - * attributes in kernel-managed attribute namespaces. 
*/ -ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf, - size_t buflen) +static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) { - struct inode *inode = dentry->d_inode; - - if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) - return -EOPNOTSUPP; + if (strcmp(key, "") != 0) + return -EINVAL; - return nfs4_proc_get_acl(inode, buf, buflen); + return nfs4_proc_get_acl(dentry->d_inode, buf, buflen); } -ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) +static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) { - size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; + size_t len = sizeof(XATTR_NAME_NFSV4_ACL); if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) return 0; - if (buf && buflen < len) - return -ERANGE; - if (buf) - memcpy(buf, XATTR_NAME_NFSV4_ACL, len); + + if (list && len <= list_len) + memcpy(list, XATTR_NAME_NFSV4_ACL, len); return len; } @@ -4486,6 +4519,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, #ifdef CONFIG_NFS_V4_1 /* + * Check the exchange flags returned by the server for invalid flags, having + * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or + * DS flags set. 
+ */ +static int nfs4_check_cl_exchange_flags(u32 flags) +{ + if (flags & ~EXCHGID4_FLAG_MASK_R) + goto out_inval; + if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && + (flags & EXCHGID4_FLAG_USE_NON_PNFS)) + goto out_inval; + if (!(flags & (EXCHGID4_FLAG_MASK_PNFS))) + goto out_inval; + return NFS_OK; +out_inval: + return -NFS4ERR_INVAL; +} + +/* * nfs4_proc_exchange_id() * * Since the clientid has expired, all compounds using sessions @@ -4498,7 +4550,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) nfs4_verifier verifier; struct nfs41_exchange_id_args args = { .client = clp, - .flags = clp->cl_exchange_flags, + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, }; struct nfs41_exchange_id_res res = { .client = clp, @@ -4515,9 +4567,6 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) dprintk("--> %s\n", __func__); BUG_ON(clp == NULL); - /* Remove server-only flags */ - args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R; - p = (u32 *)verifier.data; *p++ = htonl((u32)clp->cl_boot_time.tv_sec); *p = htonl((u32)clp->cl_boot_time.tv_nsec); @@ -4543,6 +4592,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) break; } + status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); dprintk("<-- %s status= %d\n", __func__, status); return status; } @@ -4776,17 +4826,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) if (!session) return NULL; - init_completion(&session->complete); - tbl = &session->fc_slot_table; tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + init_completion(&tbl->complete); tbl = &session->bc_slot_table; tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + init_completion(&tbl->complete); session->session_state = 1<<NFS4_SESSION_INITING; @@ -5280,13 +5330,23 @@ static void 
nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct inode *ino = lgp->args.inode; - struct nfs_server *server = NFS_SERVER(ino); + struct nfs_server *server = NFS_SERVER(lgp->args.inode); dprintk("--> %s\n", __func__); + /* Note the is a race here, where a CB_LAYOUTRECALL can come in + * right now covering the LAYOUTGET we are about to send. + * However, that is not so catastrophic, and there seems + * to be no way to prevent it completely. + */ if (nfs4_setup_sequence(server, &lgp->args.seq_args, &lgp->res.seq_res, 0, task)) return; + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + NFS_I(lgp->args.inode)->layout, + lgp->args.ctx->state)) { + rpc_exit(task, NFS4_OK); + return; + } rpc_call_start(task); } @@ -5313,7 +5373,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) return; } } - lgp->status = task->tk_status; dprintk("<-- %s\n", __func__); } @@ -5322,7 +5381,6 @@ static void nfs4_layoutget_release(void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - put_layout_hdr(lgp->args.inode); if (lgp->res.layout.buf != NULL) free_page((unsigned long) lgp->res.layout.buf); put_nfs_open_context(lgp->args.ctx); @@ -5367,13 +5425,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) if (IS_ERR(task)) return PTR_ERR(task); status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) - goto out; - status = lgp->status; - if (status != 0) - goto out; - status = pnfs_layout_process(lgp); -out: + if (status == 0) + status = task->tk_status; + if (status == 0) + status = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -5504,9 +5559,10 @@ static const struct inode_operations nfs4_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .getxattr = nfs4_getxattr, - .setxattr = nfs4_setxattr, - .listxattr = nfs4_listxattr, + 
.getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, }; const struct nfs_rpc_ops nfs_v4_clientops = { @@ -5551,6 +5607,18 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .open_context = nfs4_atomic_open, }; +static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { + .prefix = XATTR_NAME_NFSV4_ACL, + .list = nfs4_xattr_list_nfs4_acl, + .get = nfs4_xattr_get_nfs4_acl, + .set = nfs4_xattr_set_nfs4_acl, +}; + +const struct xattr_handler *nfs4_xattr_handlers[] = { + &nfs4_xattr_nfs4_acl_handler, + NULL +}; + /* * Local variables: * c-basic-offset: 8 diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 72b6c580af1..402143d75fc 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work) ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); - /* Are there any active superblocks? */ - if (list_empty(&clp->cl_superblocks)) + + rcu_read_lock(); + if (list_empty(&clp->cl_superblocks)) { + rcu_read_unlock(); goto out; + } + rcu_read_unlock(); + spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; last = clp->cl_last_renewal; @@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work) cred = ops->get_state_renewal_cred_locked(clp); spin_unlock(&clp->cl_lock); if (cred == NULL) { - if (list_empty(&clp->cl_delegations)) { + if (!nfs_delegations_present(clp)) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); goto out; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f575a312673..2336d532cf6 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -105,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp) put_rpccred(cred); } -struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +static struct rpc_cred * +nfs4_get_renew_cred_server_locked(struct nfs_server *server) { + struct rpc_cred *cred = NULL; struct nfs4_state_owner *sp; struct rb_node *pos; - 
struct rpc_cred *cred = NULL; - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); if (list_empty(&sp->so_states)) continue; cred = get_rpccred(sp->so_cred); @@ -121,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) return cred; } +/** + * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + * Caller must hold clp->cl_lock. + */ +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_renew_cred_server_locked(server); + if (cred != NULL) + break; + } + rcu_read_unlock(); + return cred; +} + #if defined(CONFIG_NFS_V4_1) static int nfs41_setup_state_renewal(struct nfs_client *clp) @@ -142,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) return status; } +/* + * Back channel returns NFS4ERR_DELAY for new requests when + * NFS4_SESSION_DRAINING is set so there is no work to be done when draining + * is ended. 
+ */ static void nfs4_end_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; @@ -165,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp) } } -static int nfs4_begin_drain_session(struct nfs_client *clp) +static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) { - struct nfs4_session *ses = clp->cl_session; - struct nfs4_slot_table *tbl = &ses->fc_slot_table; - spin_lock(&tbl->slot_tbl_lock); - set_bit(NFS4_SESSION_DRAINING, &ses->session_state); if (tbl->highest_used_slotid != -1) { - INIT_COMPLETION(ses->complete); + INIT_COMPLETION(tbl->complete); spin_unlock(&tbl->slot_tbl_lock); - return wait_for_completion_interruptible(&ses->complete); + return wait_for_completion_interruptible(&tbl->complete); } spin_unlock(&tbl->slot_tbl_lock); return 0; } +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int ret = 0; + + set_bit(NFS4_SESSION_DRAINING, &ses->session_state); + /* back channel */ + ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + if (ret) + return ret; + /* fore channel */ + return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); +} + int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { int status; @@ -192,6 +232,12 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) status = nfs4_proc_create_session(clp); if (status != 0) goto out; + status = nfs4_set_callback_sessionid(clp); + if (status != 0) { + printk(KERN_WARNING "Sessionid not set. 
No callback service\n"); + nfs_callback_down(1); + status = 0; + } nfs41_setup_state_renewal(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: @@ -210,28 +256,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) #endif /* CONFIG_NFS_V4_1 */ -struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +static struct rpc_cred * +nfs4_get_setclientid_cred_server(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; + struct rpc_cred *cred = NULL; struct nfs4_state_owner *sp; struct rb_node *pos; + + spin_lock(&clp->cl_lock); + pos = rb_first(&server->state_owners); + if (pos != NULL) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + cred = get_rpccred(sp->so_cred); + } + spin_unlock(&clp->cl_lock); + return cred; +} + +/** + * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + */ +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +{ + struct nfs_server *server; struct rpc_cred *cred; spin_lock(&clp->cl_lock); cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); if (cred != NULL) goto out; - pos = rb_first(&clp->cl_state_owners); - if (pos != NULL) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - cred = get_rpccred(sp->so_cred); + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_setclientid_cred_server(server); + if (cred != NULL) + break; } + rcu_read_unlock(); + out: - spin_unlock(&clp->cl_lock); return cred; } -static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, - __u64 minval, int maxbits) +static void nfs_alloc_unique_id_locked(struct rb_root *root, + struct nfs_unique_id *new, + __u64 minval, int maxbits) { struct rb_node **p, *parent; struct nfs_unique_id *pos; @@ -286,16 +360,15 @@ static void nfs_free_unique_id(struct rb_root *root, 
struct nfs_unique_id *id) } static struct nfs4_state_owner * -nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) +nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) { - struct nfs_client *clp = server->nfs_client; - struct rb_node **p = &clp->cl_state_owners.rb_node, + struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp, *res = NULL; while (*p != NULL) { parent = *p; - sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); if (server < sp->so_server) { p = &parent->rb_left; @@ -319,24 +392,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) } static struct nfs4_state_owner * -nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) +nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) { - struct rb_node **p = &clp->cl_state_owners.rb_node, + struct nfs_server *server = new->so_server; + struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; while (*p != NULL) { parent = *p; - sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); - if (new->so_server < sp->so_server) { - p = &parent->rb_left; - continue; - } - if (new->so_server > sp->so_server) { - p = &parent->rb_right; - continue; - } if (new->so_cred < sp->so_cred) p = &parent->rb_left; else if (new->so_cred > sp->so_cred) @@ -346,18 +412,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) return sp; } } - nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); - rb_link_node(&new->so_client_node, parent, p); - rb_insert_color(&new->so_client_node, &clp->cl_state_owners); + nfs_alloc_unique_id_locked(&server->openowner_id, + &new->so_owner_id, 1, 64); + rb_link_node(&new->so_server_node, parent, p); + 
rb_insert_color(&new->so_server_node, &server->state_owners); return new; } static void -nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) +nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) { - if (!RB_EMPTY_NODE(&sp->so_client_node)) - rb_erase(&sp->so_client_node, &clp->cl_state_owners); - nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); + struct nfs_server *server = sp->so_server; + + if (!RB_EMPTY_NODE(&sp->so_server_node)) + rb_erase(&sp->so_server_node, &server->state_owners); + nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id); } /* @@ -386,23 +455,32 @@ nfs4_alloc_state_owner(void) static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { - if (!RB_EMPTY_NODE(&sp->so_client_node)) { - struct nfs_client *clp = sp->so_server->nfs_client; + if (!RB_EMPTY_NODE(&sp->so_server_node)) { + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - rb_erase(&sp->so_client_node, &clp->cl_state_owners); - RB_CLEAR_NODE(&sp->so_client_node); + rb_erase(&sp->so_server_node, &server->state_owners); + RB_CLEAR_NODE(&sp->so_server_node); spin_unlock(&clp->cl_lock); } } -struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) +/** + * nfs4_get_state_owner - Look up a state owner given a credential + * @server: nfs_server to search + * @cred: RPC credential to match + * + * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. 
+ */ +struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, + struct rpc_cred *cred) { struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *new; spin_lock(&clp->cl_lock); - sp = nfs4_find_state_owner(server, cred); + sp = nfs4_find_state_owner_locked(server, cred); spin_unlock(&clp->cl_lock); if (sp != NULL) return sp; @@ -412,7 +490,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner(clp, new); + sp = nfs4_insert_state_owner_locked(new); spin_unlock(&clp->cl_lock); if (sp == new) get_rpccred(cred); @@ -423,6 +501,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct return sp; } +/** + * nfs4_put_state_owner - Release a nfs4_state_owner + * @sp: state owner data to release + * + */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { struct nfs_client *clp = sp->so_server->nfs_client; @@ -430,7 +513,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - nfs4_remove_state_owner(clp, sp); + nfs4_remove_state_owner_locked(sp); spin_unlock(&clp->cl_lock); rpc_destroy_wait_queue(&sp->so_sequence.wait); put_rpccred(cred); @@ -585,8 +668,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, if (!call_close) { nfs4_put_open_state(state); nfs4_put_state_owner(owner); - } else - nfs4_do_close(path, state, gfp_mask, wait); + } else { + bool roc = pnfs_roc(state->inode); + + nfs4_do_close(path, state, gfp_mask, wait, roc); + } } void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) @@ -633,7 +719,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *lsp; - 
struct nfs_client *clp = state->owner->so_server->nfs_client; + struct nfs_server *server = state->owner->so_server; + struct nfs_client *clp = server->nfs_client; lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) @@ -657,7 +744,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f return NULL; } spin_lock(&clp->cl_lock); - nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); + nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); spin_unlock(&clp->cl_lock); INIT_LIST_HEAD(&lsp->ls_locks); return lsp; @@ -665,10 +752,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) { - struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); + nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id); spin_unlock(&clp->cl_lock); rpc_destroy_wait_queue(&lsp->ls_sequence.wait); kfree(lsp); @@ -1114,15 +1202,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state) } } -static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +static void nfs4_reset_seqids(struct nfs_server *server, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) { + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; - /* Reset all sequence ids to zero */ - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); 
sp->so_seqid.flags = 0; spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { @@ -1131,6 +1223,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re } spin_unlock(&sp->so_lock); } + spin_unlock(&clp->cl_lock); +} + +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_reset_seqids(server, mark_reclaim); + rcu_read_unlock(); } static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) @@ -1148,25 +1252,41 @@ static void nfs4_reclaim_complete(struct nfs_client *clp, (void)ops->reclaim_complete(clp); } -static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +static void nfs4_clear_reclaim_server(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; - if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) - return 0; - - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { - if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) + if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, + &state->flags)) continue; nfs4_state_mark_reclaim_nograce(clp, state); } spin_unlock(&sp->so_lock); } + spin_unlock(&clp->cl_lock); +} + +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +{ + struct nfs_server *server; + + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return 0; + + rcu_read_lock(); + 
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_clear_reclaim_server(server); + rcu_read_unlock(); nfs_delegation_reap_unclaimed(clp); return 1; @@ -1238,27 +1358,40 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) { + struct nfs4_state_owner *sp; + struct nfs_server *server; struct rb_node *pos; int status = 0; restart: - spin_lock(&clp->cl_lock); - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) - continue; - atomic_inc(&sp->so_count); - spin_unlock(&clp->cl_lock); - status = nfs4_reclaim_open_state(sp, ops); - if (status < 0) { - set_bit(ops->owner_flag_bit, &sp->so_flags); + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, + struct nfs4_state_owner, so_server_node); + if (!test_and_clear_bit(ops->owner_flag_bit, + &sp->so_flags)) + continue; + atomic_inc(&sp->so_count); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + + status = nfs4_reclaim_open_state(sp, ops); + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); + return nfs4_recovery_handle_error(clp, status); + } + nfs4_put_state_owner(sp); - return nfs4_recovery_handle_error(clp, status); + goto restart; } - nfs4_put_state_owner(sp); - goto restart; + spin_unlock(&clp->cl_lock); } - spin_unlock(&clp->cl_lock); + rcu_read_unlock(); return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9f1826b012e..2ab8e5cb8f5 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -71,8 +71,8 @@ static int nfs4_stat_to_errno(int); /* lock,open owner id: * we currently 
use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define open_owner_id_maxsz (1 + 4) -#define lock_owner_id_maxsz (1 + 4) +#define open_owner_id_maxsz (1 + 1 + 4) +#define lock_owner_id_maxsz (1 + 1 + 4) #define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) @@ -1088,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo { __be32 *p; - p = reserve_space(xdr, 28); + p = reserve_space(xdr, 32); p = xdr_encode_hyper(p, lowner->clientid); - *p++ = cpu_to_be32(16); + *p++ = cpu_to_be32(20); p = xdr_encode_opaque_fixed(p, "lock id:", 8); + *p++ = cpu_to_be32(lowner->s_dev); xdr_encode_hyper(p, lowner->id); } @@ -1210,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena *p++ = cpu_to_be32(OP_OPEN); *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); - p = reserve_space(xdr, 28); + p = reserve_space(xdr, 32); p = xdr_encode_hyper(p, arg->clientid); - *p++ = cpu_to_be32(16); + *p++ = cpu_to_be32(20); p = xdr_encode_opaque_fixed(p, "open id:", 8); + *p++ = cpu_to_be32(arg->server->s_dev); xdr_encode_hyper(p, arg->id); } @@ -1510,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_restorefh_maxsz; } -static int +static void encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) { __be32 *p; @@ -1521,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); *p = cpu_to_be32(FATTR4_WORD0_ACL); - if (arg->acl_len % 4) - return -EINVAL; + BUG_ON(arg->acl_len % 4); p = reserve_space(xdr, 4); *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; - return 0; } 
static void @@ -1789,7 +1789,6 @@ encode_layoutget(struct xdr_stream *xdr, const struct nfs4_layoutget_args *args, struct compound_hdr *hdr) { - nfs4_stateid stateid; __be32 *p; p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); @@ -1800,9 +1799,7 @@ encode_layoutget(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->range.offset); p = xdr_encode_hyper(p, args->range.length); p = xdr_encode_hyper(p, args->minlength); - pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, - args->ctx->state); - p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); *p = cpu_to_be32(args->maxcount); dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", @@ -1833,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) /* * Encode an ACCESS request */ -static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args) +static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_accessargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_access(&xdr, args->access, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_access(xdr, args->access, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LOOKUP request */ -static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args) +static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_lookup_arg *args) { - struct xdr_stream xdr; struct 
compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_lookup(&xdr, args->name, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LOOKUP_ROOT request */ -static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args) +static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_lookup_root_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putrootfh(&xdr, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode REMOVE request */ -static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_removeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - 
encode_putfh(&xdr, args->fh, &hdr); - encode_remove(&xdr, &args->name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_remove(xdr, &args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode RENAME request */ -static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args) +static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->old_dir, &hdr); - encode_savefh(&xdr, &hdr); - encode_putfh(&xdr, args->new_dir, &hdr); - encode_rename(&xdr, args->old_name, args->new_name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->old_dir, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->new_dir, &hdr); + encode_rename(xdr, args->old_name, args->new_name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode LINK request */ -static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args) +static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_link_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, 
req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_savefh(&xdr, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_link(&xdr, args->name, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_link(xdr, args->name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode CREATE request */ -static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) +static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_savefh(&xdr, &hdr); - encode_create(&xdr, args, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_create(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode SYMLINK request */ -static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) +static void 
nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) { - return nfs4_xdr_enc_create(req, p, args); + nfs4_xdr_enc_create(req, xdr, args); } /* * Encode GETATTR request */ -static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args) +static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_getattr_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a CLOSE request */ -static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) +static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_closeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_close(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_close(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN request */ -static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct 
xdr_stream *xdr, + struct nfs_openargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_savefh(&xdr, &hdr); - encode_open(&xdr, args, &hdr); - encode_getfh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); - encode_restorefh(&xdr, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_open(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN_CONFIRM request */ -static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args) +static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_open_confirmargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_open_confirm(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_confirm(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN request with no attributes. 
*/ -static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_openargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_open(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode an OPEN_DOWNGRADE request */ -static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) +static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_closeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_open_downgrade(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_downgrade(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a LOCK request */ -static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args) +static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lock_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = 
nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_lock(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lock(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a LOCKT request */ -static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args) +static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lockt_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_lockt(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lockt(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a LOCKU request */ -static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args) +static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_locku_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_locku(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_locku(xdr, args, &hdr); encode_nops(&hdr); - return 0; } -static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, 
struct nfs_release_lockowner_args *args) +static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_release_lockowner_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_release_lockowner(&xdr, &args->lock_owner, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_release_lockowner(xdr, &args->lock_owner, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a READLINK request */ -static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args) +static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readlink *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_readlink(&xdr, args, req, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readlink(xdr, args, req, &hdr); xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->pglen); encode_nops(&hdr); - return 0; } /* * Encode a READDIR request */ -static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args) +static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readdir_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_readdir(&xdr, args, req, &hdr); + 
encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readdir(xdr, args, req, &hdr); xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->count); @@ -2227,428 +2193,387 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf __func__, hdr.replen << 2, args->pages, args->pgbase, args->count); encode_nops(&hdr); - return 0; } /* * Encode a READ request */ -static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_read(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_read(xdr, args, &hdr); xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->count); req->rq_rcv_buf.flags |= XDRBUF_READ; encode_nops(&hdr); - return 0; } /* * Encode an SETATTR request */ -static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) +static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setattrargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_setattr(&xdr, args, args->server, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + 
encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setattr(xdr, args, args->server, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode a GETACL request */ -static int -nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, - struct nfs_getaclargs *args) +static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_getaclargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; uint32_t replen; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; - encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); + encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len); encode_nops(&hdr); - return 0; } /* * Encode a WRITE request */ -static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_write(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_write(xdr, args, &hdr); req->rq_snd_buf.flags |= XDRBUF_WRITE; - encode_getfattr(&xdr, args->bitmask, &hdr); + 
encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * a COMMIT request */ -static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_commit(&xdr, args, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_commit(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * FSINFO request */ -static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args) +static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_fsinfo(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_fsinfo(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * a PATHCONF request */ -static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args) +static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_pathconf_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = 
nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], &hdr); encode_nops(&hdr); - return 0; } /* * a STATFS request */ -static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args) +static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_statfs_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fh, &hdr); - encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); encode_nops(&hdr); - return 0; } /* * GETATTR_BITMAP request */ -static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, - struct nfs4_server_caps_arg *args) +static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fhandle, &hdr); - encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| 
+ encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| FATTR4_WORD0_LINK_SUPPORT| FATTR4_WORD0_SYMLINK_SUPPORT| FATTR4_WORD0_ACLSUPPORT, &hdr); encode_nops(&hdr); - return 0; } /* * a RENEW request */ -static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) +static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_client *clp) { - struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_renew(&xdr, clp, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_renew(xdr, clp, &hdr); encode_nops(&hdr); - return 0; } /* * a SETCLIENTID request */ -static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc) +static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid *sc) { - struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_setclientid(&xdr, sc, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid(xdr, sc, &hdr); encode_nops(&hdr); - return 0; } /* * a SETCLIENTID_CONFIRM request */ -static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) +static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *arg) { - struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 0, }; const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_setclientid_confirm(&xdr, arg, &hdr); - encode_putrootfh(&xdr, &hdr); - encode_fsinfo(&xdr, lease_bitmap, &hdr); + 
encode_compound_hdr(xdr, req, &hdr); + encode_setclientid_confirm(xdr, arg, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); encode_nops(&hdr); - return 0; } /* * DELEGRETURN request */ -static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args) +static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_delegreturnargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->fhandle, &hdr); - encode_delegreturn(&xdr, args->stateid, &hdr); - encode_getfattr(&xdr, args->bitmask, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_delegreturn(xdr, args->stateid, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; } /* * Encode FS_LOCATIONS request */ -static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args) +static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; uint32_t replen; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, args->dir_fh, &hdr); - encode_lookup(&xdr, args->name, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); replen = hdr.replen; /* get the attribute into args->page */ - encode_fs_locations(&xdr, args->bitmask, &hdr); + 
encode_fs_locations(xdr, args->bitmask, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 0, PAGE_SIZE); encode_nops(&hdr); - return 0; } #if defined(CONFIG_NFS_V4_1) /* * EXCHANGE_ID request */ -static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, - struct nfs41_exchange_id_args *args) +static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = args->client->cl_mvops->minor_version, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_exchange_id(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_exchange_id(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * a CREATE_SESSION request */ -static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, - struct nfs41_create_session_args *args) +static void nfs4_xdr_enc_create_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_create_session_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = args->client->cl_mvops->minor_version, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_create_session(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_create_session(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * a DESTROY_SESSION request */ -static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p, - struct nfs4_session *session) +static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_session *session) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = session->clp->cl_mvops->minor_version, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_destroy_session(&xdr, session, &hdr); + encode_compound_hdr(xdr, req, &hdr); + 
encode_destroy_session(xdr, session, &hdr); encode_nops(&hdr); - return 0; } /* * a SEQUENCE request */ -static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p, - struct nfs4_sequence_args *args) +static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_sequence_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * a GET_LEASE_TIME request */ -static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, - struct nfs4_get_lease_time_args *args) +static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_get_lease_time_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), }; const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->la_seq_args, &hdr); - encode_putrootfh(&xdr, &hdr); - encode_fsinfo(&xdr, lease_bitmap, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->la_seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); encode_nops(&hdr); - return 0; } /* * a RECLAIM_COMPLETE request */ -static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, - struct nfs41_reclaim_complete_args *args) +static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args) }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, 
req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_reclaim_complete(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_reclaim_complete(xdr, args, &hdr); encode_nops(&hdr); - return 0; } /* * Encode GETDEVICEINFO request */ -static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, - struct nfs4_getdeviceinfo_args *args) +static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_getdeviceinfo(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(xdr, args, &hdr); /* set up reply kvec. Subtract notification bitmap max size (2) * so that notification bitmap is put in xdr_buf tail */ @@ -2657,27 +2582,24 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, args->pdev->pglen); encode_nops(&hdr); - return 0; } /* * Encode LAYOUTGET request */ -static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, - struct nfs4_layoutget_args *args) +static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutget_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - encode_putfh(&xdr, NFS_FH(args->inode), &hdr); - encode_layoutget(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(xdr, args, &hdr); 
encode_nops(&hdr); - return 0; } #endif /* CONFIG_NFS_V4_1 */ @@ -4475,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ goto out_overflow; eof = be32_to_cpup(p++); count = be32_to_cpup(p); - hdrlen = (u8 *) p - (u8 *) iov->iov_base; + hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { dprintk("NFS: server cheating in read reply: " @@ -5000,7 +4922,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, goto out_overflow; len = be32_to_cpup(p); if (len) { - int i; + uint32_t i; p = xdr_inline_decode(xdr, 4 * len); if (unlikely(!p)) @@ -5090,26 +5012,26 @@ out_overflow: /* * Decode OPEN_DOWNGRADE response */ -static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) +static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_closeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_open_downgrade(&xdr, res); + status = decode_open_downgrade(xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5118,26 +5040,25 @@ out: /* * Decode ACCESS response */ -static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) +static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_accessres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, 
&rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_access(&xdr, res); + status = decode_access(xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5146,26 +5067,28 @@ out: /* * Decode LOOKUP response */ -static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) +static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_lookup_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_lookup(&xdr)) != 0) + status = decode_lookup(xdr); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) != 0) + status = decode_getfh(xdr, res->fh); + if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server + status = decode_getfattr(xdr, res->fattr, res->server ,!RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5174,23 +5097,25 @@ out: /* * Decode LOOKUP_ROOT response */ -static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) +static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_lookup_res *res) { - struct 
xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putrootfh(&xdr)) != 0) + status = decode_putrootfh(xdr); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) == 0) - status = decode_getfattr(&xdr, res->fattr, res->server, + status = decode_getfh(xdr, res->fh); + if (status == 0) + status = decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5199,24 +5124,25 @@ out: /* * Decode REMOVE response */ -static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res) +static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_removeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_remove(&xdr, &res->cinfo)) != 0) + status = decode_remove(xdr, &res->cinfo); + if (status) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server, + decode_getfattr(xdr, res->dir_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5225,34 +5151,38 @@ out: /* * Decode RENAME response */ -static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res) +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_renameres *res) { - struct xdr_stream 
xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_savefh(&xdr)) != 0) + status = decode_savefh(xdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) + status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo); + if (status) goto out; /* Current FH is target directory */ - if (decode_getfattr(&xdr, res->new_fattr, res->server, + if (decode_getfattr(xdr, res->new_fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; - if ((status = decode_restorefh(&xdr)) != 0) + status = decode_restorefh(xdr); + if (status) goto out; - decode_getfattr(&xdr, res->old_fattr, res->server, + decode_getfattr(xdr, res->old_fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5261,37 +5191,41 @@ out: /* * Decode LINK response */ -static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res) +static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_link_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_savefh(&xdr)) != 0) + 
status = decode_savefh(xdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_link(&xdr, &res->cinfo)) != 0) + status = decode_link(xdr, &res->cinfo); + if (status) goto out; /* * Note order: OP_LINK leaves the directory as the current * filehandle. */ - if (decode_getfattr(&xdr, res->dir_attr, res->server, + if (decode_getfattr(xdr, res->dir_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; - if ((status = decode_restorefh(&xdr)) != 0) + status = decode_restorefh(xdr); + if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5300,33 +5234,37 @@ out: /* * Decode CREATE response */ -static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) +static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_savefh(&xdr)) != 0) + status = decode_savefh(xdr); + if (status) goto out; - if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) + status = decode_create(xdr, &res->dir_cinfo); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) != 0) + status = decode_getfh(xdr, res->fh); + if (status) goto out; - if (decode_getfattr(&xdr, res->fattr, res->server, + if (decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; - if ((status = 
decode_restorefh(&xdr)) != 0) + status = decode_restorefh(xdr); + if (status) goto out; - decode_getfattr(&xdr, res->dir_fattr, res->server, + decode_getfattr(xdr, res->dir_fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5335,31 +5273,31 @@ out: /* * Decode SYMLINK response */ -static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) +static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) { - return nfs4_xdr_dec_create(rqstp, p, res); + return nfs4_xdr_dec_create(rqstp, xdr, res); } /* * Decode GETATTR response */ -static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res) +static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_getattr_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server, + status = decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5368,46 +5306,40 @@ out: /* * Encode an SETACL request */ -static int -nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) +static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setaclargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); - encode_sequence(&xdr, &args->seq_args, &hdr); - 
encode_putfh(&xdr, args->fh, &hdr); - status = encode_setacl(&xdr, args, &hdr); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setacl(xdr, args, &hdr); encode_nops(&hdr); - return status; } /* * Decode SETACL response */ static int -nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct nfs_setaclres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_setattr(&xdr); + status = decode_setattr(xdr); out: return status; } @@ -5416,24 +5348,22 @@ out: * Decode GETACL response */ static int -nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct nfs_getaclres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(&xdr, rqstp, &res->acl_len); + status = decode_getacl(xdr, rqstp, &res->acl_len); out: return status; @@ -5442,23 +5372,22 @@ out: /* * Decode CLOSE response */ -static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) +static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct 
xdr_stream *xdr, + struct nfs_closeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_close(&xdr, res); + status = decode_close(xdr, res); if (status != 0) goto out; /* @@ -5467,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. */ - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5476,36 +5405,35 @@ out: /* * Decode OPEN response */ -static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) +static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_openres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_savefh(&xdr); + status = decode_savefh(xdr); if (status) goto out; - status = decode_open(&xdr, res); + status = decode_open(xdr, res); if (status) goto out; - if (decode_getfh(&xdr, &res->fh) != 0) + if (decode_getfh(xdr, &res->fh) != 0) goto out; - if (decode_getfattr(&xdr, res->f_attr, res->server, + if (decode_getfattr(xdr, res->f_attr, res->server, 
!RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; - if (decode_restorefh(&xdr) != 0) + if (decode_restorefh(xdr) != 0) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server, + decode_getfattr(xdr, res->dir_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5514,20 +5442,20 @@ out: /* * Decode OPEN_CONFIRM response */ -static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) +static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_open_confirmres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_open_confirm(&xdr, res); + status = decode_open_confirm(xdr, res); out: return status; } @@ -5535,26 +5463,26 @@ out: /* * Decode OPEN response */ -static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) +static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_openres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_open(&xdr, res); + status = decode_open(xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->f_attr, res->server, + decode_getfattr(xdr, res->f_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5563,26 +5491,26 @@ out: /* * Decode SETATTR response */ 
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) +static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_setattrres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_setattr(&xdr); + status = decode_setattr(xdr); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5591,23 +5519,22 @@ out: /* * Decode LOCK response */ -static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res) +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lock_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_lock(&xdr, res); + status = decode_lock(xdr, res); out: return status; } @@ -5615,23 +5542,22 @@ out: /* * Decode LOCKT response */ -static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res) +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lockt_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - 
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_lockt(&xdr, res); + status = decode_lockt(xdr, res); out: return status; } @@ -5639,61 +5565,58 @@ out: /* * Decode LOCKU response */ -static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res) +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_locku_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_locku(&xdr, res); + status = decode_locku(xdr, res); out: return status; } -static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *dummy) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_release_lockowner(&xdr); + status = decode_release_lockowner(xdr); return status; } /* * Decode READLINK response */ -static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, +static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_readlink_res *res) { - struct 
xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_readlink(&xdr, rqstp); + status = decode_readlink(xdr, rqstp); out: return status; } @@ -5701,23 +5624,22 @@ out: /* * Decode READDIR response */ -static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res) +static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_readdir_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_readdir(&xdr, rqstp, res); + status = decode_readdir(xdr, rqstp, res); out: return status; } @@ -5725,23 +5647,22 @@ out: /* * Decode Read response */ -static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res) +static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_readres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = 
decode_putfh(xdr); if (status) goto out; - status = decode_read(&xdr, rqstp, res); + status = decode_read(xdr, rqstp, res); if (!status) status = res->count; out: @@ -5751,26 +5672,25 @@ out: /* * Decode WRITE response */ -static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) +static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_writeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_write(&xdr, res); + status = decode_write(xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; @@ -5781,26 +5701,25 @@ out: /* * Decode COMMIT response */ -static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) +static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_writeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_commit(&xdr, res); + status = decode_commit(xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, 
res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5809,85 +5728,80 @@ out: /* * Decode FSINFO response */ -static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_fsinfo_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (!status) - status = decode_fsinfo(&xdr, res->fsinfo); + status = decode_fsinfo(xdr, res->fsinfo); return status; } /* * Decode PATHCONF response */ -static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_pathconf_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (!status) - status = decode_pathconf(&xdr, res->pathconf); + status = decode_pathconf(xdr, res->pathconf); return status; } /* * Decode STATFS response */ -static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_statfs_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, 
req); + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (!status) - status = decode_statfs(&xdr, res->fsstat); + status = decode_statfs(xdr, res->fsstat); return status; } /* * Decode GETATTR_BITMAP response */ -static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) +static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - status = decode_server_caps(&xdr, res); + status = decode_server_caps(xdr, res); out: return status; } @@ -5895,79 +5809,77 @@ out: /* * Decode RENEW response */ -static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *__unused) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_renew(&xdr); + status = decode_renew(xdr); return status; } /* * Decode SETCLIENTID response */ -static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, - struct nfs4_setclientid_res *res) +static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, 
&hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_setclientid(&xdr, res); + status = decode_setclientid(xdr, res); return status; } /* * Decode SETCLIENTID_CONFIRM response */ -static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) +static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *fsinfo) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_setclientid_confirm(&xdr); + status = decode_setclientid_confirm(xdr); if (!status) - status = decode_putrootfh(&xdr); + status = decode_putrootfh(xdr); if (!status) - status = decode_fsinfo(&xdr, fsinfo); + status = decode_fsinfo(xdr, fsinfo); return status; } /* * Decode DELEGRETURN response */ -static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_delegreturnres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_delegreturn(&xdr); + status = decode_delegreturn(xdr); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server, + decode_getfattr(xdr, res->fattr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5976,26 +5888,27 @@ out: /* * Decode FS_LOCATIONS response */ -static int nfs4_xdr_dec_fs_locations(struct rpc_rqst 
*req, __be32 *p, +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, struct nfs4_fs_locations_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, req); + status = decode_sequence(xdr, &res->seq_res, req); if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_lookup(&xdr)) != 0) + status = decode_lookup(xdr); + if (status) goto out; - xdr_enter_page(&xdr, PAGE_SIZE); - status = decode_getfattr(&xdr, &res->fs_locations->fattr, + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr(xdr, &res->fs_locations->fattr, res->fs_locations->server, !RPC_IS_ASYNC(req->rq_task)); out: @@ -6006,129 +5919,122 @@ out: /* * Decode EXCHANGE_ID response */ -static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_exchange_id(&xdr, res); + status = decode_exchange_id(xdr, res); return status; } /* * Decode CREATE_SESSION response */ -static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs41_create_session_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_create_session(&xdr, res); + status = 
decode_create_session(xdr, res); return status; } /* * Decode DESTROY_SESSION response */ -static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, - void *dummy) +static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_destroy_session(&xdr, dummy); + status = decode_destroy_session(xdr, res); return status; } /* * Decode SEQUENCE response */ -static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_sequence_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, res, rqstp); + status = decode_sequence(xdr, res, rqstp); return status; } /* * Decode GET_LEASE_TIME response */ -static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_get_lease_time_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); + status = decode_sequence(xdr, &res->lr_seq_res, rqstp); if (!status) - status = decode_putrootfh(&xdr); + status = decode_putrootfh(xdr); if (!status) - status = decode_fsinfo(&xdr, res->lr_fsinfo); + status = decode_fsinfo(xdr, res->lr_fsinfo); return status; } /* * Decode RECLAIM_COMPLETE response */ -static int 
nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs41_reclaim_complete_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (!status) - status = decode_reclaim_complete(&xdr, (void *)NULL); + status = decode_reclaim_complete(xdr, (void *)NULL); return status; } /* * Decode GETDEVINFO response */ -static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status != 0) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - status = decode_getdeviceinfo(&xdr, res->pdev); + status = decode_getdeviceinfo(xdr, res->pdev); out: return status; } @@ -6136,31 +6042,44 @@ out: /* * Decode LAYOUTGET response */ -static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, struct nfs4_layoutget_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_sequence(&xdr, &res->seq_res, rqstp); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status 
= decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_layoutget(&xdr, rqstp, res); + status = decode_layoutget(xdr, rqstp, res); out: return status; } #endif /* CONFIG_NFS_V4_1 */ -__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, - struct nfs_server *server, int plus) +/** + * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + */ +int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) { uint32_t bitmap[2] = {0}; uint32_t len; @@ -6172,9 +6091,9 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(!p)) goto out_overflow; if (!ntohl(*p++)) - return ERR_PTR(-EAGAIN); + return -EAGAIN; entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); + return -EBADCOOKIE; } p = xdr_inline_decode(xdr, 12); @@ -6203,7 +6122,8 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (decode_attr_length(xdr, &len, &p) < 0) goto out_overflow; - if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, + entry->server, 1) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) entry->ino = entry->fattr->fileid; @@ -6215,17 +6135,11 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (verify_attr_len(xdr, p, len) < 0) goto out_overflow; - p = xdr_inline_peek(xdr, 8); - if (p != NULL) - entry->eof = !p[0] && p[1]; - else - entry->eof = 0; - 
- return p; + return 0; out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* @@ -6301,8 +6215,8 @@ nfs4_stat_to_errno(int stat) #define PROC(proc, argtype, restype) \ [NFSPROC4_CLNT_##proc] = { \ .p_proc = NFSPROC4_COMPOUND, \ - .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ + .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \ .p_arglen = NFS4_##argtype##_sz, \ .p_replen = NFS4_##restype##_sz, \ .p_statidx = NFSPROC4_CLNT_##proc, \ @@ -6310,50 +6224,50 @@ nfs4_stat_to_errno(int stat) } struct rpc_procinfo nfs4_procedures[] = { - PROC(READ, enc_read, dec_read), - PROC(WRITE, enc_write, dec_write), - PROC(COMMIT, enc_commit, dec_commit), - PROC(OPEN, enc_open, dec_open), - PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), - PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), - PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), - PROC(CLOSE, enc_close, dec_close), - PROC(SETATTR, enc_setattr, dec_setattr), - PROC(FSINFO, enc_fsinfo, dec_fsinfo), - PROC(RENEW, enc_renew, dec_renew), - PROC(SETCLIENTID, enc_setclientid, dec_setclientid), - PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), - PROC(LOCK, enc_lock, dec_lock), - PROC(LOCKT, enc_lockt, dec_lockt), - PROC(LOCKU, enc_locku, dec_locku), - PROC(ACCESS, enc_access, dec_access), - PROC(GETATTR, enc_getattr, dec_getattr), - PROC(LOOKUP, enc_lookup, dec_lookup), - PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), - PROC(REMOVE, enc_remove, dec_remove), - PROC(RENAME, enc_rename, dec_rename), - PROC(LINK, enc_link, dec_link), - PROC(SYMLINK, enc_symlink, dec_symlink), - PROC(CREATE, enc_create, dec_create), - PROC(PATHCONF, enc_pathconf, dec_pathconf), - PROC(STATFS, enc_statfs, dec_statfs), - PROC(READLINK, enc_readlink, dec_readlink), - PROC(READDIR, enc_readdir, dec_readdir), - PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), 
- PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), - PROC(GETACL, enc_getacl, dec_getacl), - PROC(SETACL, enc_setacl, dec_setacl), - PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), - PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(READ, enc_read, dec_read), + PROC(WRITE, enc_write, dec_write), + PROC(COMMIT, enc_commit, dec_commit), + PROC(OPEN, enc_open, dec_open), + PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), + PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), + PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), + PROC(CLOSE, enc_close, dec_close), + PROC(SETATTR, enc_setattr, dec_setattr), + PROC(FSINFO, enc_fsinfo, dec_fsinfo), + PROC(RENEW, enc_renew, dec_renew), + PROC(SETCLIENTID, enc_setclientid, dec_setclientid), + PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), + PROC(LOCK, enc_lock, dec_lock), + PROC(LOCKT, enc_lockt, dec_lockt), + PROC(LOCKU, enc_locku, dec_locku), + PROC(ACCESS, enc_access, dec_access), + PROC(GETATTR, enc_getattr, dec_getattr), + PROC(LOOKUP, enc_lookup, dec_lookup), + PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), + PROC(REMOVE, enc_remove, dec_remove), + PROC(RENAME, enc_rename, dec_rename), + PROC(LINK, enc_link, dec_link), + PROC(SYMLINK, enc_symlink, dec_symlink), + PROC(CREATE, enc_create, dec_create), + PROC(PATHCONF, enc_pathconf, dec_pathconf), + PROC(STATFS, enc_statfs, dec_statfs), + PROC(READLINK, enc_readlink, dec_readlink), + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), + PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), #if defined(CONFIG_NFS_V4_1) - PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), - PROC(CREATE_SESSION, enc_create_session, 
dec_create_session), - PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), - PROC(SEQUENCE, enc_sequence, dec_sequence), - PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), - PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), - PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), - PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), + PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b68536cc904..e1164e3f9e6 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep; static inline struct nfs_page * nfs_page_alloc(void) { - struct nfs_page *p; - p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); - if (p) { - memset(p, 0, sizeof(*p)); + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + if (p) INIT_LIST_HEAD(&p->wb_list); - } return p; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index db773428f95..bc408976973 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -177,105 +177,149 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); * pNFS client layout cache */ +/* Need to hold i_lock if caller does not already hold reference */ +void +get_layout_hdr(struct pnfs_layout_hdr *lo) +{ + atomic_inc(&lo->plh_refcount); +} + static void -get_layout_hdr_locked(struct pnfs_layout_hdr *lo) +destroy_layout_hdr(struct pnfs_layout_hdr *lo) { - assert_spin_locked(&lo->inode->i_lock); - lo->refcount++; + dprintk("%s: freeing layout cache %p\n", __func__, lo); + 
BUG_ON(!list_empty(&lo->plh_layouts)); + NFS_I(lo->plh_inode)->layout = NULL; + kfree(lo); } static void put_layout_hdr_locked(struct pnfs_layout_hdr *lo) { - assert_spin_locked(&lo->inode->i_lock); - BUG_ON(lo->refcount == 0); - - lo->refcount--; - if (!lo->refcount) { - dprintk("%s: freeing layout cache %p\n", __func__, lo); - BUG_ON(!list_empty(&lo->layouts)); - NFS_I(lo->inode)->layout = NULL; - kfree(lo); - } + if (atomic_dec_and_test(&lo->plh_refcount)) + destroy_layout_hdr(lo); } void -put_layout_hdr(struct inode *inode) +put_layout_hdr(struct pnfs_layout_hdr *lo) { - spin_lock(&inode->i_lock); - put_layout_hdr_locked(NFS_I(inode)->layout); - spin_unlock(&inode->i_lock); + struct inode *inode = lo->plh_inode; + + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + destroy_layout_hdr(lo); + spin_unlock(&inode->i_lock); + } } static void init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { - INIT_LIST_HEAD(&lseg->fi_list); - kref_init(&lseg->kref); - lseg->layout = lo; + INIT_LIST_HEAD(&lseg->pls_list); + atomic_set(&lseg->pls_refcount, 1); + smp_mb(); + set_bit(NFS_LSEG_VALID, &lseg->pls_flags); + lseg->pls_layout = lo; } -/* Called without i_lock held, as the free_lseg call may sleep */ -static void -destroy_lseg(struct kref *kref) +static void free_lseg(struct pnfs_layout_segment *lseg) { - struct pnfs_layout_segment *lseg = - container_of(kref, struct pnfs_layout_segment, kref); - struct inode *ino = lseg->layout->inode; + struct inode *ino = lseg->pls_layout->plh_inode; - dprintk("--> %s\n", __func__); NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ - put_layout_hdr(ino); + /* Matched by get_layout_hdr in pnfs_insert_layout */ + put_layout_hdr(NFS_I(ino)->layout); } -static void -put_lseg(struct pnfs_layout_segment *lseg) +/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg + * could sleep, so must be called outside of the lock. 
+ * Returns 1 if object was removed, otherwise return 0. + */ +static int +put_lseg_locked(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + struct inode *ino = lseg->pls_layout->plh_inode; + + BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del(&lseg->pls_list); + if (list_empty(&lseg->pls_layout->plh_segs)) { + struct nfs_client *clp; + + clp = NFS_SERVER(ino)->nfs_client; + spin_lock(&clp->cl_lock); + /* List does not take a reference, so no need for put here */ + list_del_init(&lseg->pls_layout->plh_layouts); + spin_unlock(&clp->cl_lock); + clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); + } + rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); + list_add(&lseg->pls_list, tmp_list); + return 1; + } + return 0; +} + +static bool +should_free_lseg(u32 lseg_iomode, u32 recall_iomode) { - if (!lseg) - return; + return (recall_iomode == IOMODE_ANY || + lseg_iomode == recall_iomode); +} - dprintk("%s: lseg %p ref %d\n", __func__, lseg, - atomic_read(&lseg->kref.refcount)); - kref_put(&lseg->kref, destroy_lseg); +/* Returns 1 if lseg is removed from list, 0 otherwise */ +static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + int rv = 0; + + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + /* Remove the reference keeping the lseg in the + * list. It will now be removed when all + * outstanding io is finished. + */ + rv = put_lseg_locked(lseg, tmp_list); + } + return rv; } -static void -pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) +/* Returns count of number of matching invalid lsegs remaining in list + * after call. 
+ */ +int +mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + u32 iomode) { struct pnfs_layout_segment *lseg, *next; - struct nfs_client *clp; + int invalid = 0, removed = 0; dprintk("%s:Begin lo %p\n", __func__, lo); - assert_spin_locked(&lo->inode->i_lock); - list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { - dprintk("%s: freeing lseg %p\n", __func__, lseg); - list_move(&lseg->fi_list, tmp_list); - } - clp = NFS_SERVER(lo->inode)->nfs_client; - spin_lock(&clp->cl_lock); - /* List does not take a reference, so no need for put here */ - list_del_init(&lo->layouts); - spin_unlock(&clp->cl_lock); - write_seqlock(&lo->seqlock); - clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state); - write_sequnlock(&lo->seqlock); - - dprintk("%s:Return\n", __func__); + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (should_free_lseg(lseg->pls_range.iomode, iomode)) { + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, lseg->pls_range.offset, + lseg->pls_range.length); + invalid++; + removed += mark_lseg_invalid(lseg, tmp_list); + } + dprintk("%s:Return %i\n", __func__, invalid - removed); + return invalid - removed; } -static void -pnfs_free_lseg_list(struct list_head *tmp_list) +void +pnfs_free_lseg_list(struct list_head *free_me) { - struct pnfs_layout_segment *lseg; + struct pnfs_layout_segment *lseg, *tmp; - while (!list_empty(tmp_list)) { - lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, - fi_list); - dprintk("%s calling put_lseg on %p\n", __func__, lseg); - list_del(&lseg->fi_list); - put_lseg(lseg); + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { + list_del(&lseg->pls_list); + free_lseg(lseg); } } @@ -288,7 +332,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { - pnfs_clear_lseg_list(lo, &tmp_list); + set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); + 
mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); /* Matched by refcount set to 1 in alloc_init_layout_hdr */ put_layout_hdr_locked(lo); } @@ -312,76 +357,80 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) while (!list_empty(&tmp_list)) { lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, - layouts); + plh_layouts); dprintk("%s freeing layout for inode %lu\n", __func__, - lo->inode->i_ino); - pnfs_destroy_layout(NFS_I(lo->inode)); + lo->plh_inode->i_ino); + pnfs_destroy_layout(NFS_I(lo->plh_inode)); } } -/* update lo->stateid with new if is more recent - * - * lo->stateid could be the open stateid, in which case we just use what given. - */ -static void -pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new) -{ - nfs4_stateid *old = &lo->stateid; - bool overwrite = false; - - write_seqlock(&lo->seqlock); - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || - memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) - overwrite = true; - else { - u32 oldseq, newseq; - - oldseq = be32_to_cpu(old->stateid.seqid); - newseq = be32_to_cpu(new->stateid.seqid); - if ((int)(newseq - oldseq) > 0) - overwrite = true; +/* update lo->plh_stateid with new if is more recent */ +void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + bool update_barrier) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); + newseq = be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) { + memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); + if (update_barrier) { + u32 new_barrier = be32_to_cpu(new->stateid.seqid); + + if ((int)(new_barrier - lo->plh_barrier)) + lo->plh_barrier = new_barrier; + } else { + /* Because of wraparound, we want to keep the barrier + * "close" to the current seqids. It needs to be + * within 2**31 to count as "behind", so if it + * gets too near that limit, give us a litle leeway + * and bring it to within 2**30. 
+ * NOTE - and yes, this is all unsigned arithmetic. + */ + if (unlikely((newseq - lo->plh_barrier) > (3 << 29))) + lo->plh_barrier = newseq - (1 << 30); + } } - if (overwrite) - memcpy(&old->stateid, &new->stateid, sizeof(new->stateid)); - write_sequnlock(&lo->seqlock); } -static void -pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, - struct nfs4_state *state) +/* lget is set to 1 if called from inside send_layoutget call chain */ +static bool +pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, + int lget) { - int seq; - - dprintk("--> %s\n", __func__); - write_seqlock(&lo->seqlock); - do { - seq = read_seqbegin(&state->seqlock); - memcpy(lo->stateid.data, state->stateid.data, - sizeof(state->stateid.data)); - } while (read_seqretry(&state->seqlock, seq)); - set_bit(NFS_LAYOUT_STATEID_SET, &lo->state); - write_sequnlock(&lo->seqlock); - dprintk("<-- %s\n", __func__); + if ((stateid) && + (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) + return true; + return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + (list_empty(&lo->plh_segs) && + (atomic_read(&lo->plh_outstanding) > lget)); } -void -pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct nfs4_state *open_state) +int +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) { - int seq; + int status = 0; dprintk("--> %s\n", __func__); - do { - seq = read_seqbegin(&lo->seqlock); - if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { - /* This will trigger retry of the read */ - pnfs_layout_from_open_stateid(lo, open_state); - } else - memcpy(dst->data, lo->stateid.data, - sizeof(lo->stateid.data)); - } while (read_seqretry(&lo->seqlock, seq)); + spin_lock(&lo->plh_inode->i_lock); + if (pnfs_layoutgets_blocked(lo, NULL, 1)) { + status = -EAGAIN; + } else if (list_empty(&lo->plh_segs)) { + int seq; + + do { + seq = read_seqbegin(&open_state->seqlock); + 
memcpy(dst->data, open_state->stateid.data, + sizeof(open_state->stateid.data)); + } while (read_seqretry(&open_state->seqlock, seq)); + } else + memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); + spin_unlock(&lo->plh_inode->i_lock); dprintk("<-- %s\n", __func__); + return status; } /* @@ -395,7 +444,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, u32 iomode) { - struct inode *ino = lo->inode; + struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg = NULL; @@ -404,10 +453,8 @@ send_layoutget(struct pnfs_layout_hdr *lo, BUG_ON(ctx == NULL); lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); - if (lgp == NULL) { - put_layout_hdr(lo->inode); + if (lgp == NULL) return NULL; - } lgp->args.minlength = NFS4_MAX_UINT64; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range.iomode = iomode; @@ -424,11 +471,88 @@ send_layoutget(struct pnfs_layout_hdr *lo, nfs4_proc_layoutget(lgp); if (!lseg) { /* remember that LAYOUTGET failed and suspend trying */ - set_bit(lo_fail_bit(iomode), &lo->state); + set_bit(lo_fail_bit(iomode), &lo->plh_flags); } return lseg; } +bool pnfs_roc(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg, *tmp; + LIST_HEAD(tmp_list); + bool found = false; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + goto out_nolayout; + list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + mark_lseg_invalid(lseg, &tmp_list); + found = true; + } + if (!found) + goto out_nolayout; + lo->plh_block_lgets++; + get_layout_hdr(lo); /* matched in pnfs_roc_release */ + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + return true; + +out_nolayout: + spin_unlock(&ino->i_lock); + return false; +} + +void 
pnfs_roc_release(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + lo->plh_block_lgets--; + put_layout_hdr_locked(lo); + spin_unlock(&ino->i_lock); +} + +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if ((int)(barrier - lo->plh_barrier) > 0) + lo->plh_barrier = barrier; + spin_unlock(&ino->i_lock); +} + +bool pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_segment *lseg; + bool found = false; + + spin_lock(&ino->i_lock); + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + found = true; + break; + } + if (!found) { + struct pnfs_layout_hdr *lo = nfsi->layout; + u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); + + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); + } + spin_unlock(&ino->i_lock); + return found; +} + /* * Compare two layout segments for sorting into layout cache. 
* We want to preferentially return RW over RO layouts, so ensure those @@ -450,37 +574,29 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, dprintk("%s:Begin\n", __func__); - assert_spin_locked(&lo->inode->i_lock); - if (list_empty(&lo->segs)) { - struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; - - spin_lock(&clp->cl_lock); - BUG_ON(!list_empty(&lo->layouts)); - list_add_tail(&lo->layouts, &clp->cl_layouts); - spin_unlock(&clp->cl_lock); - } - list_for_each_entry(lp, &lo->segs, fi_list) { - if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0) + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lp, &lo->plh_segs, pls_list) { + if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) continue; - list_add_tail(&lseg->fi_list, &lp->fi_list); + list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " "iomode %d offset %llu length %llu before " "lp %p iomode %d offset %llu length %llu\n", - __func__, lseg, lseg->range.iomode, - lseg->range.offset, lseg->range.length, - lp, lp->range.iomode, lp->range.offset, - lp->range.length); + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length, + lp, lp->pls_range.iomode, lp->pls_range.offset, + lp->pls_range.length); found = 1; break; } if (!found) { - list_add_tail(&lseg->fi_list, &lo->segs); + list_add_tail(&lseg->pls_list, &lo->plh_segs); dprintk("%s: inserted lseg %p " "iomode %d offset %llu length %llu at tail\n", - __func__, lseg, lseg->range.iomode, - lseg->range.offset, lseg->range.length); + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length); } - get_layout_hdr_locked(lo); + get_layout_hdr(lo); dprintk("%s:Return\n", __func__); } @@ -493,11 +609,11 @@ alloc_init_layout_hdr(struct inode *ino) lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); if (!lo) return NULL; - lo->refcount = 1; - INIT_LIST_HEAD(&lo->layouts); - INIT_LIST_HEAD(&lo->segs); - seqlock_init(&lo->seqlock); - 
lo->inode = ino; + atomic_set(&lo->plh_refcount, 1); + INIT_LIST_HEAD(&lo->plh_layouts); + INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_bulk_recall); + lo->plh_inode = ino; return lo; } @@ -510,9 +626,12 @@ pnfs_find_alloc_layout(struct inode *ino) dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); assert_spin_locked(&ino->i_lock); - if (nfsi->layout) - return nfsi->layout; - + if (nfsi->layout) { + if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) + return NULL; + else + return nfsi->layout; + } spin_unlock(&ino->i_lock); new = alloc_init_layout_hdr(ino); spin_lock(&ino->i_lock); @@ -538,31 +657,32 @@ pnfs_find_alloc_layout(struct inode *ino) static int is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) { - return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); + return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); } /* * lookup range in layout */ static struct pnfs_layout_segment * -pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) +pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) { struct pnfs_layout_segment *lseg, *ret = NULL; dprintk("%s:Begin\n", __func__); - assert_spin_locked(&lo->inode->i_lock); - list_for_each_entry(lseg, &lo->segs, fi_list) { - if (is_matching_lseg(lseg, iomode)) { + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && + is_matching_lseg(lseg, iomode)) { ret = lseg; break; } - if (cmp_layout(iomode, lseg->range.iomode) > 0) + if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) break; } dprintk("%s:Return lseg %p ref %d\n", - __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); + __func__, ret, ret ? 
atomic_read(&ret->pls_refcount) : 0); return ret; } @@ -576,6 +696,7 @@ pnfs_update_layout(struct inode *ino, enum pnfs_iomode iomode) { struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; @@ -588,25 +709,53 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; } - /* Check to see if the layout for the given range already exists */ - lseg = pnfs_has_layout(lo, iomode); - if (lseg) { - dprintk("%s: Using cached lseg %p for iomode %d)\n", - __func__, lseg, iomode); + /* Do we even need to bother with this? */ + if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_find_lseg(lo, iomode); + if (lseg) + goto out_unlock; /* if LAYOUTGET already failed once we don't try again */ - if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) + goto out_unlock; + + if (pnfs_layoutgets_blocked(lo, NULL, 0)) goto out_unlock; + atomic_inc(&lo->plh_outstanding); - get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ + get_layout_hdr(lo); + if (list_empty(&lo->plh_segs)) { + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. 
+ */ + spin_lock(&clp->cl_lock); + BUG_ON(!list_empty(&lo->plh_layouts)); + list_add_tail(&lo->plh_layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } spin_unlock(&ino->i_lock); lseg = send_layoutget(lo, ctx, iomode); + if (!lseg) { + spin_lock(&ino->i_lock); + if (list_empty(&lo->plh_segs)) { + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + } + spin_unlock(&ino->i_lock); + } + atomic_dec(&lo->plh_outstanding); + put_layout_hdr(lo); out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, - nfsi->layout->state, lseg); + nfsi->layout->plh_flags, lseg); return lseg; out_unlock: spin_unlock(&ino->i_lock); @@ -619,9 +768,21 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; - struct inode *ino = lo->inode; + struct inode *ino = lo->plh_inode; + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; int status = 0; + /* Verify we got what we asked for. + * Note that because the xdr parsing only accepts a single + * element array, this can fail even if the server is behaving + * correctly. 
+ */ + if (lgp->args.range.iomode > res->range.iomode || + res->range.offset != 0 || + res->range.length != NFS4_MAX_UINT64) { + status = -EINVAL; + goto out; + } /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); if (!lseg || IS_ERR(lseg)) { @@ -635,16 +796,37 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } spin_lock(&ino->i_lock); + if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s forget reply due to recall\n", __func__); + goto out_forget_reply; + } + + if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { + dprintk("%s forget reply due to state\n", __func__); + goto out_forget_reply; + } init_lseg(lo, lseg); - lseg->range = res->range; + lseg->pls_range = res->range; *lgp->lsegpp = lseg; pnfs_insert_layout(lo, lseg); + if (res->return_on_close) { + set_bit(NFS_LSEG_ROC, &lseg->pls_flags); + set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); + } + /* Done processing layoutget. 
Set the layout stateid */ - pnfs_set_layout_stateid(lo, &res->stateid); + pnfs_set_layout_stateid(lo, &res->stateid, false); spin_unlock(&ino->i_lock); out: return status; + +out_forget_reply: + spin_unlock(&ino->i_lock); + lseg->pls_layout = lo; + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + goto out; } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e12367d5048..e2612ea0cbe 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,11 +30,17 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +enum { + NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ + NFS_LSEG_ROC, /* roc bit received from server */ +}; + struct pnfs_layout_segment { - struct list_head fi_list; - struct pnfs_layout_range range; - struct kref kref; - struct pnfs_layout_hdr *layout; + struct list_head pls_list; + struct pnfs_layout_range pls_range; + atomic_t pls_refcount; + unsigned long pls_flags; + struct pnfs_layout_hdr *pls_layout; }; #ifdef CONFIG_NFS_V4_1 @@ -44,7 +50,9 @@ struct pnfs_layout_segment { enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ - NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ + NFS_LAYOUT_ROC, /* some lseg had roc bit set */ + NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ }; /* Per-layout driver specific registration structure */ @@ -60,13 +68,16 @@ struct pnfs_layoutdriver_type { }; struct pnfs_layout_hdr { - unsigned long refcount; - struct list_head layouts; /* other client layouts */ - struct list_head segs; /* layout segments list */ - seqlock_t seqlock; /* Protects the stateid */ - nfs4_stateid stateid; - unsigned long state; - struct inode *inode; + atomic_t plh_refcount; + struct list_head plh_layouts; /* other client layouts */ + struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_segs; /* layout segments list */ + nfs4_stateid plh_stateid; + 
atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ + u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_flags; + struct inode *plh_inode; }; struct pnfs_device { @@ -134,17 +145,30 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); /* pnfs.c */ +void get_layout_hdr(struct pnfs_layout_hdr *lo); struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); int pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); -void put_layout_hdr(struct inode *inode); -void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct nfs4_state *open_state); +void put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + bool update_barrier); +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, + struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); +int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + u32 iomode); +bool pnfs_roc(struct inode *ino); +void pnfs_roc_release(struct inode *ino); +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +bool pnfs_roc_drain(struct inode *ino, u32 *barrier); static inline int lo_fail_bit(u32 iomode) @@ -176,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, return NULL; } +static inline bool +pnfs_roc(struct inode *ino) +{ + return false; +} + +static inline void +pnfs_roc_release(struct inode *ino) +{ +} + +static inline void +pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +} + +static inline bool 
+pnfs_roc_drain(struct inode *ino, u32 *barrier) +{ + return false; +} + static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) { } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 58e7f84fc1f..77d5e21c4ad 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -458,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, fattr = nfs_alloc_fattr(); status = -ENOMEM; if (fh == NULL || fattr == NULL) - goto out; + goto out_free; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -471,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, if (status == 0) status = nfs_instantiate(dentry, fh, fattr); +out_free: nfs_free_fattr(fattr); nfs_free_fhandle(fh); out: @@ -731,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .statfs = nfs_proc_statfs, .fsinfo = nfs_proc_fsinfo, .pathconf = nfs_proc_pathconf, - .decode_dirent = nfs_decode_dirent, + .decode_dirent = nfs2_decode_dirent, .read_setup = nfs_proc_read_setup, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4100630c9a5..b68c8607770 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -598,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->mountd_version || showdefaults) seq_printf(m, ",mountvers=%u", nfss->mountd_version); - if (nfss->mountd_port || showdefaults) + if ((nfss->mountd_port && + nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) || + showdefaults) seq_printf(m, ",mountport=%u", nfss->mountd_port); nfs_show_mountd_netid(m, nfss, showdefaults); @@ -2200,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data) s->s_flags = sb_mntdata->mntflags; s->s_fs_info = server; + s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; ret = set_anon_super(s, server); if (ret == 0) server->s_dev = s->s_dev; @@ -2494,7 +2497,13 @@ static void nfs4_clone_super(struct 
super_block *sb, sb->s_maxbytes = old_sb->s_maxbytes; sb->s_time_gran = 1; sb->s_op = old_sb->s_op; - nfs_initialise_sb(sb); + /* + * The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_xattr = old_sb->s_xattr; + nfs_initialise_sb(sb); } /* @@ -2504,6 +2513,12 @@ static void nfs4_fill_super(struct super_block *sb) { sb->s_time_gran = 1; sb->s_op = &nfs4_sops; + /* + * The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_xattr = nfs4_xattr_handlers; nfs_initialise_sb(sb); } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 7bdec853140..e313a51acdd 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -429,7 +429,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); - task_setup_data.callback_data = data, + task_setup_data.callback_data = data; data->cred = rpc_lookup_cred(); if (IS_ERR(data->cred)) { @@ -496,7 +496,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - atomic_read(&dentry->d_count)); + dentry->d_count); nfs_inc_stats(dir, NFSIOS_SILLYRENAME); /* diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h new file mode 100644 index 00000000000..34e5c40af5e --- /dev/null +++ b/fs/nfsd/acl.h @@ -0,0 +1,59 @@ +/* + * Common NFSv4 ACL handling definitions. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LINUX_NFS4_ACL_H +#define LINUX_NFS4_ACL_H + +#include <linux/posix_acl.h> + +/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to + * fit in a page: */ +#define NFS4_ACL_MAX 170 + +struct nfs4_acl *nfs4_acl_new(int); +int nfs4_acl_get_whotype(char *, u32); +int nfs4_acl_write_who(int who, char *p); +int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, + uid_t who, u32 mask); + +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 + +struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, + struct posix_acl *, unsigned int flags); +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, + struct posix_acl **, unsigned int flags); + +#endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c0fcb7ab7f6..8b31e5f8795 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1,4 +1,3 @@ -#define MSNFS /* HACK HACK */ /* * NFS exporting and validation. * @@ -1444,9 +1443,6 @@ static struct flags { { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, -#ifdef MSNFS - { NFSEXP_MSNFS, {"msnfs", ""}}, -#endif { 0, {"", ""}} }; diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h new file mode 100644 index 00000000000..2f3be132153 --- /dev/null +++ b/fs/nfsd/idmap.h @@ -0,0 +1,62 @@ +/* + * Mapping of UID to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. +> * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LINUX_NFSD_IDMAP_H +#define LINUX_NFSD_IDMAP_H + +#include <linux/in.h> +#include <linux/sunrpc/svc.h> + +/* XXX from linux/nfs_idmap.h */ +#define IDMAP_NAMESZ 128 + +#ifdef CONFIG_NFSD_V4 +int nfsd_idmap_init(void); +void nfsd_idmap_shutdown(void); +#else +static inline int nfsd_idmap_init(void) +{ + return 0; +} +static inline void nfsd_idmap_shutdown(void) +{ +} +#endif + +__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); +__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); +int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); +int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); + +#endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 5b7e3021e06..2247fc91d5e 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, __be32 nfserr; u32 max_blocksize = svc_max_payload(rqstp); - dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", + dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, - (unsigned long) argp->offset); + (unsigned long long) argp->offset); /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) @@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, __be32 nfserr; unsigned long cnt = argp->len; - dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", + dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), argp->len, - (unsigned long) argp->offset, + (unsigned long long) argp->offset, argp->stable? 
" stable" : ""); fh_copy(&resp->fh, &argp->fh); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index e4805261515..ad88f1c0a4c 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,7 +36,7 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> -#include <linux/nfs4_acl.h> +#include "acl.h" /* mode bit translations: */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 143da2eecd7..3be975e1891 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -50,11 +50,6 @@ enum { NFSPROC4_CLNT_CB_SEQUENCE, }; -enum nfs_cb_opnum4 { - OP_CB_RECALL = 4, - OP_CB_SEQUENCE = 11, -}; - #define NFS4_MAXTAGLEN 20 #define NFS4_enc_cb_null_sz 0 @@ -79,61 +74,6 @@ enum nfs_cb_opnum4 { cb_sequence_dec_sz + \ op_dec_sz) -/* -* Generic encode routines from fs/nfs/nfs4xdr.c -*/ -static inline __be32 * -xdr_writemem(__be32 *p, const void *ptr, int nbytes) -{ - int tmp = XDR_QUADLEN(nbytes); - if (!tmp) - return p; - p[tmp-1] = 0; - memcpy(p, ptr, nbytes); - return p + tmp; -} - -#define WRITE32(n) *p++ = htonl(n) -#define WRITEMEM(ptr,nbytes) do { \ - p = xdr_writemem(p, ptr, nbytes); \ -} while (0) -#define RESERVE_SPACE(nbytes) do { \ - p = xdr_reserve_space(xdr, nbytes); \ - if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \ - BUG_ON(!p); \ -} while (0) - -/* - * Generic decode routines from fs/nfs/nfs4xdr.c - */ -#define DECODE_TAIL \ - status = 0; \ -out: \ - return status; \ -xdr_error: \ - dprintk("NFSD: xdr error! 
(%s:%d)\n", __FILE__, __LINE__); \ - status = -EIO; \ - goto out - -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) -#define READTIME(x) do { \ - p++; \ - (x.tv_sec) = ntohl(*p++); \ - (x.tv_nsec) = ntohl(*p++); \ -} while (0) -#define READ_BUF(nbytes) do { \ - p = xdr_inline_decode(xdr, nbytes); \ - if (!p) { \ - dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \ - __func__, __LINE__); \ - return -EIO; \ - } \ -} while (0) - struct nfs4_cb_compound_hdr { /* args */ u32 ident; /* minorversion 0 only */ @@ -144,295 +84,513 @@ struct nfs4_cb_compound_hdr { int status; }; -static struct { -int stat; -int errno; -} nfs_cb_errtbl[] = { - { NFS4_OK, 0 }, - { NFS4ERR_PERM, EPERM }, - { NFS4ERR_NOENT, ENOENT }, - { NFS4ERR_IO, EIO }, - { NFS4ERR_NXIO, ENXIO }, - { NFS4ERR_ACCESS, EACCES }, - { NFS4ERR_EXIST, EEXIST }, - { NFS4ERR_XDEV, EXDEV }, - { NFS4ERR_NOTDIR, ENOTDIR }, - { NFS4ERR_ISDIR, EISDIR }, - { NFS4ERR_INVAL, EINVAL }, - { NFS4ERR_FBIG, EFBIG }, - { NFS4ERR_NOSPC, ENOSPC }, - { NFS4ERR_ROFS, EROFS }, - { NFS4ERR_MLINK, EMLINK }, - { NFS4ERR_NAMETOOLONG, ENAMETOOLONG }, - { NFS4ERR_NOTEMPTY, ENOTEMPTY }, - { NFS4ERR_DQUOT, EDQUOT }, - { NFS4ERR_STALE, ESTALE }, - { NFS4ERR_BADHANDLE, EBADHANDLE }, - { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, - { NFS4ERR_NOTSUPP, ENOTSUPP }, - { NFS4ERR_TOOSMALL, ETOOSMALL }, - { NFS4ERR_SERVERFAULT, ESERVERFAULT }, - { NFS4ERR_BADTYPE, EBADTYPE }, - { NFS4ERR_LOCKED, EAGAIN }, - { NFS4ERR_RESOURCE, EREMOTEIO }, - { NFS4ERR_SYMLINK, ELOOP }, - { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, - { NFS4ERR_DEADLOCK, EDEADLK }, - { -1, EIO } -}; +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. 
" + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} -static int -nfs_cb_stat_to_errno(int stat) +static __be32 *xdr_encode_empty_array(__be32 *p) { - int i; - for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { - if (nfs_cb_errtbl[i].stat == stat) - return nfs_cb_errtbl[i].errno; - } - /* If we cannot translate the error, the recovery routines should - * handle it. - * Note: remaining NFSv4 error codes have values > 10000, so should - * not conflict with native Linux error codes. - */ - return stat; + *p++ = xdr_zero; + return p; } /* - * XDR encode + * Encode/decode NFSv4 CB basic data types + * + * Basic NFSv4 callback data types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section + * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version + * 1 Protocol" */ -static void -encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +/* + * nfs_cb_opnum4 + * + * enum nfs_cb_opnum4 { + * OP_CB_GETATTR = 3, + * ... 
+ * }; + */ +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_ILLEGAL = 10044 +}; + +static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) { __be32 *p; - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(sid->si_generation); - WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(op); } -static void -encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) +/* + * nfs_fh4 + * + * typedef opaque nfs_fh4<NFS4_FHSIZE>; + */ +static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh) { - __be32 * p; + u32 length = fh->fh_size; + __be32 *p; - RESERVE_SPACE(16); - WRITE32(0); /* tag length is always 0 */ - WRITE32(hdr->minorversion); - WRITE32(hdr->ident); - hdr->nops_p = p; - WRITE32(hdr->nops); + BUG_ON(length > NFS4_FHSIZE); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, &fh->fh_base, length); } -static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) +/* + * stateid4 + * + * struct stateid4 { + * uint32_t seqid; + * opaque other[12]; + * }; + */ +static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) { - *hdr->nops_p = htonl(hdr->nops); + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(sid->si_generation); + xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE); } -static void -encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, - struct nfs4_cb_compound_hdr *hdr) +/* + * sessionid4 + * + * typedef opaque sessionid4[NFS4_SESSIONID_SIZE]; + */ +static void encode_sessionid4(struct xdr_stream *xdr, + const struct nfsd4_session *session) { __be32 *p; - int len = 
dp->dl_fh.fh_size; - - RESERVE_SPACE(4); - WRITE32(OP_CB_RECALL); - encode_stateid(xdr, &dp->dl_stateid); - RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); - WRITE32(0); /* truncate optimization not implemented */ - WRITE32(len); - WRITEMEM(&dp->dl_fh.fh_base, len); - hdr->nops++; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); } -static void -encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, - struct nfs4_cb_compound_hdr *hdr) -{ - __be32 *p; - struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; +/* + * nfsstat4 + */ +static const struct { + int stat; + int errno; +} nfs_cb_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -EIO }, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_RESOURCE, -EREMOTEIO }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; - if (hdr->minorversion == 0) - return; +/* + * If we cannot translate the error, the recovery routines should + * handle it. + * + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes. 
+ */ +static int nfs_cb_stat_to_errno(int status) +{ + int i; - RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); + for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { + if (nfs_cb_errtbl[i].stat == status) + return nfs_cb_errtbl[i].errno; + } - WRITE32(OP_CB_SEQUENCE); - WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(ses->se_cb_seq_nr); - WRITE32(0); /* slotid, always 0 */ - WRITE32(0); /* highest slotid always 0 */ - WRITE32(0); /* cachethis always 0 */ - WRITE32(0); /* FIXME: support referring_call_lists */ - hdr->nops++; + dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status); + return -status; } -static int -nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) +static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, + enum nfsstat4 *status) { - struct xdr_stream xdrs, *xdr = &xdrs; + __be32 *p; + u32 op; - xdr_init_encode(&xdrs, &req->rq_snd_buf, p); - RESERVE_SPACE(0); + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + op = be32_to_cpup(p++); + if (unlikely(op != expected)) + goto out_unexpected; + *status = be32_to_cpup(p); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_unexpected: + dprintk("NFSD: Callback server returned operation %d but " + "we issued a request for %d\n", op, expected); + return -EIO; } -static int -nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, - struct nfsd4_callback *cb) +/* + * CB_COMPOUND4args + * + * struct CB_COMPOUND4args { + * utf8str_cs tag; + * uint32_t minorversion; + * uint32_t callback_ident; + * nfs_cb_argop4 argarray<>; + * }; +*/ +static void encode_cb_compound4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) { - struct xdr_stream xdr; - struct nfs4_delegation *args = cb->cb_op; - struct nfs4_cb_compound_hdr hdr = { - .ident = cb->cb_clp->cl_cb_ident, - .minorversion = cb->cb_minorversion, - }; + __be32 * p; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - 
encode_cb_compound_hdr(&xdr, &hdr); - encode_cb_sequence(&xdr, cb, &hdr); - encode_cb_recall(&xdr, args, &hdr); - encode_cb_nops(&hdr); + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + p = xdr_encode_empty_array(p); /* empty tag */ + *p++ = cpu_to_be32(hdr->minorversion); + *p++ = cpu_to_be32(hdr->ident); + + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); /* argarray element count */ +} + +/* + * Update argarray element count + */ +static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) +{ + BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS); + *hdr->nops_p = cpu_to_be32(hdr->nops); +} + +/* + * CB_COMPOUND4res + * + * struct CB_COMPOUND4res { + * nfsstat4 status; + * utf8str_cs tag; + * nfs_cb_resop4 resarray<>; + * }; + */ +static int decode_cb_compound4res(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + /* Ignore the tag */ + length = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, length + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->nops = be32_to_cpup(p); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } +/* + * CB_RECALL4args + * + * struct CB_RECALL4args { + * stateid4 stateid; + * bool truncate; + * nfs_fh4 fh; + * }; + */ +static void encode_cb_recall4args(struct xdr_stream *xdr, + const struct nfs4_delegation *dp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); + encode_stateid4(xdr, &dp->dl_stateid); + + p = xdr_reserve_space(xdr, 4); + *p++ = xdr_zero; /* truncate */ -static int -decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ - __be32 *p; - u32 taglen; + encode_nfs_fh4(xdr, &dp->dl_fh); - READ_BUF(8); - READ32(hdr->status); - /* We've got no use for the tag; ignore it: */ - READ32(taglen); - READ_BUF(taglen + 4); - p += XDR_QUADLEN(taglen); - READ32(hdr->nops); - 
return 0; + hdr->nops++; } -static int -decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +/* + * CB_SEQUENCE4args + * + * struct CB_SEQUENCE4args { + * sessionid4 csa_sessionid; + * sequenceid4 csa_sequenceid; + * slotid4 csa_slotid; + * slotid4 csa_highest_slotid; + * bool csa_cachethis; + * referring_call_list4 csa_referring_call_lists<>; + * }; + */ +static void encode_cb_sequence4args(struct xdr_stream *xdr, + const struct nfsd4_callback *cb, + struct nfs4_cb_compound_hdr *hdr) { + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; __be32 *p; - u32 op; - int32_t nfserr; - - READ_BUF(8); - READ32(op); - if (op != expected) { - dprintk("NFSD: decode_cb_op_hdr: Callback server returned " - " operation %d but we issued a request for %d\n", - op, expected); - return -EIO; - } - READ32(nfserr); - if (nfserr != NFS_OK) - return -nfs_cb_stat_to_errno(nfserr); - return 0; + + if (hdr->minorversion == 0) + return; + + encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); + encode_sessionid4(xdr, session); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */ + *p++ = xdr_zero; /* csa_slotid */ + *p++ = xdr_zero; /* csa_highest_slotid */ + *p++ = xdr_zero; /* csa_cachethis */ + xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + hdr->nops++; } /* + * CB_SEQUENCE4resok + * + * struct CB_SEQUENCE4resok { + * sessionid4 csr_sessionid; + * sequenceid4 csr_sequenceid; + * slotid4 csr_slotid; + * slotid4 csr_highest_slotid; + * slotid4 csr_target_highest_slotid; + * }; + * + * union CB_SEQUENCE4res switch (nfsstat4 csr_status) { + * case NFS4_OK: + * CB_SEQUENCE4resok csr_resok4; + * default: + * void; + * }; + * * Our current back channel implmentation supports a single backchannel * with a single slot. 
*/ -static int -decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb, - struct rpc_rqst *rqstp) +static int decode_cb_sequence4resok(struct xdr_stream *xdr, + struct nfsd4_callback *cb) { - struct nfsd4_session *ses = cb->cb_clp->cl_cb_session; + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; struct nfs4_sessionid id; int status; - u32 dummy; __be32 *p; + u32 dummy; - if (cb->cb_minorversion == 0) - return 0; - - status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE); - if (status) - return status; + status = -ESERVERFAULT; /* * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. */ - status = -ESERVERFAULT; - - READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); - p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); - if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { - dprintk("%s Invalid session id\n", __func__); + if (memcmp(id.data, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) { + dprintk("NFS: %s Invalid session id\n", __func__); goto out; } - READ32(dummy); - if (dummy != ses->se_cb_seq_nr) { - dprintk("%s Invalid sequence number\n", __func__); + p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); + + dummy = be32_to_cpup(p++); + if (dummy != session->se_cb_seq_nr) { + dprintk("NFS: %s Invalid sequence number\n", __func__); goto out; } - READ32(dummy); /* slotid must be 0 */ + + dummy = be32_to_cpup(p++); if (dummy != 0) { - dprintk("%s Invalid slotid\n", __func__); + dprintk("NFS: %s Invalid slotid\n", __func__); goto out; } - /* FIXME: process highest slotid and target highest slotid */ + + /* + * FIXME: process highest slotid and target highest slotid + */ status = 0; out: return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } +static int decode_cb_sequence4res(struct xdr_stream *xdr, + struct 
nfsd4_callback *cb) +{ + enum nfsstat4 nfserr; + int status; + + if (cb->cb_minorversion == 0) + return 0; -static int -nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + goto out_default; + status = decode_cb_sequence4resok(xdr, cb); +out: + return status; +out_default: + return nfs_cb_stat_to_errno(status); +} + +/* + * NFSv4.0 and NFSv4.1 XDR encode functions + * + * NFSv4.0 callback argument types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +/* + * NB: Without this zero space reservation, callbacks over krb5p fail + */ +static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + xdr_reserve_space(xdr, 0); +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfs4_delegation *args = cb->cb_op; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recall4args(xdr, args, &hdr); + encode_cb_nops(&hdr); +} + + +/* + * NFSv4.0 and NFSv4.1 XDR decode functions + * + * NFSv4.0 callback result types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) { return 0; } -static int -nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, - struct nfsd4_callback *cb) +/* + * 20.2. 
Operation 4: CB_RECALL - Recall a Delegation + */ +static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) { - struct xdr_stream xdr; struct nfs4_cb_compound_hdr hdr; + enum nfsstat4 nfserr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_cb_compound_hdr(&xdr, &hdr); - if (status) + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) goto out; - if (cb) { - status = decode_cb_sequence(&xdr, cb, rqstp); - if (status) + + if (cb != NULL) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status)) goto out; } - status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); + + status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + goto out_default; out: return status; +out_default: + return nfs_cb_stat_to_errno(status); } /* * RPC procedure tables */ -#define PROC(proc, call, argtype, restype) \ -[NFSPROC4_CLNT_##proc] = { \ - .p_proc = NFSPROC4_CB_##call, \ - .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ - .p_arglen = NFS4_##argtype##_sz, \ - .p_replen = NFS4_##restype##_sz, \ - .p_statidx = NFSPROC4_CB_##call, \ - .p_name = #proc, \ -} - -static struct rpc_procinfo nfs4_cb_procedures[] = { - PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), - PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), +#define PROC(proc, call, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_CB_##call, \ + .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \ + .p_arglen = NFS4_enc_##argtype##_sz, \ + .p_replen = NFS4_dec_##restype##_sz, \ + .p_statidx = NFSPROC4_CB_##call, \ + .p_name = #proc, \ +} + +static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, cb_null, cb_null), + PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), }; -static struct rpc_version nfs_cb_version4 = { 
+static struct rpc_version nfs_cb_version4 = { /* * Note on the callback rpc program version number: despite language in rfc * 5661 section 18.36.3 requiring servers to use 4 in this field, the @@ -440,29 +598,29 @@ static struct rpc_version nfs_cb_version4 = { * in practice that appears to be what implementations use. The section * 18.36.3 language is expected to be fixed in an erratum. */ - .number = 1, - .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), - .procs = nfs4_cb_procedures + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures }; -static struct rpc_version * nfs_cb_version[] = { +static struct rpc_version *nfs_cb_version[] = { &nfs_cb_version4, }; static struct rpc_program cb_program; static struct rpc_stat cb_stats = { - .program = &cb_program + .program = &cb_program }; #define NFS4_CALLBACK 0x40000000 static struct rpc_program cb_program = { - .name = "nfs4_cb", - .number = NFS4_CALLBACK, - .nrvers = ARRAY_SIZE(nfs_cb_version), - .version = nfs_cb_version, - .stats = &cb_stats, - .pipe_dir_name = "/nfsd4_cb", + .name = "nfs4_cb", + .number = NFS4_CALLBACK, + .nrvers = ARRAY_SIZE(nfs_cb_version), + .version = nfs_cb_version, + .stats = &cb_stats, + .pipe_dir_name = "/nfsd4_cb", }; static int max_cb_time(void) @@ -470,10 +628,8 @@ static int max_cb_time(void) return max(nfsd4_lease/10, (time_t)1) * HZ; } -/* Reference counting, callback cleanup, etc., all look racy as heck. - * And why is cl_cb_set an atomic? 
*/ -int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { struct rpc_timeout timeparms = { .to_initval = max_cb_time(), @@ -483,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) .net = &init_net, .address = (struct sockaddr *) &conn->cb_addr, .addrsize = conn->cb_addrlen, + .saddress = (struct sockaddr *) &conn->cb_saddr, .timeout = &timeparms, .program = &cb_program, .version = 0, @@ -499,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) args.protocol = XPRT_TRANSPORT_TCP; clp->cl_cb_ident = conn->cb_ident; } else { + if (!conn->cb_xprt) + return -EINVAL; + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; + clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; args.protocol = XPRT_TRANSPORT_BC_TCP; @@ -521,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason) (int)clp->cl_name.len, clp->cl_name.data, reason); } +static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_DOWN; + warn_no_callback_path(clp, reason); +} + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) - warn_no_callback_path(clp, task->tk_status); + nfsd4_mark_cb_down(clp, task->tk_status); else - atomic_set(&clp->cl_cb_set, 1); + clp->cl_cb_state = NFSD4_CB_UP; } static const struct rpc_call_ops nfsd4_cb_probe_ops = { @@ -551,6 +718,11 @@ int set_callback_cred(void) static struct workqueue_struct *callback_wq; +static void run_nfsd4_cb(struct nfsd4_callback *cb) +{ + queue_work(callback_wq, &cb->cb_work); +} + static void do_probe_callback(struct nfs4_client *clp) { struct nfsd4_callback *cb = &clp->cl_cb_null; @@ -565,7 +737,7 @@ static void 
do_probe_callback(struct nfs4_client *clp) cb->cb_ops = &nfsd4_cb_probe_ops; - queue_work(callback_wq, &cb->cb_work); + run_nfsd4_cb(cb); } /* @@ -574,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp) */ void nfsd4_probe_callback(struct nfs4_client *clp) { + /* XXX: atomicity? Also, should we be using cl_cb_flags? */ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); do_probe_callback(clp); } -void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +void nfsd4_probe_callback_sync(struct nfs4_client *clp) { - BUG_ON(atomic_read(&clp->cl_cb_set)); + nfsd4_probe_callback(clp); + flush_workqueue(callback_wq); +} +void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; spin_lock(&clp->cl_lock); memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); spin_unlock(&clp->cl_lock); @@ -592,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) * If the slot is available, then mark it busy. Otherwise, set the * thread for sleeping on the callback RPC wait queue. 
*/ -static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, - struct rpc_task *task) +static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) { - u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data; - int status = 0; - - dprintk("%s: %u:%u:%u:%u\n", __func__, - ptr[0], ptr[1], ptr[2], ptr[3]); - if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); dprintk("%s slot is busy\n", __func__); - status = -EAGAIN; - goto out; + return false; } -out: - dprintk("%s status=%d\n", __func__, status); - return status; + return true; } /* @@ -622,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); struct nfs4_client *clp = dp->dl_client; u32 minorversion = clp->cl_minorversion; - int status = 0; cb->cb_minorversion = minorversion; if (minorversion) { - status = nfsd41_cb_setup_sequence(clp, task); - if (status) { - if (status != -EAGAIN) { - /* terminate rpc task */ - task->tk_status = status; - task->tk_action = NULL; - } + if (!nfsd41_cb_get_slot(clp, task)) return; - } } + spin_lock(&clp->cl_lock); + if (list_empty(&cb->cb_per_client)) { + /* This is the first call, not a restart */ + cb->cb_done = false; + list_add(&cb->cb_per_client, &clp->cl_callbacks); + } + spin_unlock(&clp->cl_lock); rpc_call_start(task); } @@ -671,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) nfsd4_cb_done(task, calldata); - if (current_rpc_client == NULL) { - /* We're shutting down; give up. */ - /* XXX: err, or is it ok just to fall through - * and rpc_restart_call? */ + if (current_rpc_client != task->tk_client) { + /* We're shutting down or changing cl_cb_client; leave + * it to nfsd4_process_cb_update to restart the call if + * necessary. 
*/ return; } + if (cb->cb_done) + return; switch (task->tk_status) { case 0: + cb->cb_done = true; return; case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: @@ -688,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) break; default: /* Network partition? */ - atomic_set(&clp->cl_cb_set, 0); - warn_no_callback_path(clp, task->tk_status); - if (current_rpc_client != task->tk_client) { - /* queue a callback on the new connection: */ - atomic_inc(&dp->dl_count); - nfsd4_cb_recall(dp); - return; - } + nfsd4_mark_cb_down(clp, task->tk_status); } if (dp->dl_retries--) { rpc_delay(task, 2*HZ); task->tk_status = 0; rpc_restart_call_prepare(task); return; - } else { - atomic_set(&clp->cl_cb_set, 0); - warn_no_callback_path(clp, task->tk_status); } + nfsd4_mark_cb_down(clp, task->tk_status); + cb->cb_done = true; } static void nfsd4_cb_recall_release(void *calldata) { struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - nfs4_put_delegation(dp); + if (cb->cb_done) { + spin_lock(&clp->cl_lock); + list_del(&cb->cb_per_client); + spin_unlock(&clp->cl_lock); + nfs4_put_delegation(dp); + } } static const struct rpc_call_ops nfsd4_cb_recall_ops = { @@ -748,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) flush_workqueue(callback_wq); } -void nfsd4_release_cb(struct nfsd4_callback *cb) +static void nfsd4_release_cb(struct nfsd4_callback *cb) { if (cb->cb_ops->rpc_release) cb->cb_ops->rpc_release(cb); } -void nfsd4_process_cb_update(struct nfsd4_callback *cb) +/* requires cl_lock: */ +static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) +{ + struct nfsd4_session *s; + struct nfsd4_conn *c; + + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_flags & NFS4_CDFC4_BACK) + return c; + } + } + return NULL; +} + +static void 
nfsd4_process_cb_update(struct nfsd4_callback *cb) { struct nfs4_cb_conn conn; struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = NULL; + struct nfsd4_conn *c; int err; /* @@ -768,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb) rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; } + if (clp->cl_cb_conn.cb_xprt) { + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + clp->cl_cb_conn.cb_xprt = NULL; + } if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) return; spin_lock(&clp->cl_lock); @@ -778,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb) BUG_ON(!clp->cl_cb_flags); clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); + c = __nfsd4_find_backchannel(clp); + if (c) { + svc_xprt_get(c->cn_xprt); + conn.cb_xprt = c->cn_xprt; + ses = c->cn_session; + } spin_unlock(&clp->cl_lock); - err = setup_callback_client(clp, &conn); - if (err) + err = setup_callback_client(clp, &conn, ses); + if (err) { warn_no_callback_path(clp, err); + return; + } + /* Yay, the callback channel's back! 
Restart any callbacks: */ + list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) + run_nfsd4_cb(cb); } void nfsd4_do_callback_rpc(struct work_struct *w) @@ -807,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w) void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; + struct nfs4_client *clp = dp->dl_client; dp->dl_retries = 1; cb->cb_op = dp; - cb->cb_clp = dp->dl_client; + cb->cb_clp = clp; cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; @@ -819,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp) cb->cb_ops = &nfsd4_cb_recall_ops; dp->dl_retries = 1; - queue_work(callback_wq, &dp->dl_recall.cb_work); + INIT_LIST_HEAD(&cb->cb_per_client); + cb->cb_done = true; + + run_nfsd4_cb(&dp->dl_recall); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index f0695e815f0..6d2c397d458 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -33,10 +33,11 @@ */ #include <linux/module.h> -#include <linux/nfsd_idmap.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/slab.h> +#include "idmap.h" +#include "nfsd.h" /* * Cache entry @@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp) return clp->name; } -static int +static __be32 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) { @@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen int ret; if (namelen + 1 > sizeof(key.name)) - return -EINVAL; + return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); if (ret == -ENOENT) - ret = -ESRCH; /* nfserr_badname */ + return nfserr_badowner; if (ret) - return ret; + return nfserrno(ret); *id = item->id; cache_put(&item->h, &nametoid_cache); return 0; @@ 
-560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) return ret; } -int +__be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, __u32 *id) { return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); } -int +__be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, __u32 *id) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0cdfd022bb7..db52546143d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } -static __be32 -nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - void *arg) +static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) { struct svc_fh tmp_fh; __be32 ret; @@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = exp_pseudoroot(rqstp, &tmp_fh); if (ret) return ret; - if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { + if (tmp_fh.fh_dentry == fh->fh_dentry) { fh_put(&tmp_fh); return nfserr_noent; } fh_put(&tmp_fh); - return nfsd_lookup(rqstp, &cstate->current_fh, - "..", 2, &cstate->current_fh); + return nfsd_lookup(rqstp, fh, "..", 2, fh); +} + +static __be32 +nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + void *arg) +{ + return nfsd4_do_lookupp(rqstp, &cstate->current_fh); } static __be32 @@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else secinfo->si_exp = exp; dput(dentry); + if (cstate->minorversion) + /* See rfc 5661 section 2.6.3.1.1.8 */ + fh_put(&cstate->current_fh); return err; } static __be32 +nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_secinfo_no_name *sin) +{ + __be32 err; + + switch (sin->sin_style) { + case NFS4_SECINFO_STYLE4_CURRENT_FH: + break; + case NFS4_SECINFO_STYLE4_PARENT: + 
err = nfsd4_do_lookupp(rqstp, &cstate->current_fh); + if (err) + return err; + break; + default: + return nfserr_inval; + } + exp_get(cstate->current_fh.fh_export); + sin->sin_exp = cstate->current_fh.fh_export; + fh_put(&cstate->current_fh); + return nfs_ok; +} + +static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) { @@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum); * Also note, enforced elsewhere: * - SEQUENCE other than as first op results in * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) - * - BIND_CONN_TO_SESSION must be the only op in its compound - * (Will be enforced in nfsd4_bind_conn_to_session().) + * - BIND_CONN_TO_SESSION must be the only op in its compound. + * (Enforced in nfsd4_bind_conn_to_session().) * - DESTROY_SESSION must be the final operation in a compound, if * sessionid's in SEQUENCE and DESTROY_SESSION are the same. * (Enforced in nfsd4_destroy_session().) @@ -1126,10 +1156,6 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } - if (!rqstp->rq_usedeferral && status == nfserr_dropit) { - dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__); - status = nfserr_jukebox; - } resp->cstate.status = status; fh_put(&resp->cstate.current_fh); @@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_EXCHANGE_ID", }, + [OP_BIND_CONN_TO_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_BIND_CONN_TO_SESSION", + }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, @@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_RECLAIM_COMPLETE", }, + [OP_SECINFO_NO_NAME] = { + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_name = "OP_SECINFO_NO_NAME", + }, 
}; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 7e26caab2a2..ffb59ef6f82 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child) { int status; - /* note: we currently use this path only for minorversion 0 */ if (nfs4_has_reclaimed_state(child->d_name.name, false)) return 0; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 116cab970e0..d98d0213285 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; - nfs4_file_get_access(fp, O_RDONLY); + dp->dl_vfs_file = find_readable_file(fp); + get_file(dp->dl_vfs_file); dp->dl_flock = NULL; dp->dl_type = type; dp->dl_stateid.si_boot = boot_time; @@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) if (atomic_dec_and_test(&dp->dl_count)) { dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); + fput(dp->dl_vfs_file); kmem_cache_free(deleg_slab, dp); num_delegations--; } @@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp) static void nfs4_close_delegation(struct nfs4_delegation *dp) { - struct file *filp = find_readable_file(dp->dl_file); - dprintk("NFSD: close_delegation dp %p\n",dp); + /* XXX: do we even need this check?: */ if (dp->dl_flock) - vfs_setlease(filp, F_UNLCK, &dp->dl_flock); - nfs4_file_put_access(dp->dl_file, O_RDONLY); + vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); } /* Called under the state lock. 
*/ @@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u) free_conn(c); } spin_unlock(&clp->cl_lock); + nfsd4_probe_callback(clp); } static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) @@ -679,15 +680,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn) return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } -static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) +static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) { struct nfsd4_conn *conn; - u32 flags = NFS4_CDFC4_FORE; int ret; - if (ses->se_flags & SESSION4_BACK_CHAN) - flags |= NFS4_CDFC4_BACK; - conn = alloc_conn(rqstp, flags); + conn = alloc_conn(rqstp, dir); if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); @@ -698,6 +696,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) return nfs_ok; } +static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses) +{ + u32 dir = NFS4_CDFC4_FORE; + + if (ses->se_flags & SESSION4_BACK_CHAN) + dir |= NFS4_CDFC4_BACK; + + return nfsd4_new_conn(rqstp, ses, dir); +} + +/* must be called under client_lock */ static void nfsd4_del_conns(struct nfsd4_session *s) { struct nfs4_client *clp = s->se_client; @@ -749,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n */ slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); + if (numslots < 1) + return NULL; new = alloc_session(slotsize, numslots); if (!new) { @@ -769,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n idx = hash_sessionid(&new->se_sessionid); spin_lock(&client_lock); list_add(&new->se_hash, &sessionid_hashtbl[idx]); + spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&clp->cl_lock); spin_unlock(&client_lock); - status = nfsd4_new_conn(rqstp, new); + 
status = nfsd4_new_conn_from_crses(rqstp, new); /* whoops: benny points out, status is ignored! (err, or bogus) */ if (status) { free_session(&new->se_ref); return NULL; } - if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) { + if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); - - clp->cl_cb_session = new; - clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt; - svc_xprt_get(rqstp->rq_xprt); + /* + * This is a little silly; with sessions there's no real + * use for the callback address. Use the peer address + * as a reasonable default for now, but consider fixing + * the rpc client not to require an address in the + * future: + */ rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); - nfsd4_probe_callback(clp); } + nfsd4_probe_callback(clp); return new; } @@ -817,7 +833,9 @@ static void unhash_session(struct nfsd4_session *ses) { list_del(&ses->se_hash); + spin_lock(&ses->se_client->cl_lock); list_del(&ses->se_perclnt); + spin_unlock(&ses->se_client->cl_lock); } /* must be called under the client_lock */ @@ -923,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp) mark_client_expired(clp); list_del(&clp->cl_lru); + spin_lock(&clp->cl_lock); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) list_del_init(&ses->se_hash); + spin_unlock(&clp->cl_lock); } static void @@ -1051,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); - atomic_set(&clp->cl_cb_set, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); spin_lock_init(&clp->cl_lock); INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); clp->cl_time = get_seconds(); @@ -1132,54 +1153,55 @@ 
find_unconfirmed_client(clientid_t *clid) return NULL; } -/* - * Return 1 iff clp's clientid establishment method matches the use_exchange_id - * parameter. Matching is based on the fact the at least one of the - * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1 - * - * FIXME: we need to unify the clientid namespaces for nfsv4.x - * and correctly deal with client upgrade/downgrade in EXCHANGE_ID - * and SET_CLIENTID{,_CONFIRM} - */ -static inline int -match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id) +static bool clp_used_exchangeid(struct nfs4_client *clp) { - bool has_exchange_flags = (clp->cl_exchange_flags != 0); - return use_exchange_id == has_exchange_flags; -} + return clp->cl_exchange_flags != 0; +} static struct nfs4_client * -find_confirmed_client_by_str(const char *dname, unsigned int hashval, - bool use_exchange_id) +find_confirmed_client_by_str(const char *dname, unsigned int hashval) { struct nfs4_client *clp; list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname) && - match_clientid_establishment(clp, use_exchange_id)) + if (same_name(clp->cl_recdir, dname)) return clp; } return NULL; } static struct nfs4_client * -find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, - bool use_exchange_id) +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) { struct nfs4_client *clp; list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname) && - match_clientid_establishment(clp, use_exchange_id)) + if (same_name(clp->cl_recdir, dname)) return clp; } return NULL; } +static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) +{ + switch (family) { + case AF_INET: + ((struct sockaddr_in *)sa)->sin_family = AF_INET; + ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; + return; + case AF_INET6: + ((struct sockaddr_in6 
*)sa)->sin6_family = AF_INET6; + ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; + return; + } +} + static void -gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { struct nfs4_cb_conn *conn = &clp->cl_cb_conn; + struct sockaddr *sa = svc_addr(rqstp); + u32 scopeid = rpc_get_scope_id(sa); unsigned short expected_family; /* Currently, we only support tcp and tcp6 for the callback channel */ @@ -1205,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; + rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; @@ -1344,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, case SP4_NONE: break; case SP4_SSV: - return nfserr_encr_alg_unsupp; + return nfserr_serverfault; default: BUG(); /* checked by xdr code */ case SP4_MACH_CRED: @@ -1361,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, nfs4_lock_state(); status = nfs_ok; - conf = find_confirmed_client_by_str(dname, strhashval, true); + conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { + if (!clp_used_exchangeid(conf)) { + status = nfserr_clid_inuse; /* XXX: ? */ + goto out; + } if (!same_verf(&verf, &conf->cl_verifier)) { /* 18.35.4 case 8 */ if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { @@ -1403,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, goto out; } - unconf = find_unconfirmed_client_by_str(dname, strhashval, true); + unconf = find_unconfirmed_client_by_str(dname, strhashval); if (unconf) { /* * Possible retry or client restart. 
Per 18.35.4 case 4, @@ -1560,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, status = nfs_ok; memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); + memcpy(&cr_ses->fore_channel, &new->se_fchannel, + sizeof(struct nfsd4_channel_attrs)); cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; @@ -1581,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) return argp->opcnt == resp->opcnt; } +static __be32 nfsd4_map_bcts_dir(u32 *dir) +{ + switch (*dir) { + case NFS4_CDFC4_FORE: + case NFS4_CDFC4_BACK: + return nfs_ok; + case NFS4_CDFC4_FORE_OR_BOTH: + case NFS4_CDFC4_BACK_OR_BOTH: + *dir = NFS4_CDFC4_BOTH; + return nfs_ok; + }; + return nfserr_inval; +} + +__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 status; + + if (!nfsd4_last_compound_op(rqstp)) + return nfserr_not_only_op; + spin_lock(&client_lock); + cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); + /* Sorta weird: we only need the refcnt'ing because new_conn acquires + * client_lock iself: */ + if (cstate->session) { + nfsd4_get_session(cstate->session); + atomic_inc(&cstate->session->se_client->cl_refcount); + } + spin_unlock(&client_lock); + if (!cstate->session) + return nfserr_badsession; + + status = nfsd4_map_bcts_dir(&bcts->dir); + nfsd4_new_conn(rqstp, cstate->session, bcts->dir); + return nfs_ok; +} + static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) { if (!session) @@ -1619,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r, spin_unlock(&client_lock); nfs4_lock_state(); - /* wait for callbacks */ - nfsd4_shutdown_callback(ses->se_client); + nfsd4_probe_callback_sync(ses->se_client); nfs4_unlock_state(); nfsd4_del_conns(ses); @@ -1733,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp, out: /* Hold a session reference until done processing the compound. 
*/ if (cstate->session) { + struct nfs4_client *clp = session->se_client; + nfsd4_get_session(cstate->session); - atomic_inc(&session->se_client->cl_refcount); + atomic_inc(&clp->cl_refcount); + if (clp->cl_cb_state == NFSD4_CB_DOWN) + seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; } kfree(conn); spin_unlock(&client_lock); @@ -1775,7 +1846,6 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct sockaddr *sa = svc_addr(rqstp); struct xdr_netobj clname = { .len = setclid->se_namelen, .data = setclid->se_name, @@ -1801,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, strhashval = clientstr_hashval(dname); nfs4_lock_state(); - conf = find_confirmed_client_by_str(dname, strhashval, false); + conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; + if (clp_used_exchangeid(conf)) + goto out; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { char addr_str[INET6_ADDRSTRLEN]; rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, @@ -1819,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * has a description of SETCLIENTID request processing consisting * of 5 bullet points, labeled as CASE0 - CASE4 below. 
*/ - unconf = find_unconfirmed_client_by_str(dname, strhashval, false); + unconf = find_unconfirmed_client_by_str(dname, strhashval); status = nfserr_resource; if (!conf) { /* @@ -1876,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * for consistent minorversion use throughout: */ new->cl_minorversion = 0; - gen_callback(new, setclid, rpc_get_scope_id(sa)); + gen_callback(new, setclid, rqstp); add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; @@ -1935,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { - atomic_set(&conf->cl_cb_set, 0); nfsd4_change_callback(conf, &unconf->cl_cb_conn); nfsd4_probe_callback(conf); expire_client(unconf); @@ -1964,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unsigned int hash = clientstr_hashval(unconf->cl_recdir); conf = find_confirmed_client_by_str(unconf->cl_recdir, - hash, false); + hash); if (conf) { nfsd4_remove_clid_dir(conf); expire_client(conf); @@ -2300,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl) nfsd4_cb_recall(dp); } -/* - * The file_lock is being reapd. - * - * Called by locks_free_lock() with lock_flocks() held. 
- */ -static -void nfsd_release_deleg_cb(struct file_lock *fl) -{ - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; - - dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count)); - - if (!(fl->fl_flags & FL_LEASE) || !dp) - return; - dp->dl_flock = NULL; -} - -/* - * Called from setlease() with lock_flocks() held - */ -static -int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) -{ - struct nfs4_delegation *onlistd = - (struct nfs4_delegation *)onlist->fl_owner; - struct nfs4_delegation *tryd = - (struct nfs4_delegation *)try->fl_owner; - - if (onlist->fl_lmops != try->fl_lmops) - return 0; - - return onlistd->dl_client == tryd->dl_client; -} - - static int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) { @@ -2346,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) static const struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, - .fl_release_private = nfsd_release_deleg_cb, - .fl_mylease = nfsd_same_client_deleg_cb, .fl_change = nfsd_change_deleg_cb, }; @@ -2514,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &fp->fi_fds[oflag]); - if (status == nfserr_dropit) - status = nfserr_jukebox; if (status) return status; } @@ -2596,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open) open->op_stateowner->so_client->cl_firststate = 1; } +/* Should we give out recallable state?: */ +static bool nfsd4_cb_channel_good(struct nfs4_client *clp) +{ + if (clp->cl_cb_state == NFSD4_CB_UP) + return true; + /* + * In the sessions case, since we don't have to establish a + * separate connection for callbacks, we assume it's OK + * until we hear otherwise: + */ + return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; +} + /* * Attempt to hand out a delegation. 
*/ @@ -2604,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta { struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; - int cb_up = atomic_read(&sop->so_client->cl_cb_set); + int cb_up; struct file_lock *fl; int status, flag = 0; + cb_up = nfsd4_cb_channel_good(sop->so_client); flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { @@ -2655,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta dp->dl_flock = fl; /* vfs_setlease checks to see if delegation should be handed out. - * the lock_manager callbacks fl_mylease and fl_change are used + * the lock_manager callback fl_change is used */ if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); @@ -2794,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) - && !atomic_read(&clp->cl_cb_set)) + && clp->cl_cb_state != NFSD4_CB_UP) goto out; status = nfs_ok; out: @@ -3081,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (status) goto out; renew_client(dp->dl_client); - if (filpp) + if (filpp) { *filpp = find_readable_file(dp->dl_file); - BUG_ON(!*filpp); + BUG_ON(!*filpp); + } } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); if (!stp) @@ -4107,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) unsigned int strhashval = clientstr_hashval(name); struct nfs4_client *clp; - clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); + clp = find_confirmed_client_by_str(name, strhashval); return clp ? 
1 : 0; } @@ -4336,7 +4383,7 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { - cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); + cancel_delayed_work_sync(&laundromat_work); destroy_workqueue(laundry_wq); locks_end_grace(&nfsd4_manager); nfs4_lock_state(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f35a94a0402..956629b9cdc 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -44,13 +44,14 @@ #include <linux/namei.h> #include <linux/statfs.h> #include <linux/utsname.h> -#include <linux/nfsd_idmap.h> -#include <linux/nfs4_acl.h> #include <linux/sunrpc/svcauth_gss.h> +#include "idmap.h" +#include "acl.h" #include "xdr4.h" #include "vfs.h" + #define NFSDDBG_FACILITY NFSDDBG_XDR /* @@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, len += XDR_QUADLEN(dummy32) << 2; READMEM(buf, dummy32); ace->whotype = nfs4_acl_get_whotype(buf, dummy32); - host_err = 0; + status = nfs_ok; if (ace->whotype != NFS4_ACL_WHO_NAMED) ace->who = 0; else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - host_err = nfsd_map_name_to_gid(argp->rqstp, + status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &ace->who); else - host_err = nfsd_map_name_to_uid(argp->rqstp, + status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &ace->who); - if (host_err) - goto out_nfserr; + if (status) + return status; } } else *acl = NULL; @@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access DECODE_TAIL; } +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + DECODE_HEAD; + u32 dummy; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); + COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(bcts->dir); + /* XXX: Perhaps Tom Tucker could help us figure out how we + * should be using ctsa_use_conn_in_rdma_mode: */ + READ32(dummy); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_close(struct nfsd4_compoundargs 
*argp, struct nfsd4_close *close) { @@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, } static __be32 +nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(sin->sin_style); + DECODE_TAIL; +} + +static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { __be32 status; @@ -1005,7 +1032,7 @@ static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, struct nfsd4_exchange_id *exid) { - int dummy; + int dummy, tmp; DECODE_HEAD; READ_BUF(NFS4_VERIFIER_SIZE); @@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, /* ssp_hash_algs<> */ READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } /* ssp_encr_algs<> */ READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); + READ32(tmp); + while (tmp--) { + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } /* ssp_window and ssp_num_gss_handles */ READ_BUF(8); @@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { /* new operations for NFSv4.1 */ [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, @@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, 
[OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, case nfserr_resource: nfserr = nfserr_toosmall; goto fail; - case nfserr_dropit: - goto fail; case nfserr_noent: goto skip_entry; default: @@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ return nfserr; } +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); + WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(bcts->dir); + /* XXX: ? */ + WRITE32(0); + ADJUST_ARGS(); + } + return nfserr; +} + static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { @@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo *secinfo) +nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, + __be32 nfserr,struct svc_export *exp) { int i = 0; - struct svc_export *exp = secinfo->si_exp; u32 nflavs; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; @@ -2892,6 +2939,20 @@ out: return nfserr; } +static __be32 +nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp); +} + +static __be32 +nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo_no_name *secinfo) +{ + return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp); +} + /* * The SETATTR encode routine is special -- it always encodes a bitmap, * regardless of the error status. 
@@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITE32(seq->seqid); WRITE32(seq->slotid); WRITE32(seq->maxslots); - /* - * FIXME: for now: - * target_maxslots = maxslots - * status_flags = 0 - */ + /* For now: target_maxslots = maxslots */ WRITE32(seq->maxslots); - WRITE32(0); + WRITE32(seq->status_flags); ADJUST_ARGS(); resp->cstate.datap = p; /* DRC cache data pointer */ @@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* NFSv4.1 operations */ [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, @@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4514ebbee4d..33b3e2b0677 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -8,12 +8,12 @@ #include <linux/namei.h> #include <linux/ctype.h> -#include <linux/nfsd_idmap.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> +#include "idmap.h" #include "nfsd.h" #include "cache.h" @@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { +#ifdef CONFIG_NFSD_DEPRECATED 
static int warned; if (file->f_dentry->d_name.name[0] == '.' && !warned) { printk(KERN_INFO @@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size current->comm, file->f_dentry->d_name.name); warned = 1; } +#endif if (! file->private_data) { /* An attempt to read a transaction file without writing * causes a 0-byte write so that the file can return diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 6b641cf2c19..7ecfa242030 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) #define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) #define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER) #define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) #define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) #define nfserr_grace cpu_to_be32(NFSERR_GRACE) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 08e17264784..e15dc45fc5e 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -735,9 +735,9 @@ nfserrno (int errno) { nfserr_stale, -ESTALE }, { nfserr_jukebox, -ETIMEDOUT }, { nfserr_jukebox, -ERESTARTSYS }, - { nfserr_dropit, -EAGAIN }, - { nfserr_dropit, -ENOMEM }, - { nfserr_badname, -ESRCH }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, { nfserr_io, -ETXTBSY }, { nfserr_notsupp, -EOPNOTSUPP }, { nfserr_toosmall, -ETOOSMALL }, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2bae1d86f5f..18743c4d8bc 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Now call the procedure handler, and encode NFS status. 
*/ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit) { + if (nfserr == nfserr_dropit || rqstp->rq_dropme) { dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 39adc27b068..3074656ba7b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -68,10 +68,12 @@ typedef struct { struct nfsd4_callback { void *cb_op; struct nfs4_client *cb_clp; + struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; const struct rpc_call_ops *cb_ops; struct work_struct cb_work; + bool cb_done; }; struct nfs4_delegation { @@ -81,6 +83,7 @@ struct nfs4_delegation { atomic_t dl_count; /* ref count */ struct nfs4_client *dl_client; struct nfs4_file *dl_file; + struct file *dl_vfs_file; struct file_lock *dl_flock; u32 dl_type; time_t dl_time; @@ -95,6 +98,7 @@ struct nfs4_delegation { struct nfs4_cb_conn { /* SETCLIENTID info */ struct sockaddr_storage cb_addr; + struct sockaddr_storage cb_saddr; size_t cb_addrlen; u32 cb_prog; /* used only in 4.0 case; per-session otherwise */ @@ -146,6 +150,11 @@ struct nfsd4_create_session { u32 gid; }; +struct nfsd4_bind_conn_to_session { + struct nfs4_sessionid sessionid; + u32 dir; +}; + /* The single slot clientid cache structure */ struct nfsd4_clid_slot { u32 sl_seqid; @@ -235,9 +244,13 @@ struct nfs4_client { unsigned long cl_cb_flags; struct rpc_clnt *cl_cb_client; u32 cl_cb_ident; - atomic_t cl_cb_set; +#define NFSD4_CB_UP 0 +#define NFSD4_CB_UNKNOWN 1 +#define NFSD4_CB_DOWN 2 + int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; + struct list_head cl_callbacks; /* list of in-progress callbacks */ /* for all client information that callback code might need: */ spinlock_t cl_lock; @@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid); extern void 
nfs4_free_stateowner(struct kref *kref); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_do_callback_rpc(struct work_struct *); extern void nfsd4_cb_recall(struct nfs4_delegation *dp); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 184938fcff0..641117f2188 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1,4 +1,3 @@ -#define MSNFS /* HACK HACK */ /* * File operations used by nfsd. Some of these have been ripped from * other parts of the kernel because they weren't exported, others @@ -35,8 +34,8 @@ #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 -#include <linux/nfs4_acl.h> -#include <linux/nfsd_idmap.h> +#include "acl.h" +#include "idmap.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" @@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, .dentry = dget(dentry)}; int err = 0; - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + err = follow_down(&path, false); + if (err < 0) + goto out; exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { @@ -273,6 +273,13 @@ out: return err; } +static int nfsd_break_lease(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + return break_lease(inode, O_WRONLY | O_NONBLOCK); +} + /* * Commit metadata changes to stable storage. */ @@ -375,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, goto out; } - /* - * If we are changing the size of the file, then - * we need to break all leases. 
- */ - host_err = break_lease(inode, O_WRONLY | O_NONBLOCK); - if (host_err == -EWOULDBLOCK) - host_err = -ETIMEDOUT; - if (host_err) /* ENOMEM or EWOULDBLOCK */ - goto out_nfserr; - host_err = get_write_access(inode); if (host_err) goto out_nfserr; @@ -425,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, err = nfserr_notsync; if (!check_guard || guardtime == inode->i_ctime.tv_sec) { + host_err = nfsd_break_lease(inode); + if (host_err) + goto out_nfserr; fh_lock(fhp); + host_err = notify_change(dentry, iap); err = nfserrno(host_err); fh_unlock(fhp); @@ -752,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, */ if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); - if (host_err == -EWOULDBLOCK) - host_err = -ETIMEDOUT; if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; @@ -845,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct page **pp = rqstp->rq_respages + rqstp->rq_resused; struct page *page = buf->page; size_t size; - int ret; - - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; size = sd->len; @@ -879,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static inline int svc_msnfs(struct svc_fh *ffhp) -{ -#ifdef MSNFS - return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS); -#else - return 0; -#endif -} - static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) @@ -900,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; inode = file->f_path.dentry->d_inode; - if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count)) - goto out; - if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { .len = 
0, @@ -927,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, fsnotify_access(file); } else err = nfserrno(host_err); -out: return err; } @@ -992,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; -#ifdef MSNFS - err = nfserr_perm; - - if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) - goto out; -#endif - dentry = file->f_path.dentry; inode = dentry->d_inode; exp = fhp->fh_export; @@ -1050,7 +1023,6 @@ out_nfserr: err = 0; else err = nfserrno(host_err); -out: return err; } @@ -1670,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserrno(host_err); goto out_dput; } + err = nfserr_noent; + if (!dold->d_inode) + goto out_drop_write; + host_err = nfsd_break_lease(dold->d_inode); + if (host_err) + goto out_drop_write; host_err = vfs_link(dold, dirp, dnew); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); @@ -1681,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, else err = nfserrno(host_err); } +out_drop_write: mnt_drop_write(tfhp->fh_export->ex_path.mnt); out_dput: dput(dnew); @@ -1755,13 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ndentry == trap) goto out_dput_new; - if (svc_msnfs(ffhp) && - ((atomic_read(&odentry->d_count) > 1) - || (atomic_read(&ndentry->d_count) > 1))) { - host_err = -EPERM; - goto out_dput_new; - } - host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; @@ -1769,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (host_err) goto out_dput_new; + host_err = nfsd_break_lease(odentry->d_inode); + if (host_err) + goto out_drop_write; host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) host_err = 
commit_metadata(ffhp); } - +out_drop_write: mnt_drop_write(ffhp->fh_export->ex_path.mnt); - out_dput_new: dput(ndentry); out_dput_old: @@ -1840,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (host_err) goto out_nfserr; - if (type != S_IFDIR) { /* It's UNLINK */ -#ifdef MSNFS - if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (atomic_read(&rdentry->d_count) > 1)) { - host_err = -EPERM; - } else -#endif + host_err = nfsd_break_lease(rdentry->d_inode); + if (host_err) + goto out_put; + if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); - } else { /* It's RMDIR */ + else host_err = vfs_rmdir(dirp, rdentry); - } - +out_put: dput(rdentry); if (!host_err) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 60fce3dc5cb..366401e1a53 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -311,6 +311,11 @@ struct nfsd4_secinfo { struct svc_export *si_exp; /* response */ }; +struct nfsd4_secinfo_no_name { + u32 sin_style; /* request */ + struct svc_export *sin_exp; /* response */ +}; + struct nfsd4_setattr { stateid_t sa_stateid; /* request */ u32 sa_bmval[3]; /* request */ @@ -373,8 +378,8 @@ struct nfsd4_sequence { u32 cachethis; /* request */ #if 0 u32 target_maxslots; /* response */ - u32 status_flags; /* response */ #endif /* not yet */ + u32 status_flags; /* response */ }; struct nfsd4_destroy_session { @@ -422,6 +427,7 @@ struct nfsd4_op { /* NFSv4.1 */ struct nfsd4_exchange_id exchange_id; + struct nfsd4_bind_conn_to_session bind_conn_to_session; struct nfsd4_create_session create_session; struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; @@ -518,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); 
extern __be32 nfsd4_create_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_create_session *); diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 8b782b062ba..3ee67c67cc5 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -35,7 +35,20 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) { - return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); + return NILFS_I_NILFS(bmap->b_inode)->ns_dat; +} + +static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, + const char *fname, int err) +{ + struct inode *inode = bmap->b_inode; + + if (err == -EINVAL) { + nilfs_error(inode->i_sb, fname, + "broken bmap (inode number=%lu)\n", inode->i_ino); + err = -EIO; + } + return err; } /** @@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); - if (ret < 0) + if (ret < 0) { + ret = nilfs_bmap_convert_error(bmap, __func__, ret); goto out; + } if (NILFS_BMAP_USE_VBN(bmap)) { ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, &blocknr); @@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); up_read(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) @@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap, down_write(&bmap->b_sem); ret = nilfs_bmap_do_insert(bmap, key, rec); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) @@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) down_read(&bmap->b_sem); ret = bmap->b_ops->bop_last_key(bmap, &lastkey); - if (!ret) - *key = lastkey; up_read(&bmap->b_sem); + + if 
(ret < 0) + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + else + *key = lastkey; return ret; } @@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) down_write(&bmap->b_sem); ret = nilfs_bmap_do_delete(bmap, key); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) @@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) down_write(&bmap->b_sem); ret = nilfs_bmap_do_truncate(bmap, key); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } /** @@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) down_write(&bmap->b_sem); ret = bmap->b_ops->bop_propagate(bmap, bh); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } /** @@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap, down_write(&bmap->b_sem); ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } /** @@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) down_write(&bmap->b_sem); ret = bmap->b_ops->bop_mark(bmap, key, level); up_write(&bmap->b_sem); - return ret; + + return nilfs_bmap_convert_error(bmap, __func__, ret); } /** diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 5115814cb74..388e9e8f528 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -104,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { - struct inode *dat = - nilfs_dat_inode(NILFS_I_NILFS(inode)); + struct inode *dat = NILFS_I_NILFS(inode)->ns_dat; /* blocknr is a virtual block number */ err = nilfs_dat_translate(dat, blocknr, &pblocknr); diff --git 
a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index cb003c8ee1f..9d45773b79e 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page, unsigned from, unsigned to) { struct inode *dir = mapping->host; - struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb); loff_t pos = page_offset(page) + from; unsigned len = to - from; unsigned nr_dirty, copied; @@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page, i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) nilfs_set_transaction_flag(NILFS_TI_SYNC); - err = nilfs_set_file_dirty(sbi, dir, nr_dirty); + err = nilfs_set_file_dirty(dir, nr_dirty); WARN_ON(err); /* do not happen */ unlock_page(page); } diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index c9a30d7ff6f..2f560c9fb80 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = { .truncate = nilfs_truncate, .setattr = nilfs_setattr, .permission = nilfs_permission, + .fiemap = nilfs_fiemap, }; /* end of file */ diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 9f8a2da67f9..bfc73d3a30e 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -149,14 +149,9 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, } err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); - if (unlikely(err)) { - if (err == -EINVAL) - nilfs_error(sb, __func__, "ifile is broken"); - else - nilfs_warning(sb, __func__, - "unable to read inode: %lu", - (unsigned long) ino); - } + if (unlikely(err)) + nilfs_warning(sb, __func__, "unable to read inode: %lu", + (unsigned long) ino); return err; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 71d4bc8464e..2fd440d8d6b 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -58,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct nilfs_inode_info *ii = NILFS_I(inode); __u64 blknum = 0; int err = 0, ret; - struct inode *dat = 
nilfs_dat_inode(NILFS_I_NILFS(inode)); + struct inode *dat = NILFS_I_NILFS(inode)->ns_dat; unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; down_read(&NILFS_MDT(dat)->mi_sem); @@ -96,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, inode->i_ino, (unsigned long long)blkoff); err = 0; - } else if (err == -EINVAL) { - nilfs_error(inode->i_sb, __func__, - "broken bmap (inode=%lu)\n", - inode->i_ino); - err = -EIO; } nilfs_transaction_abort(inode->i_sb); goto out; @@ -109,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, nilfs_transaction_commit(inode->i_sb); /* never fails */ /* Error handling should be detailed */ set_buffer_new(bh_result); + set_buffer_delay(bh_result); map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed to proper value */ } else if (ret == -ENOENT) { @@ -185,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page) if (ret) { struct inode *inode = page->mapping->host; - struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); - nilfs_set_file_dirty(sbi, inode, nr_dirty); + nilfs_set_file_dirty(inode, nr_dirty); } return ret; } @@ -229,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, start + copied); copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); + nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); return err ? 
: copied; } @@ -425,13 +420,12 @@ static int __nilfs_read_inode(struct super_block *sb, struct nilfs_root *root, unsigned long ino, struct inode *inode) { - struct nilfs_sb_info *sbi = NILFS_SB(sb); - struct inode *dat = nilfs_dat_inode(sbi->s_nilfs); + struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; struct buffer_head *bh; struct nilfs_inode *raw_inode; int err; - down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); if (unlikely(err)) goto bad_inode; @@ -461,7 +455,7 @@ static int __nilfs_read_inode(struct super_block *sb, } nilfs_ifile_unmap_inode(root->ifile, ino, bh); brelse(bh); - up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); return 0; @@ -470,7 +464,7 @@ static int __nilfs_read_inode(struct super_block *sb, brelse(bh); bad_inode: - up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); return err; } @@ -629,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, if (!test_bit(NILFS_I_BMAP, &ii->i_state)) return; - repeat: +repeat: ret = nilfs_bmap_last_key(ii->i_bmap, &b); if (ret == -ENOENT) return; @@ -646,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, nilfs_bmap_truncate(ii->i_bmap, b) == 0)) goto repeat; - failed: - if (ret == -EINVAL) - nilfs_error(ii->vfs_inode.i_sb, __func__, - "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); - else - nilfs_warning(ii->vfs_inode.i_sb, __func__, - "failed to truncate bmap (ino=%lu, err=%d)", - ii->vfs_inode.i_ino, ret); +failed: + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); } void nilfs_truncate(struct inode *inode) @@ -682,7 +672,7 @@ void nilfs_truncate(struct inode *inode) nilfs_set_transaction_flag(NILFS_TI_SYNC); nilfs_mark_inode_dirty(inode); - nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); 
+ nilfs_set_file_dirty(inode, 0); nilfs_transaction_commit(sb); /* May construct a logical segment and may fail in sync mode. But truncate has no return value. */ @@ -785,20 +775,24 @@ out_err: return err; } -int nilfs_permission(struct inode *inode, int mask) +int nilfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct nilfs_root *root = NILFS_I(inode)->i_root; + struct nilfs_root *root; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } -int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, - struct buffer_head **pbh) +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) { + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); struct nilfs_inode_info *ii = NILFS_I(inode); int err; @@ -839,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode) return ret; } -int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, - unsigned nr_dirty) +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) { + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); struct nilfs_inode_info *ii = NILFS_I(inode); atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); @@ -874,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, int nilfs_mark_inode_dirty(struct inode *inode) { - struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); struct buffer_head *ibh; int err; - err = nilfs_load_inode_block(sbi, inode, &ibh); + err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { nilfs_warning(inode->i_sb, __func__, "failed to reget inode block.\n"); @@ -920,3 +913,134 @@ void nilfs_dirty_inode(struct inode *inode) nilfs_mark_inode_dirty(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ } + +int 
nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + struct the_nilfs *nilfs = NILFS_I_NILFS(inode); + __u64 logical = 0, phys = 0, size = 0; + __u32 flags = 0; + loff_t isize; + sector_t blkoff, end_blkoff; + sector_t delalloc_blkoff; + unsigned long delalloc_blklen; + unsigned int blkbits = inode->i_blkbits; + int ret, n; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + + blkoff = start >> blkbits; + end_blkoff = (start + len - 1) >> blkbits; + + delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, + &delalloc_blkoff); + + do { + __u64 blkphy; + unsigned int maxblocks; + + if (delalloc_blklen && blkoff == delalloc_blkoff) { + if (size) { + /* End of the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + } + if (blkoff > end_blkoff) + break; + + flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; + logical = blkoff << blkbits; + phys = 0; + size = delalloc_blklen << blkbits; + + blkoff = delalloc_blkoff + delalloc_blklen; + delalloc_blklen = nilfs_find_uncommitted_extent( + inode, blkoff, &delalloc_blkoff); + continue; + } + + /* + * Limit the number of blocks that we look up so as + * not to get into the next delayed allocation extent. 
+ */ + maxblocks = INT_MAX; + if (delalloc_blklen) + maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, + maxblocks); + blkphy = 0; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + n = nilfs_bmap_lookup_contig( + NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + if (n < 0) { + int past_eof; + + if (unlikely(n != -ENOENT)) + break; /* error */ + + /* HOLE */ + blkoff++; + past_eof = ((blkoff << blkbits) >= isize); + + if (size) { + /* End of the current extent */ + + if (past_eof) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, flags); + if (ret) + break; + size = 0; + } + if (blkoff > end_blkoff || past_eof) + break; + } else { + if (size) { + if (phys && blkphy << blkbits == phys + size) { + /* The current extent goes on */ + size += n << blkbits; + } else { + /* Terminate the current extent */ + ret = fiemap_fill_next_extent( + fieinfo, logical, phys, size, + flags); + if (ret || blkoff > end_blkoff) + break; + + /* Start another extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + } else { + /* Start a new extent */ + flags = FIEMAP_EXTENT_MERGED; + logical = blkoff << blkbits; + phys = blkphy << blkbits; + size = n << blkbits; + } + blkoff += n; + } + cond_resched(); + } while (true); + + /* If ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index b185e937a33..496738963fd 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -233,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, int ret; down_read(&nilfs->ns_segctor_sem); - ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); + ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs); up_read(&nilfs->ns_segctor_sem); return ret; } @@ 
-242,8 +242,7 @@ static ssize_t nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) { - struct inode *dat = nilfs_dat_inode(nilfs); - struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; struct nilfs_bdesc *bdescs = buf; int ret, i; @@ -421,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, size_t nmembs = argv->v_nmembs; int ret; - ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); + ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs); return (ret < 0) ? ret : nmembs; } @@ -430,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { size_t nmembs = argv->v_nmembs; - struct inode *dat = nilfs_dat_inode(nilfs); - struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; struct nilfs_bdesc *bdescs = buf; int ret, i; @@ -450,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, /* skip dead block */ continue; if (bdescs[i].bd_level == 0) { - ret = nilfs_mdt_mark_block_dirty(dat, + ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat, bdescs[i].bd_offset); if (ret < 0) { WARN_ON(ret == -ENOENT); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 39a5b84e2c9..6a0e2a189f6 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -237,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, * * %-ENOENT - the specified block does not exist (hole block) * - * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) - * * %-EROFS - Read only filesystem (for create mode) */ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, @@ -273,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error - * - * %-EINVAL - bmap is broken. 
(the caller should call nilfs_error()) */ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) { @@ -350,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) * %-EIO - I/O error * * %-ENOENT - the specified block does not exist (hole block) - * - * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) */ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) { @@ -499,31 +493,29 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) struct buffer_head *bh_frozen; struct page *page; int blkbits = inode->i_blkbits; - int ret = -ENOMEM; page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); if (!page) - return ret; + return -ENOMEM; if (!page_has_buffers(page)) create_empty_buffers(page, 1 << blkbits, 0); bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); - if (bh_frozen) { - if (!buffer_uptodate(bh_frozen)) - nilfs_copy_buffer(bh_frozen, bh); - if (list_empty(&bh_frozen->b_assoc_buffers)) { - list_add_tail(&bh_frozen->b_assoc_buffers, - &shadow->frozen_buffers); - set_buffer_nilfs_redirected(bh); - } else { - brelse(bh_frozen); /* already frozen */ - } - ret = 0; + + if (!buffer_uptodate(bh_frozen)) + nilfs_copy_buffer(bh_frozen, bh); + if (list_empty(&bh_frozen->b_assoc_buffers)) { + list_add_tail(&bh_frozen->b_assoc_buffers, + &shadow->frozen_buffers); + set_buffer_nilfs_redirected(bh); + } else { + brelse(bh_frozen); /* already frozen */ } + unlock_page(page); page_cache_release(page); - return ret; + return 0; } struct buffer_head * diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 6e9557ecf16..98034271cd0 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -577,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = { .rename = nilfs_rename, .setattr = nilfs_setattr, .permission = nilfs_permission, + .fiemap = nilfs_fiemap, }; const struct inode_operations nilfs_special_inode_operations = { diff --git 
a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f7560da5a56..777e8fd0430 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -190,11 +190,6 @@ static inline int nilfs_doing_construction(void) return nilfs_test_transaction_flag(NILFS_TI_WRITER); } -static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs) -{ - return nilfs->ns_dat; -} - /* * function prototype */ @@ -256,14 +251,14 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct dentry *, struct iattr *); -int nilfs_permission(struct inode *inode, int mask); -extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, - struct buffer_head **); +int nilfs_permission(struct inode *inode, int mask, unsigned int flags); +int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); -extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, - unsigned); +int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); extern int nilfs_mark_inode_dirty(struct inode *); extern void nilfs_dirty_inode(struct inode *); +int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index a6c3c2e817f..0c432416cfe 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -491,7 +491,7 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, } return nc; } - + void nilfs_mapping_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); @@ -546,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page) } return TestClearPageDirty(page); } + +/** + * nilfs_find_uncommitted_extent - find extent of uncommitted data + * @inode: inode + * @start_blk: start block offset (in) + * @blkoff: start 
offset of the found extent (out) + * + * This function searches an extent of buffers marked "delayed" which + * starts from a block offset equal to or larger than @start_blk. If + * such an extent was found, this will store the start offset in + * @blkoff and return its length in blocks. Otherwise, zero is + * returned. + */ +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff) +{ + unsigned int i; + pgoff_t index; + unsigned int nblocks_in_page; + unsigned long length = 0; + sector_t b; + struct pagevec pvec; + struct page *page; + + if (inode->i_mapping->nrpages == 0) + return 0; + + index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + +repeat: + pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, + pvec.pages); + if (pvec.nr == 0) + return length; + + if (length > 0 && pvec.pages[0]->index > index) + goto out; + + b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + i = 0; + do { + page = pvec.pages[i]; + + lock_page(page); + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (b < start_blk) + continue; + if (buffer_delay(bh)) { + if (length == 0) + *blkoff = b; + length++; + } else if (length > 0) { + goto out_locked; + } + } while (++b, bh = bh->b_this_page, bh != head); + } else { + if (length > 0) + goto out_locked; + + b += nblocks_in_page; + } + unlock_page(page); + + } while (++i < pagevec_count(&pvec)); + + index = page->index + 1; + pagevec_release(&pvec); + cond_resched(); + goto repeat; + +out_locked: + unlock_page(page); +out: + pagevec_release(&pvec); + return length; +} diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index fb9e8a8a203..622df27cd89 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -66,6 +66,9 @@ void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, 
const struct address_space_operations *aops); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); +unsigned long nilfs_find_uncommitted_extent(struct inode *inode, + sector_t start_blk, + sector_t *blkoff); #define NILFS_PAGE_BUG(page, m, a...) \ do { nilfs_page_bug(page); BUG(); } while (0) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 5d2711c28da..3dfcd3b7d38 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -535,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, if (unlikely(err)) goto failed_page; - err = nilfs_set_file_dirty(sbi, inode, 1); + err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) goto failed_page; diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h index 35a07157b98..7a17715f215 100644 --- a/fs/nilfs2/sb.h +++ b/fs/nilfs2/sb.h @@ -27,14 +27,6 @@ #include <linux/types.h> #include <linux/fs.h> -/* - * Mount options - */ -struct nilfs_mount_options { - unsigned long mount_opt; - __u64 snapshot_cno; -}; - struct the_nilfs; struct nilfs_sc_info; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 687d090cea3..55ebae5c7f3 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -504,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, return err; } -static int nilfs_handle_bmap_error(int err, const char *fname, - struct inode *inode, struct super_block *sb) -{ - if (err == -EINVAL) { - nilfs_error(sb, fname, "broken bmap (inode=%lu)\n", - inode->i_ino); - err = -EIO; - } - return err; -} - /* * Callback functions that enumerate, mark, and collect dirty blocks */ @@ -524,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci, int err; err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); - if (unlikely(err < 0)) - return nilfs_handle_bmap_error(err, __func__, inode, - sci->sc_super); + if (err < 0) + return err; err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(struct nilfs_binfo_v)); @@ -539,13 +527,7 @@ 
static int nilfs_collect_file_node(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { - int err; - - err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); - if (unlikely(err < 0)) - return nilfs_handle_bmap_error(err, __func__, inode, - sci->sc_super); - return 0; + return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); } static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, @@ -588,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, int err; err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); - if (unlikely(err < 0)) - return nilfs_handle_bmap_error(err, __func__, inode, - sci->sc_super); + if (err < 0) + return err; err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); if (!err) @@ -776,9 +757,8 @@ static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, ret++; if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) ret++; - if (ret || nilfs_doing_gc()) - if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) - ret++; + if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat)) + ret++; return ret; } @@ -814,7 +794,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) nilfs_mdt_clear_dirty(sci->sc_root->ifile); nilfs_mdt_clear_dirty(nilfs->ns_cpfile); nilfs_mdt_clear_dirty(nilfs->ns_sufile); - nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); + nilfs_mdt_clear_dirty(nilfs->ns_dat); } static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) @@ -923,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; - nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + + nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr + NILFS_SR_DAT_OFFSET(isz), 1); nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + NILFS_SR_CPFILE_OFFSET(isz), 1); @@ -1179,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) 
sci->sc_stage.scnt++; /* Fall through */ case NILFS_ST_DAT: dat_stage: - err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), + err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, &nilfs_sc_dat_ops); if (unlikely(err)) break; @@ -1563,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, return 0; failed_bmap: - err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super); return err; } @@ -1783,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err) if (!err) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); + clear_buffer_delay(bh); clear_buffer_nilfs_volatile(bh); } brelse(bh); /* for b_assoc_buffers */ @@ -1909,6 +1889,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) b_assoc_buffers) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); + clear_buffer_delay(bh); clear_buffer_nilfs_volatile(bh); clear_buffer_nilfs_redirected(bh); if (bh == segbuf->sb_super_root) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f804d41ec9d..58fd707174e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -47,7 +47,6 @@ #include <linux/crc32.h> #include <linux/vfs.h> #include <linux/writeback.h> -#include <linux/kobject.h> #include <linux/seq_file.h> #include <linux/mount.h> #include "nilfs.h" @@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); if (!(sb->s_flags & MS_RDONLY)) { @@ -136,13 +140,17 @@ void nilfs_error(struct super_block *sb, const char *function, void nilfs_warning(struct super_block *sb, const char *function, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "NILFS warning (device %s): %s: ", - sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } @@ -162,10 +170,13 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) return &ii->vfs_inode; } -void nilfs_destroy_inode(struct inode *inode) +static void nilfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + INIT_LIST_HEAD(&inode->i_dentry); + if (mdi) { kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); @@ -173,6 +184,11 @@ void nilfs_destroy_inode(struct inode *inode) kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } +void nilfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nilfs_i_callback); +} + static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) { struct the_nilfs *nilfs = sbi->s_nilfs; @@ -688,7 +704,8 @@ skip_mount_setup: sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); /* synchronize sbp[1] with sbp[0] */ - memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); } @@ -838,7 +855,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, static int nilfs_tree_was_touched(struct dentry *root_dentry) { - return atomic_read(&root_dentry->d_count) > 1; + return root_dentry->d_count > 1; } /** @@ -1002,11 +1019,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; unsigned long old_sb_flags; - struct nilfs_mount_options old_opts; + unsigned long old_mount_opt; int err; old_sb_flags = sb->s_flags; - old_opts.mount_opt = sbi->s_mount_opt; + 
old_mount_opt = sbi->s_mount_opt; if (!parse_options(data, sb, 1)) { err = -EINVAL; @@ -1075,7 +1092,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) restore_opts: sb->s_flags = old_sb_flags; - sbi->s_mount_opt = old_opts.mount_opt; + sbi->s_mount_opt = old_mount_opt; return err; } @@ -1147,14 +1164,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; struct dentry *root_dentry; int err, s_new = false; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) return ERR_CAST(sd.bdev); @@ -1233,7 +1250,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode); return root_dentry; @@ -1242,7 +1259,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode); return ERR_PTR(err); } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 0254be2d73c..ad4ac607cf5 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -329,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) printk(KERN_INFO "NILFS: recovery complete.\n"); skip_recovery: - set_nilfs_loaded(nilfs); nilfs_clear_recovery_info(&ri); sbi->s_super->s_flags = s_flags; return 0; @@ -651,12 +650,11 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { - struct inode *dat = nilfs_dat_inode(nilfs); unsigned long ncleansegs; - down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); - up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + 
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; return 0; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 69226e14b74..fd85e4c05c6 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -36,8 +36,6 @@ /* the_nilfs struct */ enum { THE_NILFS_INIT = 0, /* Information from super_block is set */ - THE_NILFS_LOADED, /* Roll-back/roll-forward has done and - the latest checkpoint was loaded */ THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ THE_NILFS_GC_RUNNING, /* gc process is running */ THE_NILFS_SB_DIRTY, /* super block is dirty */ @@ -178,7 +176,6 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \ } THE_NILFS_FNS(INIT, init) -THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) THE_NILFS_FNS(GC_RUNNING, gc_running) THE_NILFS_FNS(SB_DIRTY, sb_dirty) diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 3ac36b7bf6b..7dceff005a6 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -6,7 +6,7 @@ config FANOTIFY ---help--- Say Y here to enable fanotify suport. fanotify is a file access notification system which differs from inotify in that it sends - and open file descriptor to the userspace listener along with + an open file descriptor to the userspace listener along with the event. If unsure, say Y. diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 20dc218707c..79b47cbb5cd 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -59,7 +59,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) /* determine if the children should tell inode about their events */ watched = fsnotify_inode_watches_children(inode); - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. 
Since this is a * directory, there damn well better only be one item on this list */ list_for_each_entry(alias, &inode->i_dentry, d_alias) { @@ -68,19 +68,21 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ + spin_lock(&alias->d_lock); list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { if (!child->d_inode) continue; - spin_lock(&child->d_lock); + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (watched) child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } + spin_unlock(&alias->d_lock); } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } /* Notify this dentry's parent about a child's events. */ diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index 58b6be99254..4ff028fcfd6 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ unistr.o upcase.o -EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\" +EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\" ifeq ($(CONFIG_NTFS_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 113ebd9f25a..f4b1057abdd 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,7 +1,7 @@ /* * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp, * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s * single-segment behaviour. 
* - * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both - * when atomic and when not atomic. This is ok because - * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() - * and it is ok to call this when non-atomic. - * Infact, the only difference between __copy_from_user_inatomic() and + * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when + * atomic and when not atomic. This is ok because it calls + * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In + * fact, the only difference between __copy_from_user_inatomic() and * __copy_from_user() is that the latter calls might_sleep() and the former - * should not zero the tail of the buffer on error. And on many - * architectures __copy_from_user_inatomic() is just defined to - * __copy_from_user() so it makes no difference at all on those architectures. + * should not zero the tail of the buffer on error. And on many architectures + * __copy_from_user_inatomic() is just defined to __copy_from_user() so it + * makes no difference at all on those architectures. */ static inline size_t ntfs_copy_from_user_iovec(struct page **pages, unsigned nr_pages, unsigned ofs, const struct iovec **iov, @@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages, if (unlikely(copied != len)) { /* Do it the slow way. */ addr = kmap(*pages); - copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, - *iov, *iov_ofs, len); - /* - * Zero the rest of the target like __copy_from_user(). 
- */ - memset(addr + ofs + copied, 0, len - copied); - kunmap(*pages); + copied = __ntfs_copy_from_user_iovec_inatomic(addr + + ofs, *iov, *iov_ofs, len); if (unlikely(copied != len)) goto err_out; + kunmap(*pages); } total += len; + ntfs_set_next_iovec(iov, iov_ofs, len); bytes -= len; if (!bytes) break; - ntfs_set_next_iovec(iov, iov_ofs, len); ofs = 0; } while (++pages < last_page); out: return total; err_out: - total += copied; + BUG_ON(copied > len); /* Zero the rest of the target like __copy_from_user(). */ + memset(addr + ofs + copied, 0, len - copied); + kunmap(*pages); + total += copied; + ntfs_set_next_iovec(iov, iov_ofs, copied); while (++pages < last_page) { bytes -= len; if (!bytes) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 93622b175fc..a627ed82c0a 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb) return NULL; } +static void ntfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); +} + void ntfs_destroy_big_inode(struct inode *inode) { ntfs_inode *ni = NTFS_I(inode); @@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode) BUG_ON(ni->page); if (!atomic_dec_and_test(&ni->count)) BUG(); - kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); + call_rcu(&inode->i_rcu, ntfs_i_callback); } static inline ntfs_inode *ntfs_alloc_extent_inode(void) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index a30ecacc01f..29099a07b9f 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. 
* Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void) ntfs_sysctl(0); } -MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); +MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 0d840669698..77a8de5f711 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -1,7 +1,6 @@ config OCFS2_FS tristate "OCFS2 file system support" - depends on NET && SYSFS - select CONFIGFS_FS + depends on NET && SYSFS && CONFIGFS_FS select JBD2 select CRC32 select QUOTA @@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER config OCFS2_FS_STATS bool "OCFS2 statistics" - depends on OCFS2_FS + depends on OCFS2_FS && DEBUG_FS default y help This option allows some fs statistics to be captured. 
Enabling diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 391915093fe..704f6b1742f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_check_acl(struct inode *inode, int mask) +int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; int ret = -EAGAIN; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + + osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return ret; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 5c5d31f0585..4fe7c9cf4bf 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,7 +26,7 @@ struct ocfs2_acl_entry { __le32 e_id; }; -extern int ocfs2_check_acl(struct inode *, int); +extern int ocfs2_check_acl(struct inode *, int, unsigned int); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 592fae5007d..e4984e259cb 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) return ret; } -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); static void ocfs2_adjust_rightmost_records(handle_t *handle, @@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, tl_bh); + osb->truncated_clusters += num_clusters; bail: mlog_exit(status); return status; @@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, i--; } + osb->truncated_clusters = 0; + bail: mlog_exit(status); return status; @@ -7139,64 +7141,6 @@ bail: } /* - * Expects the inode 
to already be locked. - */ -int ocfs2_prepare_truncate(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context **tc) -{ - int status; - unsigned int new_i_clusters; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; - struct buffer_head *last_eb_bh = NULL; - - mlog_entry_void(); - - *tc = NULL; - - new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, - i_size_read(inode)); - fe = (struct ocfs2_dinode *) fe_bh->b_data; - - mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" - "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, - (unsigned long long)le64_to_cpu(fe->i_size)); - - *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); - if (!(*tc)) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); - - if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_extent_block(INODE_CACHE(inode), - le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - } - - (*tc)->tc_last_eb_bh = last_eb_bh; - - status = 0; -bail: - if (status < 0) { - if (*tc) - ocfs2_free_truncate_context(*tc); - *tc = NULL; - } - mlog_exit_void(); - return status; -} - -/* * 'start' is inclusive, 'end' is not. */ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, @@ -7270,18 +7214,3 @@ out_commit: out: return ret; } - -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) -{ - /* - * The caller is responsible for completing deallocation - * before freeing the context. 
- */ - if (tc->tc_dealloc.c_first_suballocator != NULL) - mlog(ML_NOTICE, - "Truncate completion has non-empty dealloc context\n"); - - brelse(tc->tc_last_eb_bh); - - kfree(tc); -} diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 55762b554b9..3bd08a03251 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -228,10 +228,6 @@ struct ocfs2_truncate_context { int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end); -int ocfs2_prepare_truncate(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context **tc); int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0d7c5540ad6..1fbb0e20131 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1630,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + mutex_lock(&osb->osb_tl_inode->i_mutex); + truncated_clusters = osb->truncated_clusters; + mutex_unlock(&osb->osb_tl_inode->i_mutex); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. 
+ */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + int ocfs2_write_begin_nolock(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -1637,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp, struct buffer_head *di_bh, struct page *mmap_page) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; - unsigned int clusters_to_alloc, extents_to_split; + unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -1646,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp, struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; struct ocfs2_extent_tree et; + int try_free = 1, ret1; +try_again: ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { mlog_errno(ret); @@ -1681,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp, mlog_errno(ret); goto out; } else if (ret == 1) { + clusters_need = wc->w_clen; ret = ocfs2_refcount_cow(inode, filp, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { @@ -1695,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp, mlog_errno(ret); goto out; } + clusters_need += clusters_to_alloc; di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; @@ -1817,6 +1858,22 @@ out: ocfs2_free_alloc_context(data_ac); if (meta_ac) ocfs2_free_alloc_context(meta_ac); + + if (ret == -ENOSPC && try_free) { + /* + * Try to free some truncate log so that we can have enough + * clusters to allocate. 
+ */ + try_free = 0; + + ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need); + if (ret1 == 1) + goto try_again; + + if (ret1 < 0) + mlog_errno(ret1); + } + return ret; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 9f26ac9be2a..b108e863d8f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_REGION_LIVENODES 4 #define O2HB_DB_TYPE_REGION_NUMBER 5 #define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 +#define O2HB_DB_TYPE_REGION_PINNED 7 struct o2hb_debug_buf { int db_type; int db_size; @@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_FAILEDREGIONS "failed_regions" #define O2HB_DEBUG_REGION_NUMBER "num" #define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" +#define O2HB_DEBUG_REGION_PINNED "pinned" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; +/* + * o2hb_dependent_users tracks the number of registered callbacks that depend + * on heartbeat. o2net and o2dlm are two entities that register this callback. + * However only o2dlm depends on the heartbeat. It does not want the heartbeat + * to stop while a dlm domain is still active. + */ +unsigned int o2hb_dependent_users; + +/* + * In global heartbeat mode, all regions are pinned if there are one or more + * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All + * regions are unpinned if the region count exceeds the cut off or the number + * of dependent users falls to zero. + */ +#define O2HB_PIN_CUT_OFF 3 + +/* + * In local heartbeat mode, we assume the dlm domain name to be the same as + * region uuid. 
This is true for domains created for the file system but not + * necessarily true for userdlm domains. This is a known limitation. + * + * In global heartbeat mode, we pin/unpin all o2hb regions. This solution + * works for both file system and userdlm domains. + */ +static int o2hb_region_pin(const char *region_uuid); +static void o2hb_region_unpin(const char *region_uuid); + /* Only sets a new threshold if there are no active regions. * * No locking or otherwise interesting code is required for reading @@ -186,7 +215,9 @@ struct o2hb_region { struct config_item hr_item; struct list_head hr_all_item; - unsigned hr_unclean_stop:1; + unsigned hr_unclean_stop:1, + hr_item_pinned:1, + hr_item_dropped:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -212,9 +243,11 @@ struct o2hb_region { struct dentry *hr_debug_livenodes; struct dentry *hr_debug_regnum; struct dentry *hr_debug_elapsed_time; + struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; + struct o2hb_debug_buf *hr_db_pinned; /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. 
This will be fixed when we have @@ -307,8 +340,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) static void o2hb_disarm_write_timeout(struct o2hb_region *reg) { - cancel_delayed_work(&reg->hr_write_timeout_work); - flush_scheduled_work(); + cancel_delayed_work_sync(&reg->hr_write_timeout_work); } static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) @@ -702,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, config_item_name(&reg->hr_item)); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + + /* + * If global heartbeat active, unpin all regions if the + * region count > CUT_OFF + */ + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) + o2hb_region_unpin(NULL); } static int o2hb_check_slot(struct o2hb_region *reg, @@ -1042,6 +1082,9 @@ static int o2hb_thread(void *data) set_user_nice(current, -20); + /* Pin node */ + o2nm_depend_this_node(); + while (!kthread_should_stop() && !reg->hr_unclean_stop) { /* We track the time spent inside * o2hb_do_disk_heartbeat so that we avoid more than @@ -1091,6 +1134,9 @@ static int o2hb_thread(void *data) mlog_errno(ret); } + /* Unpin node */ + o2nm_undepend_this_node(); + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); return 0; @@ -1143,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) reg->hr_last_timeout_start)); goto done; + case O2HB_DB_TYPE_REGION_PINNED: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + !!reg->hr_item_pinned); + goto done; + default: goto done; } @@ -1316,6 +1368,8 @@ int o2hb_init(void) memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + o2hb_dependent_users = 0; + return o2hb_debug_init(); } @@ -1385,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item) debugfs_remove(reg->hr_debug_livenodes); 
debugfs_remove(reg->hr_debug_regnum); debugfs_remove(reg->hr_debug_elapsed_time); + debugfs_remove(reg->hr_debug_pinned); debugfs_remove(reg->hr_debug_dir); spin_lock(&o2hb_live_lock); @@ -1674,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out; reg->hr_bdev = I_BDEV(filp->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); if (ret) { reg->hr_bdev = NULL; goto out; @@ -1949,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) goto bail; } + reg->hr_debug_pinned = + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, + reg->hr_debug_dir, + &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, + 0, 0, reg); + if (!reg->hr_debug_pinned) { + mlog_errno(ret); + goto bail; + } + ret = 0; bail: return ret; @@ -2003,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, { struct task_struct *hb_task; struct o2hb_region *reg = to_o2hb_region(item); + int quorum_region = 0; /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); if (o2hb_global_heartbeat_active()) { clear_bit(reg->hr_region_num, o2hb_region_bitmap); clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + quorum_region = 1; + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); } hb_task = reg->hr_task; reg->hr_task = NULL; + reg->hr_item_dropped = 1; spin_unlock(&o2hb_live_lock); if (hb_task) @@ -2029,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (o2hb_global_heartbeat_active()) printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", config_item_name(&reg->hr_item)); + config_item_put(item); + + if (!o2hb_global_heartbeat_active() || !quorum_region) + return; + + /* + * If global heartbeat active and there are dependent users, + * pin all regions if 
quorum region count <= CUT_OFF + */ + spin_lock(&o2hb_live_lock); + + if (!o2hb_dependent_users) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); } struct o2hb_heartbeat_group_attribute { @@ -2215,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, } EXPORT_SYMBOL_GPL(o2hb_setup_callback); -static struct o2hb_region *o2hb_find_region(const char *region_uuid) +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only pin the matching region. In global we pin all the active + * regions. + */ +static int o2hb_region_pin(const char *region_uuid) { - struct o2hb_region *p, *reg = NULL; + int ret = 0, found = 0; + struct o2hb_region *reg; + char *uuid; assert_spin_locked(&o2hb_live_lock); - list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { - if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { - reg = p; - break; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(&reg->hr_item); + + /* local heartbeat */ + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } + + if (reg->hr_item_pinned || reg->hr_item_dropped) + goto skip_pin; + + /* Ignore ENOENT only for local hb (userdlm domain) */ + ret = o2nm_depend_item(&reg->hr_item); + if (!ret) { + mlog(ML_CLUSTER, "Pin region %s\n", uuid); + reg->hr_item_pinned = 1; + } else { + if (ret == -ENOENT && found) + ret = 0; + else { + mlog(ML_ERROR, "Pin region %s fails with %d\n", + uuid, ret); + break; + } } +skip_pin: + if (found) + break; } - return reg; + return ret; } -static int o2hb_region_get(const char *region_uuid) +/* + * In local heartbeat mode, region_uuid passed matches the dlm domain name. + * In global heartbeat mode, region_uuid passed is NULL. + * + * In local, we only unpin the matching region. 
In global we unpin all the + * active regions. + */ +static void o2hb_region_unpin(const char *region_uuid) { - int ret = 0; struct o2hb_region *reg; + char *uuid; + int found = 0; - spin_lock(&o2hb_live_lock); + assert_spin_locked(&o2hb_live_lock); - reg = o2hb_find_region(region_uuid); - if (!reg) - ret = -ENOENT; - spin_unlock(&o2hb_live_lock); + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + uuid = config_item_name(&reg->hr_item); + if (region_uuid) { + if (strcmp(region_uuid, uuid)) + continue; + found = 1; + } - if (ret) - goto out; + if (reg->hr_item_pinned) { + mlog(ML_CLUSTER, "Unpin region %s\n", uuid); + o2nm_undepend_item(&reg->hr_item); + reg->hr_item_pinned = 0; + } + if (found) + break; + } +} - ret = o2nm_depend_this_node(); - if (ret) - goto out; +static int o2hb_region_inc_user(const char *region_uuid) +{ + int ret = 0; - ret = o2nm_depend_item(&reg->hr_item); - if (ret) - o2nm_undepend_this_node(); + spin_lock(&o2hb_live_lock); -out: + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + ret = o2hb_region_pin(region_uuid); + goto unlock; + } + + /* + * if global heartbeat active and this is the first dependent user, + * pin all regions if quorum region count <= CUT_OFF + */ + o2hb_dependent_users++; + if (o2hb_dependent_users > 1) + goto unlock; + + if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) + ret = o2hb_region_pin(NULL); + +unlock: + spin_unlock(&o2hb_live_lock); return ret; } -static void o2hb_region_put(const char *region_uuid) +void o2hb_region_dec_user(const char *region_uuid) { - struct o2hb_region *reg; - spin_lock(&o2hb_live_lock); - reg = o2hb_find_region(region_uuid); + /* local heartbeat */ + if (!o2hb_global_heartbeat_active()) { + o2hb_region_unpin(region_uuid); + goto unlock; + } - spin_unlock(&o2hb_live_lock); + /* + * if global heartbeat active and there are no dependent users, + * unpin all quorum regions + */ + o2hb_dependent_users--; + if (!o2hb_dependent_users) + 
o2hb_region_unpin(NULL); - if (reg) { - o2nm_undepend_item(&reg->hr_item); - o2nm_undepend_this_node(); - } +unlock: + spin_unlock(&o2hb_live_lock); } int o2hb_register_callback(const char *region_uuid, @@ -2292,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid, } if (region_uuid) { - ret = o2hb_region_get(region_uuid); - if (ret) + ret = o2hb_region_inc_user(region_uuid); + if (ret) { + mlog_errno(ret); goto out; + } } down_write(&o2hb_callback_sem); @@ -2312,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid, up_write(&o2hb_callback_sem); ret = 0; out: - mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", + mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n", ret, __builtin_return_address(0), hc); return ret; } @@ -2323,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid, { BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); - mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", + mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", __builtin_return_address(0), hc); /* XXX Can this happen _with_ a region reference? 
*/ @@ -2331,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid, return; if (region_uuid) - o2hb_region_put(region_uuid); + o2hb_region_dec_user(region_uuid); down_write(&o2hb_callback_sem); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index a3f150e52b0..3a5835904b3 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -46,10 +46,15 @@ #define O2NET_DEBUG_DIR "o2net" #define SC_DEBUG_NAME "sock_containers" #define NST_DEBUG_NAME "send_tracking" +#define STATS_DEBUG_NAME "stats" + +#define SHOW_SOCK_CONTAINERS 0 +#define SHOW_SOCK_STATS 1 static struct dentry *o2net_dentry; static struct dentry *sc_dentry; static struct dentry *nst_dentry; +static struct dentry *stats_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int nst_seq_show(struct seq_file *seq, void *v) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; + ktime_t now; + s64 sock, send, status; spin_lock(&o2net_debug_lock); nst = next_nst(dummy_nst); + if (!nst) + goto out; - if (nst != NULL) { - /* get_task_comm isn't exported. oh well. 
*/ - seq_printf(seq, "%p:\n" - " pid: %lu\n" - " tgid: %lu\n" - " process name: %s\n" - " node: %u\n" - " sc: %p\n" - " message id: %d\n" - " message type: %u\n" - " message key: 0x%08x\n" - " sock acquiry: %lu.%ld\n" - " send start: %lu.%ld\n" - " wait start: %lu.%ld\n", - nst, (unsigned long)nst->st_task->pid, - (unsigned long)nst->st_task->tgid, - nst->st_task->comm, nst->st_node, - nst->st_sc, nst->st_id, nst->st_msg_type, - nst->st_msg_key, - nst->st_sock_time.tv_sec, - (long)nst->st_sock_time.tv_usec, - nst->st_send_time.tv_sec, - (long)nst->st_send_time.tv_usec, - nst->st_status_time.tv_sec, - (long)nst->st_status_time.tv_usec); - } + now = ktime_get(); + sock = ktime_to_us(ktime_sub(now, nst->st_sock_time)); + send = ktime_to_us(ktime_sub(now, nst->st_send_time)); + status = ktime_to_us(ktime_sub(now, nst->st_status_time)); + + /* get_task_comm isn't exported. oh well. */ + seq_printf(seq, "%p:\n" + " pid: %lu\n" + " tgid: %lu\n" + " process name: %s\n" + " node: %u\n" + " sc: %p\n" + " message id: %d\n" + " message type: %u\n" + " message key: 0x%08x\n" + " sock acquiry: %lld usecs ago\n" + " send start: %lld usecs ago\n" + " wait start: %lld usecs ago\n", + nst, (unsigned long)task_pid_nr(nst->st_task), + (unsigned long)nst->st_task->tgid, + nst->st_task->comm, nst->st_node, + nst->st_sc, nst->st_id, nst->st_msg_type, + nst->st_msg_key, + (long long)sock, + (long long)send, + (long long)status); +out: spin_unlock(&o2net_debug_lock); return 0; @@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc) spin_unlock(&o2net_debug_lock); } +struct o2net_sock_debug { + int dbg_ctxt; + struct o2net_sock_container *dbg_sock; +}; + static struct o2net_sock_container *next_sc(struct o2net_sock_container *sc_start) { @@ -253,7 +268,8 @@ static struct o2net_sock_container static void *sc_seq_start(struct seq_file *seq, loff_t *pos) { - struct o2net_sock_container *sc, *dummy_sc = seq->private; + struct o2net_sock_debug *sd = seq->private; + struct 
o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); @@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos) static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct o2net_sock_container *sc, *dummy_sc = seq->private; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); @@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec +#ifdef CONFIG_OCFS2_FS_STATS +# define sc_send_count(_s) ((_s)->sc_send_count) +# define sc_recv_count(_s) ((_s)->sc_recv_count) +# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total)) +# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total)) +# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total)) +# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total)) +#else +# define sc_send_count(_s) (0U) +# define sc_recv_count(_s) (0U) +# define sc_tv_acquiry_total_ns(_s) (0LL) +# define sc_tv_send_total_ns(_s) (0LL) +# define sc_tv_status_total_ns(_s) (0LL) +# define sc_tv_process_total_ns(_s) (0LL) +#endif + +/* So that debugfs.ocfs2 can determine which format is being used */ +#define O2NET_STATS_STR_VERSION 1 +static void sc_show_sock_stats(struct seq_file *seq, + struct o2net_sock_container *sc) +{ + if (!sc) + return; + + seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION, + sc->sc_node->nd_num, (unsigned long)sc_send_count(sc), + (long long)sc_tv_acquiry_total_ns(sc), + (long long)sc_tv_send_total_ns(sc), + (long long)sc_tv_status_total_ns(sc), + (unsigned long)sc_recv_count(sc), + (long long)sc_tv_process_total_ns(sc)); +} + +static void sc_show_sock_container(struct seq_file *seq, + struct 
o2net_sock_container *sc) +{ + struct inet_sock *inet = NULL; + __be32 saddr = 0, daddr = 0; + __be16 sport = 0, dport = 0; + + if (!sc) + return; + + if (sc->sc_sock) { + inet = inet_sk(sc->sc_sock->sk); + /* the stack's structs aren't sparse endian clean */ + saddr = (__force __be32)inet->inet_saddr; + daddr = (__force __be32)inet->inet_daddr; + sport = (__force __be16)inet->inet_sport; + dport = (__force __be16)inet->inet_dport; + } + + /* XXX sigh, inet-> doesn't have sparse annotation so any + * use of it here generates a warning with -Wbitwise */ + seq_printf(seq, "%p:\n" + " krefs: %d\n" + " sock: %pI4:%u -> " + "%pI4:%u\n" + " remote node: %s\n" + " page off: %zu\n" + " handshake ok: %u\n" + " timer: %lld usecs\n" + " data ready: %lld usecs\n" + " advance start: %lld usecs\n" + " advance stop: %lld usecs\n" + " func start: %lld usecs\n" + " func stop: %lld usecs\n" + " func key: 0x%08x\n" + " func type: %u\n", + sc, + atomic_read(&sc->sc_kref.refcount), + &saddr, inet ? ntohs(sport) : 0, + &daddr, inet ? 
ntohs(dport) : 0, + sc->sc_node->nd_name, + sc->sc_page_off, + sc->sc_handshake_ok, + (long long)ktime_to_us(sc->sc_tv_timer), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop), + sc->sc_msg_key, + sc->sc_msg_type); +} static int sc_seq_show(struct seq_file *seq, void *v) { - struct o2net_sock_container *sc, *dummy_sc = seq->private; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; spin_lock(&o2net_debug_lock); sc = next_sc(dummy_sc); - if (sc != NULL) { - struct inet_sock *inet = NULL; - - __be32 saddr = 0, daddr = 0; - __be16 sport = 0, dport = 0; - - if (sc->sc_sock) { - inet = inet_sk(sc->sc_sock->sk); - /* the stack's structs aren't sparse endian clean */ - saddr = (__force __be32)inet->inet_saddr; - daddr = (__force __be32)inet->inet_daddr; - sport = (__force __be16)inet->inet_sport; - dport = (__force __be16)inet->inet_dport; - } - - /* XXX sigh, inet-> doesn't have sparse annotation so any - * use of it here generates a warning with -Wbitwise */ - seq_printf(seq, "%p:\n" - " krefs: %d\n" - " sock: %pI4:%u -> " - "%pI4:%u\n" - " remote node: %s\n" - " page off: %zu\n" - " handshake ok: %u\n" - " timer: %lu.%ld\n" - " data ready: %lu.%ld\n" - " advance start: %lu.%ld\n" - " advance stop: %lu.%ld\n" - " func start: %lu.%ld\n" - " func stop: %lu.%ld\n" - " func key: %u\n" - " func type: %u\n", - sc, - atomic_read(&sc->sc_kref.refcount), - &saddr, inet ? ntohs(sport) : 0, - &daddr, inet ? 
ntohs(dport) : 0, - sc->sc_node->nd_name, - sc->sc_page_off, - sc->sc_handshake_ok, - TV_SEC_USEC(sc->sc_tv_timer), - TV_SEC_USEC(sc->sc_tv_data_ready), - TV_SEC_USEC(sc->sc_tv_advance_start), - TV_SEC_USEC(sc->sc_tv_advance_stop), - TV_SEC_USEC(sc->sc_tv_func_start), - TV_SEC_USEC(sc->sc_tv_func_stop), - sc->sc_msg_key, - sc->sc_msg_type); + if (sc) { + if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS) + sc_show_sock_container(seq, sc); + else + sc_show_sock_stats(seq, sc); } - spin_unlock(&o2net_debug_lock); return 0; @@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = { .show = sc_seq_show, }; -static int sc_fop_open(struct inode *inode, struct file *file) +static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) { struct o2net_sock_container *dummy_sc; struct seq_file *seq; @@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file) goto out; seq = file->private_data; - seq->private = dummy_sc; + seq->private = sd; + sd->dbg_sock = dummy_sc; o2net_debug_add_sc(dummy_sc); dummy_sc = NULL; @@ -382,12 +442,48 @@ out: static int sc_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - struct o2net_sock_container *dummy_sc = seq->private; + struct o2net_sock_debug *sd = seq->private; + struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); return seq_release_private(inode, file); } +static int stats_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_debug *sd; + + sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); + if (sd == NULL) + return -ENOMEM; + + sd->dbg_ctxt = SHOW_SOCK_STATS; + sd->dbg_sock = NULL; + + return sc_common_open(file, sd); +} + +static const struct file_operations stats_seq_fops = { + .open = stats_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +static int sc_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_debug *sd; + + sd = 
kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); + if (sd == NULL) + return -ENOMEM; + + sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; + sd->dbg_sock = NULL; + + return sc_common_open(file, sd); +} + static const struct file_operations sc_seq_fops = { .open = sc_fop_open, .read = seq_read, @@ -419,25 +515,29 @@ int o2net_debugfs_init(void) goto bail; } + stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &stats_seq_fops); + if (!stats_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + return 0; bail: - if (sc_dentry) - debugfs_remove(sc_dentry); - if (nst_dentry) - debugfs_remove(nst_dentry); - if (o2net_dentry) - debugfs_remove(o2net_dentry); + debugfs_remove(stats_dentry); + debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); return -ENOMEM; } void o2net_debugfs_exit(void) { - if (sc_dentry) - debugfs_remove(sc_dentry); - if (nst_dentry) - debugfs_remove(nst_dentry); - if (o2net_dentry) - debugfs_remove(o2net_dentry); + debugfs_remove(stats_dentry); + debugfs_remove(sc_dentry); + debugfs_remove(nst_dentry); + debugfs_remove(o2net_dentry); } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index cf3e1669621..a87366750f2 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -325,5 +325,7 @@ void o2quo_init(void) void o2quo_exit(void) { - flush_scheduled_work(); + struct o2quo_state *qs = &o2quo_state; + + flush_work_sync(&qs->qs_work); } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 9aa426e4212..3b11cb1e38f 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, nst->st_node = node; } -static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) { - do_gettimeofday(&nst->st_sock_time); + nst->st_sock_time = 
ktime_get(); } -static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { - do_gettimeofday(&nst->st_send_time); + nst->st_send_time = ktime_get(); } -static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { - do_gettimeofday(&nst->st_status_time); + nst->st_status_time = ktime_get(); } -static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) { nst->st_sc = sc; } -static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) { nst->st_id = msg_id; } -#else /* CONFIG_DEBUG_FS */ - -static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) +static inline void o2net_set_sock_timer(struct o2net_sock_container *sc) { + sc->sc_tv_timer = ktime_get(); } -static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc) { + sc->sc_tv_data_ready = ktime_get(); } -static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc) { + sc->sc_tv_advance_start = ktime_get(); } -static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc) { + sc->sc_tv_advance_stop = ktime_get(); } -static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) +static inline void o2net_set_func_start_time(struct o2net_sock_container *sc) { + 
sc->sc_tv_func_start = ktime_get(); } -static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, - u32 msg_id) +static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc) { + sc->sc_tv_func_stop = ktime_get(); } +static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) +{ + return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); +} +#else /* CONFIG_DEBUG_FS */ +# define o2net_init_nst(a, b, c, d, e) +# define o2net_set_nst_sock_time(a) +# define o2net_set_nst_send_time(a) +# define o2net_set_nst_status_time(a) +# define o2net_set_nst_sock_container(a, b) +# define o2net_set_nst_msg_id(a, b) +# define o2net_set_sock_timer(a) +# define o2net_set_data_ready_time(a) +# define o2net_set_advance_start_time(a) +# define o2net_set_advance_stop_time(a) +# define o2net_set_func_start_time(a) +# define o2net_set_func_stop_time(a) +# define o2net_get_func_run_time(a) (ktime_t)0 #endif /* CONFIG_DEBUG_FS */ +#ifdef CONFIG_OCFS2_FS_STATS +static void o2net_update_send_stats(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ + sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, + ktime_sub(ktime_get(), + nst->st_status_time)); + sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, + ktime_sub(nst->st_status_time, + nst->st_send_time)); + sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, + ktime_sub(nst->st_send_time, + nst->st_sock_time)); + sc->sc_send_count++; +} + +static void o2net_update_recv_stats(struct o2net_sock_container *sc) +{ + sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, + o2net_get_func_run_time(sc)); + sc->sc_recv_count++; +} + +#else + +# define o2net_update_send_stats(a, b) + +# define o2net_update_recv_stats(sc) + +#endif /* CONFIG_OCFS2_FS_STATS */ + static inline int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; @@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref) sc->sc_sock = NULL; } + 
o2nm_undepend_item(&sc->sc_node->nd_item); o2nm_node_put(sc->sc_node); sc->sc_node = NULL; @@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) { struct o2net_sock_container *sc, *ret = NULL; struct page *page = NULL; + int status = 0; page = alloc_page(GFP_NOFS); sc = kzalloc(sizeof(*sc), GFP_NOFS); @@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) o2nm_node_get(node); sc->sc_node = node; + /* pin the node item of the remote node */ + status = o2nm_depend_item(&node->nd_item); + if (status) { + mlog_errno(status); + o2nm_node_put(node); + goto out; + } INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); @@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes) if (sk->sk_user_data) { struct o2net_sock_container *sc = sk->sk_user_data; sclog(sc, "data_ready hit\n"); - do_gettimeofday(&sc->sc_tv_data_ready); + o2net_set_data_ready_time(sc); o2net_sc_queue_work(sc, &sc->sc_rx_work); ready = sc->sc_data_ready; } else { @@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_status_time(&nst); wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); + o2net_update_send_stats(&nst, sc); + /* Note that we avoid overwriting the callers status return * variable if a system error was reported on the other * side. Callers beware. 
*/ @@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc, if (syserr != O2NET_ERR_NONE) goto out_respond; - do_gettimeofday(&sc->sc_tv_func_start); + o2net_set_func_start_time(sc); sc->sc_msg_key = be32_to_cpu(hdr->key); sc->sc_msg_type = be16_to_cpu(hdr->msg_type); handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len), nmh->nh_func_data, &ret_data); - do_gettimeofday(&sc->sc_tv_func_stop); + o2net_set_func_stop_time(sc); + + o2net_update_recv_stats(sc); out_respond: /* this destroys the hdr, so don't use it after this */ @@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) size_t datalen; sclog(sc, "receiving\n"); - do_gettimeofday(&sc->sc_tv_advance_start); + o2net_set_advance_start_time(sc); if (unlikely(sc->sc_handshake_ok == 0)) { if(sc->sc_page_off < sizeof(struct o2net_handshake)) { @@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc) out: sclog(sc, "ret = %d\n", ret); - do_gettimeofday(&sc->sc_tv_advance_stop); + o2net_set_advance_stop_time(sc); return ret; } @@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data) { struct o2net_sock_container *sc = (struct o2net_sock_container *)data; struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); - struct timeval now; - do_gettimeofday(&now); +#ifdef CONFIG_DEBUG_FS + ktime_t now = ktime_get(); +#endif printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); - mlog(ML_NOTICE, "here are some times that might help debug the " - "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " - "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", - sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, - now.tv_sec, (long) now.tv_usec, - sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, - sc->sc_tv_advance_start.tv_sec, - 
(long) sc->sc_tv_advance_start.tv_usec, - sc->sc_tv_advance_stop.tv_sec, - (long) sc->sc_tv_advance_stop.tv_usec, + +#ifdef CONFIG_DEBUG_FS + mlog(ML_NOTICE, "Here are some times that might help debug the " + "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " + "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", + (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), + (long long)ktime_to_us(sc->sc_tv_data_ready), + (long long)ktime_to_us(sc->sc_tv_advance_start), + (long long)ktime_to_us(sc->sc_tv_advance_stop), sc->sc_msg_key, sc->sc_msg_type, - sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, - sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); + (long long)ktime_to_us(sc->sc_tv_func_start), + (long long)ktime_to_us(sc->sc_tv_func_stop)); +#endif /* * Initialize the nn_timeout so that the next connection attempt @@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, msecs_to_jiffies(o2net_keepalive_delay())); - do_gettimeofday(&sc->sc_tv_timer); + o2net_set_sock_timer(sc); mod_timer(&sc->sc_idle_timeout, jiffies + msecs_to_jiffies(o2net_idle_timeout())); } diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 15fdbdf9eb4..4cbcb65784a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -166,18 +166,27 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); void (*sc_data_ready)(struct sock *sk, int bytes); -#ifdef CONFIG_DEBUG_FS - struct list_head sc_net_debug_item; -#endif - struct timeval sc_tv_timer; - struct timeval sc_tv_data_ready; - struct timeval sc_tv_advance_start; - struct timeval sc_tv_advance_stop; - struct timeval sc_tv_func_start; - struct timeval sc_tv_func_stop; + u32 sc_msg_key; u16 sc_msg_type; +#ifdef 
CONFIG_DEBUG_FS + struct list_head sc_net_debug_item; + ktime_t sc_tv_timer; + ktime_t sc_tv_data_ready; + ktime_t sc_tv_advance_start; + ktime_t sc_tv_advance_stop; + ktime_t sc_tv_func_start; + ktime_t sc_tv_func_stop; +#endif +#ifdef CONFIG_OCFS2_FS_STATS + ktime_t sc_tv_acquiry_total; + ktime_t sc_tv_send_total; + ktime_t sc_tv_status_total; + u32 sc_send_count; + u32 sc_recv_count; + ktime_t sc_tv_process_total; +#endif struct mutex sc_send_lock; }; @@ -220,9 +229,9 @@ struct o2net_send_tracking { u32 st_msg_type; u32 st_msg_key; u8 st_node; - struct timeval st_sock_time; - struct timeval st_send_time; - struct timeval st_status_time; + ktime_t st_sock_time; + ktime_t st_send_time; + ktime_t st_status_time; }; #else struct o2net_send_tracking { diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 895532ac4d9..6d80ecc7834 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -52,9 +52,15 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) static int ocfs2_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode; int ret = 0; /* if all else fails, just return false */ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + osb = OCFS2_SB(dentry->d_sb); mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); @@ -169,23 +175,25 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, struct list_head *p; struct dentry *dentry = NULL; - spin_lock(&dcache_lock); - + spin_lock(&inode->i_lock); list_for_each(p, &inode->i_dentry) { dentry = list_entry(p, struct dentry, d_alias); + spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { mlog(0, "dentry found: %.*s\n", dentry->d_name.len, dentry->d_name.name); - dget_locked(dentry); + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); break; } + 
spin_unlock(&dentry->d_lock); dentry = NULL; } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); return dentry; } diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index f4499915683..3a3ed4bb794 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - mlog_entry_void(); + struct dlm_lock_resource *res; BUG_ON(!dlm); BUG_ON(!lock); + res = lock->lockres; + assert_spin_locked(&dlm->ast_lock); + if (!list_empty(&lock->ast_list)) { - mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", + mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, " + "AST list not empty, pending %d, newlevel %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), lock->ast_pending, lock->ml.type); BUG(); } if (lock->ast_pending) - mlog(0, "lock has an ast getting flushed right now\n"); + mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); /* putting lock on list, add a ref */ dlm_lock_get(lock); @@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) /* check to see if this ast obsoletes the bast */ if (dlm_should_cancel_bast(dlm, lock)) { - struct dlm_lock_resource *res = lock->lockres; - mlog(0, "%s: cancelling bast for %.*s\n", - dlm->name, res->lockname.len, res->lockname.name); + mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); lock->bast_pending = 0; list_del_init(&lock->bast_list); 
lock->ml.highest_blocked = LKM_IVMODE; @@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - mlog_entry_void(); - BUG_ON(!dlm); BUG_ON(!lock); @@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - mlog_entry_void(); + struct dlm_lock_resource *res; BUG_ON(!dlm); BUG_ON(!lock); + assert_spin_locked(&dlm->ast_lock); + res = lock->lockres; + BUG_ON(!list_empty(&lock->bast_list)); if (lock->bast_pending) - mlog(0, "lock has a bast getting flushed right now\n"); + mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); /* putting lock on list, add a ref */ dlm_lock_get(lock); @@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { - mlog_entry_void(); - BUG_ON(!dlm); BUG_ON(!lock); @@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, dlm_astlockfunc_t *fn; struct dlm_lockstatus *lksb; - mlog_entry_void(); + mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); lksb = lock->lksb; fn = lock->ast; @@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lockstatus *lksb; int lksbflags; - mlog_entry_void(); + mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); lksb = lock->lksb; 
BUG_ON(lock->ml.node == dlm->node_num); @@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, { dlm_bastlockfunc_t *fn = lock->bast; - mlog_entry_void(); BUG_ON(lock->ml.node != dlm->node_num); + mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + blocked_type); + (*fn)(lock->astdata, blocked_type); } @@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, /* cannot get a proxy ast message if this node owns it */ BUG_ON(res->owner == dlm->node_num); - mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -382,8 +407,12 @@ do_ast: if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, &res->granted); - mlog(0, "ast: Adding to granted list... 
type=%d, " - "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); + mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + lock->ml.type, lock->ml.convert_type); + if (lock->ml.convert_type != LKM_IVMODE) { lock->ml.type = lock->ml.convert_type; lock->ml.convert_type = LKM_IVMODE; @@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, size_t veclen = 1; int status; - mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", - res->lockname.len, res->lockname.name, lock->ml.node, - msg_type, blocked_type); + mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name, + res->lockname.len, res->lockname.name, lock->ml.node, msg_type, + blocked_type); memset(&past, 0, sizeof(struct dlm_proxy_ast)); past.node_idx = dlm->node_num; @@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, vec[0].iov_len = sizeof(struct dlm_proxy_ast); vec[0].iov_base = &past; if (flags & DLM_LKSB_GET_LVB) { - mlog(0, "returning requested LVB data\n"); be32_add_cpu(&past.flags, LKM_GET_LVB); vec[1].iov_len = DLM_LVB_LEN; vec[1].iov_base = lock->lksb->lvb; @@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, lock->ml.node, &status); if (ret < 0) - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " - "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, + mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n", + dlm->name, res->lockname.len, res->lockname.name, ret, lock->ml.node); else { if (status == DLM_RECOVERING) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index b36d0bf77a5..4bdf7baee34 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -50,10 +50,10 @@ 
#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_mle_type { - DLM_MLE_BLOCK, - DLM_MLE_MASTER, - DLM_MLE_MIGRATION, - DLM_MLE_NUM_TYPES + DLM_MLE_BLOCK = 0, + DLM_MLE_MASTER = 1, + DLM_MLE_MIGRATION = 2, + DLM_MLE_NUM_TYPES = 3, }; struct dlm_master_list_entry { @@ -82,8 +82,8 @@ struct dlm_master_list_entry { enum dlm_ast_type { DLM_AST = 0, - DLM_BAST, - DLM_ASTUNLOCK + DLM_BAST = 1, + DLM_ASTUNLOCK = 2, }; @@ -119,9 +119,9 @@ struct dlm_recovery_ctxt enum dlm_ctxt_state { DLM_CTXT_NEW = 0, - DLM_CTXT_JOINED, - DLM_CTXT_IN_SHUTDOWN, - DLM_CTXT_LEAVING, + DLM_CTXT_JOINED = 1, + DLM_CTXT_IN_SHUTDOWN = 2, + DLM_CTXT_LEAVING = 3, }; struct dlm_ctxt @@ -388,8 +388,8 @@ struct dlm_lock enum dlm_lockres_list { DLM_GRANTED_LIST = 0, - DLM_CONVERTING_LIST, - DLM_BLOCKED_LIST + DLM_CONVERTING_LIST = 1, + DLM_BLOCKED_LIST = 2, }; static inline int dlm_lvb_is_empty(char *lvb) @@ -427,27 +427,27 @@ struct dlm_node_iter enum { - DLM_MASTER_REQUEST_MSG = 500, - DLM_UNUSED_MSG1, /* 501 */ - DLM_ASSERT_MASTER_MSG, /* 502 */ - DLM_CREATE_LOCK_MSG, /* 503 */ - DLM_CONVERT_LOCK_MSG, /* 504 */ - DLM_PROXY_AST_MSG, /* 505 */ - DLM_UNLOCK_LOCK_MSG, /* 506 */ - DLM_DEREF_LOCKRES_MSG, /* 507 */ - DLM_MIGRATE_REQUEST_MSG, /* 508 */ - DLM_MIG_LOCKRES_MSG, /* 509 */ - DLM_QUERY_JOIN_MSG, /* 510 */ - DLM_ASSERT_JOINED_MSG, /* 511 */ - DLM_CANCEL_JOIN_MSG, /* 512 */ - DLM_EXIT_DOMAIN_MSG, /* 513 */ - DLM_MASTER_REQUERY_MSG, /* 514 */ - DLM_LOCK_REQUEST_MSG, /* 515 */ - DLM_RECO_DATA_DONE_MSG, /* 516 */ - DLM_BEGIN_RECO_MSG, /* 517 */ - DLM_FINALIZE_RECO_MSG, /* 518 */ - DLM_QUERY_REGION, /* 519 */ - DLM_QUERY_NODEINFO, /* 520 */ + DLM_MASTER_REQUEST_MSG = 500, + DLM_UNUSED_MSG1 = 501, + DLM_ASSERT_MASTER_MSG = 502, + DLM_CREATE_LOCK_MSG = 503, + DLM_CONVERT_LOCK_MSG = 504, + DLM_PROXY_AST_MSG = 505, + DLM_UNLOCK_LOCK_MSG = 506, + DLM_DEREF_LOCKRES_MSG = 507, + DLM_MIGRATE_REQUEST_MSG = 508, + DLM_MIG_LOCKRES_MSG = 509, + DLM_QUERY_JOIN_MSG = 510, + DLM_ASSERT_JOINED_MSG = 511, 
+ DLM_CANCEL_JOIN_MSG = 512, + DLM_EXIT_DOMAIN_MSG = 513, + DLM_MASTER_REQUERY_MSG = 514, + DLM_LOCK_REQUEST_MSG = 515, + DLM_RECO_DATA_DONE_MSG = 516, + DLM_BEGIN_RECO_MSG = 517, + DLM_FINALIZE_RECO_MSG = 518, + DLM_QUERY_REGION = 519, + DLM_QUERY_NODEINFO = 520, }; struct dlm_reco_node_data @@ -460,19 +460,19 @@ struct dlm_reco_node_data enum { DLM_RECO_NODE_DATA_DEAD = -1, DLM_RECO_NODE_DATA_INIT = 0, - DLM_RECO_NODE_DATA_REQUESTING, - DLM_RECO_NODE_DATA_REQUESTED, - DLM_RECO_NODE_DATA_RECEIVING, - DLM_RECO_NODE_DATA_DONE, - DLM_RECO_NODE_DATA_FINALIZE_SENT, + DLM_RECO_NODE_DATA_REQUESTING = 1, + DLM_RECO_NODE_DATA_REQUESTED = 2, + DLM_RECO_NODE_DATA_RECEIVING = 3, + DLM_RECO_NODE_DATA_DONE = 4, + DLM_RECO_NODE_DATA_FINALIZE_SENT = 5, }; enum { DLM_MASTER_RESP_NO = 0, - DLM_MASTER_RESP_YES, - DLM_MASTER_RESP_MAYBE, - DLM_MASTER_RESP_ERROR + DLM_MASTER_RESP_YES = 1, + DLM_MASTER_RESP_MAYBE = 2, + DLM_MASTER_RESP_ERROR = 3, }; @@ -649,9 +649,9 @@ struct dlm_proxy_ast #define DLM_MOD_KEY (0x666c6172) enum dlm_query_join_response_code { JOIN_DISALLOW = 0, - JOIN_OK, - JOIN_OK_NO_MAP, - JOIN_PROTOCOL_MISMATCH, + JOIN_OK = 1, + JOIN_OK_NO_MAP = 2, + JOIN_PROTOCOL_MISMATCH = 3, }; struct dlm_query_join_packet { diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 272ec8631a5..04a32be0aeb 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc) kref_get(&dc->debug_refcnt); } -static struct debug_buffer *debug_buffer_allocate(void) +static int debug_release(struct inode *inode, struct file *file) { - struct debug_buffer *db = NULL; - - db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL); - if (!db) - goto bail; - - db->len = PAGE_SIZE; - db->buf = kmalloc(db->len, GFP_KERNEL); - if (!db->buf) - goto bail; - - return db; -bail: - kfree(db); - return NULL; -} - -static ssize_t debug_buffer_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - 
struct debug_buffer *db = file->private_data; - - return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len); -} - -static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence) -{ - struct debug_buffer *db = file->private_data; - loff_t new = -1; - - switch (whence) { - case 0: - new = off; - break; - case 1: - new = file->f_pos + off; - break; - } - - if (new < 0 || new > db->len) - return -EINVAL; - - return (file->f_pos = new); + free_page((unsigned long)file->private_data); + return 0; } -static int debug_buffer_release(struct inode *inode, struct file *file) +static ssize_t debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) { - struct debug_buffer *db = file->private_data; - - if (db) - kfree(db->buf); - kfree(db); - - return 0; + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); } /* end - util funcs */ /* begin - purge list funcs */ -static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) { struct dlm_lock_resource *res; int out = 0; unsigned long total = 0; - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Dumping Purgelist for Domain: %s\n", dlm->name); spin_lock(&dlm->spinlock); list_for_each_entry(res, &dlm->purge_list, purge) { ++total; - if (db->len - out < 100) + if (len - out < 100) continue; spin_lock(&res->spinlock); out += stringify_lockname(res->lockname.name, res->lockname.len, - db->buf + out, db->len - out); - out += snprintf(db->buf + out, db->len - out, "\t%ld\n", + buf + out, len - out); + out += snprintf(buf + out, len - out, "\t%ld\n", (jiffies - res->last_used)/HZ); spin_unlock(&res->spinlock); } spin_unlock(&dlm->spinlock); - out += snprintf(db->buf + out, db->len - out, - "Total on list: %ld\n", total); + out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); return out; } 
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) static int debug_purgelist_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - struct debug_buffer *db; + char *buf = NULL; - db = debug_buffer_allocate(); - if (!db) + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) goto bail; - db->len = debug_purgelist_print(dlm, db); + i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1)); - file->private_data = db; + file->private_data = buf; return 0; bail: @@ -480,14 +434,14 @@ bail: static const struct file_operations debug_purgelist_fops = { .open = debug_purgelist_open, - .release = debug_buffer_release, - .read = debug_buffer_read, - .llseek = debug_buffer_llseek, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, }; /* end - purge list funcs */ /* begin - debug mle funcs */ -static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) { struct dlm_master_list_entry *mle; struct hlist_head *bucket; @@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) int i, out = 0; unsigned long total = 0, longest = 0, bucket_count = 0; - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Dumping MLEs for Domain: %s\n", dlm->name); spin_lock(&dlm->master_lock); @@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) master_hash_node); ++total; ++bucket_count; - if (db->len - out < 200) + if (len - out < 200) continue; - out += dump_mle(mle, db->buf + out, db->len - out); + out += dump_mle(mle, buf + out, len - out); } longest = max(longest, bucket_count); bucket_count = 0; } spin_unlock(&dlm->master_lock); - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Total: %ld, Longest: %ld\n", total, longest); return out; } 
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) static int debug_mle_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - struct debug_buffer *db; + char *buf = NULL; - db = debug_buffer_allocate(); - if (!db) + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) goto bail; - db->len = debug_mle_print(dlm, db); + i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1)); - file->private_data = db; + file->private_data = buf; return 0; bail: @@ -540,9 +494,9 @@ bail: static const struct file_operations debug_mle_fops = { .open = debug_mle_open, - .release = debug_buffer_release, - .read = debug_buffer_read, - .llseek = debug_buffer_llseek, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, }; /* end - debug mle funcs */ @@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = { /* end - debug lockres funcs */ /* begin - debug state funcs */ -static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) { int out = 0; struct dlm_reco_node_data *node; @@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) } /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Domain: %s Key: 0x%08x Protocol: %d.%d\n", dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); /* Thread Pid: xxx Node: xxx State: xxxxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Thread Pid: %d Node: %d State: %s\n", - dlm->dlm_thread_task->pid, dlm->node_num, state); + task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); /* Number of Joins: xxx Joining Node: xxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Number of Joins: %d Joining 
Node: %d\n", dlm->num_joins, dlm->joining_node); /* Domain Map: xx xx xx */ - out += snprintf(db->buf + out, db->len - out, "Domain Map: "); + out += snprintf(buf + out, len - out, "Domain Map: "); out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, - db->buf + out, db->len - out); - out += snprintf(db->buf + out, db->len - out, "\n"); + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); /* Live Map: xx xx xx */ - out += snprintf(db->buf + out, db->len - out, "Live Map: "); + out += snprintf(buf + out, len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, - db->buf + out, db->len - out); - out += snprintf(db->buf + out, db->len - out, "\n"); + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); /* Lock Resources: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Lock Resources: %d (%d)\n", atomic_read(&dlm->res_cur_count), atomic_read(&dlm->res_tot_count)); @@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) cur_mles += atomic_read(&dlm->mle_cur_count[i]); /* MLEs: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "MLEs: %d (%d)\n", cur_mles, tot_mles); /* Blocking: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, " Blocking: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); /* Mastery: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, " Mastery: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); /* Migration: xxx (xxx) */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, " Migration: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), 
atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Lists: Dirty=%s Purge=%s PendingASTs=%s " "PendingBASTs=%s\n", (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), @@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); /* Purge Count: xxx Refs: xxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Purge Count: %d Refs: %d\n", dlm->purge_count, atomic_read(&dlm->dlm_refs.refcount)); /* Dead Node: xxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Dead Node: %d\n", dlm->reco.dead_node); /* What about DLM_RECO_STATE_FINALIZE? */ @@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) state = "INACTIVE"; /* Recovery Pid: xxxx Master: xxx State: xxxx */ - out += snprintf(db->buf + out, db->len - out, + out += snprintf(buf + out, len - out, "Recovery Pid: %d Master: %d State: %s\n", - dlm->dlm_reco_thread_task->pid, + task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, state); /* Recovery Map: xx xx */ - out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); + out += snprintf(buf + out, len - out, "Recovery Map: "); out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, - db->buf + out, db->len - out); - out += snprintf(db->buf + out, db->len - out, "\n"); + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); /* Recovery Node State: */ - out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); + out += snprintf(buf + out, len - out, "Recovery Node State:\n"); list_for_each_entry(node, &dlm->reco.node_data, list) { switch (node->state) { case DLM_RECO_NODE_DATA_INIT: @@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, 
struct debug_buffer *db) state = "BAD"; break; } - out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", + out += snprintf(buf + out, len - out, "\t%u - %s\n", node->node_num, state); } @@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) static int debug_state_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - struct debug_buffer *db = NULL; + char *buf = NULL; - db = debug_buffer_allocate(); - if (!db) + buf = (char *) get_zeroed_page(GFP_NOFS); + if (!buf) goto bail; - db->len = debug_state_print(dlm, db); + i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1)); - file->private_data = db; + file->private_data = buf; return 0; bail: @@ -936,9 +890,9 @@ bail: static const struct file_operations debug_state_fops = { .open = debug_state_open, - .release = debug_buffer_release, - .read = debug_buffer_read, - .llseek = debug_buffer_llseek, + .release = debug_release, + .read = debug_read, + .llseek = generic_file_llseek, }; /* end - debug state funcs */ @@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm) struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; if (dc) { - if (dc->debug_purgelist_dentry) - debugfs_remove(dc->debug_purgelist_dentry); - if (dc->debug_mle_dentry) - debugfs_remove(dc->debug_mle_dentry); - if (dc->debug_lockres_dentry) - debugfs_remove(dc->debug_lockres_dentry); - if (dc->debug_state_dentry) - debugfs_remove(dc->debug_state_dentry); + debugfs_remove(dc->debug_purgelist_dentry); + debugfs_remove(dc->debug_mle_dentry); + debugfs_remove(dc->debug_lockres_dentry); + debugfs_remove(dc->debug_state_dentry); dlm_debug_put(dc); } } @@ -1040,8 +990,7 @@ bail: void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { - if (dlm->dlm_debugfs_subroot) - debugfs_remove(dlm->dlm_debugfs_subroot); + debugfs_remove(dlm->dlm_debugfs_subroot); } /* debugfs root */ @@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void) void 
dlm_destroy_debugfs_root(void) { - if (dlm_debugfs_root) - debugfs_remove(dlm_debugfs_root); + debugfs_remove(dlm_debugfs_root); } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 8c686d22f9c..1f27c4812d1 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -37,11 +37,6 @@ struct dlm_debug_ctxt { struct dentry *debug_purgelist_dentry; }; -struct debug_buffer { - int len; - char *buf; -}; - struct debug_lockres { int dl_len; char *dl_buf; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index cc2aaa96cfe..7e38a072d72 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -460,8 +460,6 @@ redo_bucket: } cond_resched_lock(&dlm->spinlock); num += n; - mlog(0, "%s: touched %d lockreses in bucket %d " - "(tot=%d)\n", dlm->name, n, i, num); } spin_unlock(&dlm->spinlock); wake_up(&dlm->dlm_thread_wq); @@ -1661,8 +1659,8 @@ bail: static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) { - o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); - o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up); + o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down); o2net_unregister_handler_list(&dlm->dlm_domain_handlers); } @@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); - status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); if (status) goto bail; o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); - status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); if (status) goto bail; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 69cf369961c..7009292aac5 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ 
b/fs/ocfs2/dlm/dlmlock.c @@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; + if (!dlm_lock_compatible(tmplock->ml.convert_type, + lock->ml.type)) + return 0; } return 1; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 2211acf33d9..1d6d1d22c47 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); - assert_spin_locked(&dlm->spinlock); assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ if (list_empty(&res->purge)) { - mlog(0, "putting lockres %.*s:%p onto purge list\n", - res->lockname.len, res->lockname.name, res); + mlog(0, "%s: Adding res %.*s to purge list\n", + dlm->name, res->lockname.len, res->lockname.name); res->last_used = jiffies; dlm_lockres_get(res); @@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, dlm->purge_count++; } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", - res->lockname.len, res->lockname.name, res, res->owner); + mlog(0, "%s: Removing res %.*s from purge list\n", + dlm->name, res->lockname.len, res->lockname.name); list_del_init(&res->purge); dlm_lockres_put(res); @@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, master = (res->owner == dlm->node_num); - - mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, - res->lockname.name, master); + mlog(0, "%s: Purging res %.*s, master %d\n", 
dlm->name, + res->lockname.len, res->lockname.name, master); if (!master) { res->state |= DLM_LOCK_RES_DROPPING_REF; @@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, /* clear our bit from the master's refmap, ignore errors */ ret = dlm_drop_lockres_ref(dlm, res); if (ret < 0) { - mlog_errno(ret); + mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, + res->lockname.len, res->lockname.name, ret); if (!dlm_is_host_down(ret)) BUG(); } - mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", - dlm->name, res->lockname.len, res->lockname.name, ret); spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); } if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s:%p from purgelist, " - "master = %d\n", res->lockname.len, res->lockname.name, - res, master); + mlog(0, "%s: Removing res %.*s from purgelist, master %d\n", + dlm->name, res->lockname.len, res->lockname.name, master); list_del_init(&res->purge); dlm_lockres_put(res); dlm->purge_count--; } if (!__dlm_lockres_unused(res)) { - mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", dlm->name, res->lockname.len, res->lockname.name); __dlm_print_one_lock_resource(res); BUG(); @@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, unused = __dlm_lockres_unused(lockres); if (!unused || (lockres->state & DLM_LOCK_RES_MIGRATING)) { - mlog(0, "lockres %s:%.*s: is in use or " - "being remastered, used %d, state %d\n", - dlm->name, lockres->lockname.len, - lockres->lockname.name, !unused, lockres->state); + mlog(0, "%s: res %.*s is in use or being remastered, " + "used %d, state %d\n", dlm->name, + lockres->lockname.len, lockres->lockname.name, + !unused, lockres->state); list_move_tail(&dlm->purge_list, &lockres->purge); spin_unlock(&lockres->spinlock); continue; @@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, struct list_head *head; int can_grant = 1; - //mlog(0, 
"res->lockname.len=%d\n", res->lockname.len); - //mlog(0, "res->lockname.name=%p\n", res->lockname.name); - //mlog(0, "shuffle res %.*s\n", res->lockname.len, - // res->lockname.name); - - /* because this function is called with the lockres + /* + * Because this function is called with the lockres * spinlock, and because we know that it is not migrating/ * recovering/in-progress, it is fine to reserve asts and - * basts right before queueing them all throughout */ + * basts right before queueing them all throughout + */ assert_spin_locked(&dlm->ast_lock); assert_spin_locked(&res->spinlock); BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| @@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, converting: if (list_empty(&res->converting)) goto blocked; - mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, - res->lockname.name); + mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name, + res->lockname.len, res->lockname.name); target = list_entry(res->converting.next, struct dlm_lock, list); if (target->ml.convert_type == LKM_IVMODE) { - mlog(ML_ERROR, "%.*s: converting a lock with no " - "convert_type!\n", res->lockname.len, res->lockname.name); + mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n", + dlm->name, res->lockname.len, res->lockname.name); BUG(); } head = &res->granted; @@ -365,9 +356,12 @@ converting: spin_lock(&target->spinlock); BUG_ON(target->ml.highest_blocked != LKM_IVMODE); - mlog(0, "calling ast for converting lock: %.*s, have: %d, " - "granting: %d, node: %u\n", res->lockname.len, - res->lockname.name, target->ml.type, + mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type " + "%d => %d, node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), + target->ml.type, target->ml.convert_type, target->ml.node); target->ml.type = target->ml.convert_type; @@ -428,11 
+422,14 @@ blocked: spin_lock(&target->spinlock); BUG_ON(target->ml.highest_blocked != LKM_IVMODE); - mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " - "node: %u\n", res->lockname.len, res->lockname.name, + mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)), target->ml.type, target->ml.node); - // target->ml.type is already correct + /* target->ml.type is already correct */ list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); @@ -453,7 +450,6 @@ leave: /* must have NO locks when calling this with res !=NULL * */ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - mlog_entry("dlm=%p, res=%p\n", dlm, res); if (res) { spin_lock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - mlog_entry("dlm=%p, res=%p\n", dlm, res); - assert_spin_locked(&dlm->spinlock); assert_spin_locked(&res->spinlock); @@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) res->state |= DLM_LOCK_RES_DIRTY; } } + + mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); } /* Launch the NM thread for the mounted volume */ int dlm_launch_thread(struct dlm_ctxt *dlm) { - mlog(0, "starting dlm thread...\n"); + mlog(0, "Starting dlm_thread...\n"); dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); if (IS_ERR(dlm->dlm_thread_task)) { @@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) void dlm_complete_thread(struct dlm_ctxt *dlm) { if (dlm->dlm_thread_task) { - mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); + mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n"); 
kthread_stop(dlm->dlm_thread_task); dlm->dlm_thread_task = NULL; } @@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* get an extra ref on lock */ dlm_lock_get(lock); res = lock->lockres; - mlog(0, "delivering an ast for this lockres\n"); + mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, " + "node %u\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.node); BUG_ON(!lock->ast_pending); @@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* possible that another ast was queued while * we were delivering the last one */ if (!list_empty(&lock->ast_list)) { - mlog(0, "aha another ast got queued while " - "we were finishing the last one. will " - "keep the ast_pending flag set.\n"); + mlog(0, "%s: res %.*s, AST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); } else lock->ast_pending = 0; @@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) dlm_lock_put(lock); spin_unlock(&dlm->ast_lock); - mlog(0, "delivering a bast for this lockres " - "(blocked = %d\n", hi); + mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, " + "blocked %d, node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + hi, lock->ml.node); if (lock->ml.node != dlm->node_num) { ret = dlm_send_proxy_bast(dlm, res, lock, hi); @@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) /* possible that another bast was queued while * we were delivering the last one */ if (!list_empty(&lock->bast_list)) { - mlog(0, "aha another bast got queued while " - "we were finishing the last one. 
will " - "keep the bast_pending flag set.\n"); + mlog(0, "%s: res %.*s, BAST queued while flushing last " + "one\n", dlm->name, res->lockname.len, + res->lockname.name); } else lock->bast_pending = 0; @@ -675,11 +681,12 @@ static int dlm_thread(void *data) spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { __dlm_print_one_lock_resource(res); - mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", - res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", - res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", - res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", - res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d," + " dirty %d\n", dlm->name, + !!(res->state & DLM_LOCK_RES_IN_PROGRESS), + !!(res->state & DLM_LOCK_RES_MIGRATING), + !!(res->state & DLM_LOCK_RES_RECOVERING), + !!(res->state & DLM_LOCK_RES_DIRTY)); } BUG_ON(res->owner != dlm->node_num); @@ -693,8 +700,8 @@ static int dlm_thread(void *data) res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); spin_unlock(&dlm->ast_lock); - mlog(0, "delaying list shuffling for in-" - "progress lockres %.*s, state=%d\n", + mlog(0, "%s: res %.*s, inprogress, delay list " + "shuffle, state %d\n", dlm->name, res->lockname.len, res->lockname.name, res->state); delay = 1; @@ -706,10 +713,6 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. 
*/ - mlog(0, "calling dlm_shuffle_lists with dlm=%s, " - "res=%.*s\n", dlm->name, - res->lockname.len, res->lockname.name); - /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); res->state &= ~DLM_LOCK_RES_DIRTY; @@ -733,7 +736,8 @@ in_progress: /* unlikely, but we may need to give time to * other tasks */ if (!--n) { - mlog(0, "throttling dlm_thread\n"); + mlog(0, "%s: Throttling dlm thread\n", + dlm->name); break; } } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b2df490a19e..8c5c0eddc36 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) return &ip->ip_vfs_inode; } -static void dlmfs_destroy_inode(struct inode *inode) +static void dlmfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); } +static void dlmfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, dlmfs_i_callback); +} + static void dlmfs_evict_inode(struct inode *inode) { int status; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 19ad145d2af..5dbc3062b4f 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -137,9 +137,7 @@ check_gen: } result = d_obtain_alias(inode); - if (!IS_ERR(result)) - result->d_op = &ocfs2_dentry_ops; - else + if (IS_ERR(result)) mlog_errno(PTR_ERR(result)); bail: @@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) } parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); - if (!IS_ERR(parent)) - parent->d_op = &ocfs2_dentry_ops; bail_unlock: ocfs2_inode_unlock(dir, 0); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f6cba566429..a6651956482 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1307,10 +1307,13 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask) +int ocfs2_permission(struct inode *inode, int 
mask, unsigned int flags) { int ret; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + mlog_entry_void(); ret = ocfs2_inode_lock(inode, NULL, 0); @@ -1320,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask) goto out; } - ret = generic_permission(inode, mask, ocfs2_check_acl); + ret = generic_permission(inode, mask, flags, ocfs2_check_acl); ocfs2_inode_unlock(inode, 0); out: @@ -1986,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); } -static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_space_resv sr; int change_size = 1; + int cmd = OCFS2_IOC_RESVSP64; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - if (mode & FALLOC_FL_KEEP_SIZE) change_size = 0; + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = OCFS2_IOC_UNRESVSP64; + sr.l_whence = 0; sr.l_start = (s64)offset; sr.l_len = (s64)len; - return __ocfs2_change_file_space(NULL, inode, offset, - OCFS2_IOC_RESVSP64, &sr, change_size); + return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, + change_size); } int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, @@ -2603,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, - .fallocate = ocfs2_fallocate, .fiemap = ocfs2_fiemap, }; @@ -2635,6 +2641,7 @@ const struct file_operations ocfs2_fops = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops = { 
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7..f5afbbef670 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int ocfs2_permission(struct inode *inode, int mask); +int ocfs2_permission(struct inode *inode, int mask, unsigned int flags); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f935fd6600d..4068c6c4c6f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -434,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, * #1 and #2 can be simply solved by never taking the lock * here for system files (which are the only type we read * during mount). It's a heavier approach, but our main - * concern is user-accesible files anyway. + * concern is user-accessible files anyway. * * #3 works itself out because we'll eventually take the * cluster lock before trusting anything anyway. 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ff5744e1e36..849fb4a2e81 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, spin_unlock(&oi->ip_lock); bail_add: - dentry->d_op = &ocfs2_dentry_ops; ret = d_splice_alias(inode, dentry); if (inode) { @@ -415,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir, mlog_errno(status); goto leave; } - dentry->d_op = &ocfs2_dentry_ops; status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, @@ -743,7 +741,6 @@ static int ocfs2_link(struct dentry *old_dentry, } ihold(inode); - dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); out_commit: @@ -1017,8 +1014,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, * An error return must mean that no cluster locks * were held on function exit. */ - if (oi1->ip_blkno != oi2->ip_blkno) + if (oi1->ip_blkno != oi2->ip_blkno) { ocfs2_inode_unlock(inode2, 1); + brelse(*bh2); + *bh2 = NULL; + } if (status != -ENOENT) mlog_errno(status); @@ -1794,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir, mlog_errno(status); goto bail; } - dentry->d_op = &ocfs2_dentry_ops; status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, @@ -2459,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto out_commit; } - dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); status = 0; out_commit: diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 70dd3b1798f..51cd6898e7f 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -420,6 +420,11 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; + /* + * How many clusters in our truncate log. + * It must be protected by osb_tl_inode->i_mutex. 
+ */ + unsigned int truncated_clusters; struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 5fed60de763..71998d4d61d 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1916,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used - * allocation group. This helps us mantain some + * allocation group. This helps us maintain some * contiguousness across allocations. */ status = ocfs2_search_one_group(ac, handle, bits_wanted, min_bits, res, &bits_left); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index cfeab7ce369..38f986d2447 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -569,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void ocfs2_destroy_inode(struct inode *inode) +static void ocfs2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } +static void ocfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ocfs2_i_callback); +} + static unsigned long long ocfs2_max_file_offset(unsigned int bbits, unsigned int cbits) { @@ -986,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) } /* Handle quota on quotactl */ -static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, - char *path) +static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) { unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; @@ -1006,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type) } static const struct quotactl_ops ocfs2_quotactl_ops = { - .quota_on = ocfs2_quota_on, + .quota_on_meta = 
ocfs2_quota_on, .quota_off = ocfs2_quota_off, .quota_sync = dquot_quota_sync, .get_info = dquot_get_dqinfo, @@ -2090,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; + sb->s_d_op = &ocfs2_dentry_ops; sb->s_export_op = &ocfs2_export_ops; sb->s_qcop = &ocfs2_quotactl_ops; sb->dq_op = &ocfs2_quota_operations; diff --git a/fs/open.c b/fs/open.c index 4197b9ed023..e52389e1f05 100644 --- a/fs/open.c +++ b/fs/open.c @@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + /* Punch hole must have keep size set */ + if ((mode & FALLOC_FL_PUNCH_HOLE) && + !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; if (!(file->f_mode & FMODE_WRITE)) @@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - if (!inode->i_op->fallocate) + if (!file->f_op->fallocate) return -EOPNOTSUPP; - return inode->i_op->fallocate(inode, mode, offset, len); + return file->f_op->fallocate(file, mode, offset, len); } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 911e61f348f..a2a5bff774e 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void openprom_destroy_inode(struct inode *inode) +static void openprom_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(op_inode_cachep, OP_I(inode)); } +static void openprom_destroy_inode(struct inode *inode) +{ + 
call_rcu(&inode->i_rcu, openprom_i_callback); +} + static struct inode *openprom_iget(struct super_block *sb, ino_t ino) { struct inode *inode; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 0a8b0ad0c7e..9c21119512b 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } +ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%d\n", p->policy ? 1 : 0); +} + ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev, static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, NULL); @@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, + &dev_attr_ro.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, @@ -372,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } +void __delete_partition(struct hd_struct *part) +{ + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; @@ -390,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + hd_struct_put(part); } static ssize_t whole_disk_show(struct 
device *dev, @@ -489,6 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); + hd_ref_init(p); return p; out_free_info: @@ -507,65 +522,6 @@ out_put: return ERR_PTR(err); } -/* Not exported, helper to add_disk(). */ -void register_disk(struct gendisk *disk) -{ - struct device *ddev = disk_to_dev(disk); - struct block_device *bdev; - struct disk_part_iter piter; - struct hd_struct *part; - int err; - - ddev->parent = disk->driverfs_dev; - - dev_set_name(ddev, disk->disk_name); - - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); - - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - - /* No minors to use for partitions */ - if (!disk_partitionable(disk)) - goto exit; - - /* No such device (e.g., media were just removed) */ - if (!get_capacity(disk)) - goto exit; - - bdev = bdget_disk(disk, 0); - if (!bdev) - goto exit; - - bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ); - if (err < 0) - goto exit; - blkdev_put(bdev, FMODE_READ); - -exit: - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); -} - static bool disk_unlock_native_capacity(struct gendisk *disk) { const struct block_device_operations *bdops = disk->fops; @@ -728,33 +684,3 @@ fail: } EXPORT_SYMBOL(read_dev_sector); - -void del_gendisk(struct gendisk *disk) -{ - struct disk_part_iter 
piter; - struct hd_struct *part; - - /* invalidate stuff */ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); - while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); - delete_partition(disk, part->partno); - } - disk_part_iter_exit(&piter); - - invalidate_partition(disk, 0); - blk_free_devt(disk_to_dev(disk)->devt); - set_capacity(disk, 0); - disk->flags &= ~GENHD_FL_UP; - unlink_gendisk(disk); - part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; - if (!sysfs_deprecated) - sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); - device_del(disk_to_dev(disk)); -} diff --git a/fs/pipe.c b/fs/pipe.c index 04629f36e39..da42f7db50d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -441,7 +441,7 @@ redo: break; } if (do_wakeup) { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } pipe_wait(pipe); @@ -450,7 +450,7 @@ redo: /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) @@ -612,7 +612,7 @@ redo2: break; } if (do_wakeup) { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } @@ -623,7 +623,7 @@ redo2: out: mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) @@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw) if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible_sync(&pipe->wait); + wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } @@ -999,12 +999,11 @@ struct file *create_write_pipe(int flags) goto err; err = -ENOMEM; - path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); + path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); if (!path.dentry) goto err_inode; path.mnt = mntget(pipe_mnt); - path.dentry->d_op = &pipefs_dentry_operations; d_instantiate(path.dentry, inode); err = -ENFILE; @@ -1253,6 +1252,10 @@ out: return ret; } +static const struct super_operations pipefs_ops = { + .destroy_inode = free_inode_nonrcu, +}; + /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. 
So we don't need @@ -1262,7 +1265,8 @@ out: static struct dentry *pipefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); + return mount_pseudo(fs_type, "pipe:", &pipefs_ops, + &pipefs_dentry_operations, PIPEFS_MAGIC); } static struct file_system_type pipe_fs_type = { diff --git a/fs/pnode.c b/fs/pnode.c index 8066b8dd748..d42514e3238 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -288,7 +288,7 @@ out: */ static inline int do_refcount_check(struct vfsmount *mnt, int count) { - int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts; + int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts; return (mycount > count); } @@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count) * Check if any of these mounts that **do not have submounts** * have more references than 'refcnt'. If so return busy. * - * vfsmount lock must be held for read or write + * vfsmount lock must be held for write */ int propagate_mount_busy(struct vfsmount *mnt, int refcnt) { diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 6a0068841d9..15af6222f8a 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -1,5 +1,5 @@ config PROC_FS - bool "/proc file system support" if EMBEDDED + bool "/proc file system support" if EXPERT default y help This is a virtual file system providing information about the status @@ -40,7 +40,7 @@ config PROC_VMCORE Exports the dump image of crashed kernel in ELF format. 
config PROC_SYSCTL - bool "Sysctl support (/proc/sys)" if EMBEDDED + bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS select SYSCTL default y @@ -61,7 +61,7 @@ config PROC_SYSCTL config PROC_PAGE_MONITOR default y depends on PROC_FS && MMU - bool "Enable /proc page monitoring" if EMBEDDED + bool "Enable /proc page monitoring" if EXPERT help Various /proc files exist to monitor process memory utilization: /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 2758e2afc51..df434c5f28f 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -10,6 +10,7 @@ proc-$(CONFIG_MMU) := mmu.o task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ proc_tty.o proc-y += cmdline.o +proc-y += consoles.o proc-y += cpuinfo.o proc-y += devices.o proc-y += interrupts.o diff --git a/fs/proc/array.c b/fs/proc/array.c index fff6572676a..df2b703b9d0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) get_task_comm(tcomm, p); - seq_printf(m, "Name:\t"); + seq_puts(m, "Name:\t"); end = m->buf + m->size; buf = m->buf + m->count; name = tcomm; @@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) buf++; } m->count = buf - m->buf; - seq_printf(m, "\n"); + seq_putc(m, '\n'); } /* @@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d ", GROUP_AT(group_info, g)); put_cred(cred); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } static void render_sigset_t(struct seq_file *m, const char *header, @@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header, { int i; - seq_printf(m, "%s", header); + seq_puts(m, header); i = _NSIG; do { @@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header, seq_printf(m, "%x", x); } while (i >= 4); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } static 
void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, @@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header, { unsigned __capi; - seq_printf(m, "%s", header); + seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); } static inline void task_cap(struct seq_file *m, struct task_struct *p) @@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Cpus_allowed:\t"); + seq_puts(m, "Cpus_allowed:\t"); seq_cpumask(m, &task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); + seq_putc(m, '\n'); + seq_puts(m, "Cpus_allowed_list:\t"); seq_cpumask_list(m, &task->cpus_allowed); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, @@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; + unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; struct mm_struct *mm = get_task_mm(task); if (mm) { size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); } - seq_printf(m, "%d %d %d %d %d %d %d\n", - size, resident, shared, text, lib, data, 0); + seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + size, resident, shared, text, data); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 182845147fe..9d096e82b20 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < 32; i++) { - if (task->latency_record[i].backtrace[0]) { + struct 
latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { int q; - seq_printf(m, "%i %li %li ", - task->latency_record[i].count, - task->latency_record[i].time, - task->latency_record[i].max); + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_SYMBOL_LEN]; - char *c; - if (!task->latency_record[i].backtrace[q]) + unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (task->latency_record[i].backtrace[q] == ULONG_MAX) + if (bt == ULONG_MAX) break; - sprint_symbol(sym, task->latency_record[i].backtrace[q]); - c = strchr(sym, '+'); - if (c) - *c = 0; - seq_printf(m, "%s ", sym); + seq_printf(m, " %ps", (void *)bt); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); } } @@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v) static int proc_single_open(struct inode *inode, struct file *filp) { - int ret; - ret = single_open(filp, proc_single_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; + return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { @@ -1164,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1177,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. 
@@ -1386,9 +1375,77 @@ sched_write(struct file *file, const char __user *buf, static int sched_open(struct inode *inode, struct file *filp) { + return single_open(filp, sched_show, inode); +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + long nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = strict_strtol(strstrip(buffer), 0, &nice); + if (err) + return -EINVAL; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = nice; + err = proc_sched_autogroup_set_nice(p, &err); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ int ret; - ret = single_open(filp, sched_show, NULL); + ret = single_open(filp, sched_autogroup_show, NULL); if (!ret) { struct seq_file *m = filp->private_data; @@ -1397,15 +1454,15 @@ static int sched_open(struct inode *inode, struct file *filp) return ret; } -static const struct file_operations proc_pid_sched_operations = { - .open = sched_open, +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, .read = seq_read, - .write = 
sched_write, + .write = sched_autogroup_write, .llseek = seq_lseek, .release = single_release, }; -#endif +#endif /* CONFIG_SCHED_AUTOGROUP */ static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) @@ -1454,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v) static int comm_open(struct inode *inode, struct file *filp) { - int ret; - - ret = single_open(filp, comm_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; + return single_open(filp, comm_show, inode); } static const struct file_operations proc_pid_set_comm_operations = { @@ -1719,10 +1768,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; const struct cred *cred; + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { @@ -1744,7 +1799,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; } -static int pid_delete_dentry(struct dentry * dentry) +static int pid_delete_dentry(const struct dentry * dentry) { /* Is the task we represent dead? 
* If so, then don't put the dentry on the lru list, @@ -1888,12 +1943,19 @@ static int proc_fd_link(struct inode *inode, struct path *path) static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int fd = proc_fd(inode); + struct inode *inode; + struct task_struct *task; + int fd; struct files_struct *files; const struct cred *cred; + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + if (task) { files = get_files_struct(task); if (files) { @@ -1969,7 +2031,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; - dentry->d_op = &tid_fd_dentry_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, NULL)) @@ -2101,11 +2163,13 @@ static const struct file_operations proc_fd_operations = { * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). 
*/ -static int proc_fd_permission(struct inode *inode, int mask) +static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { int rv; - rv = generic_permission(inode, mask, NULL); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) @@ -2137,7 +2201,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; - dentry->d_op = &tid_fd_dentry_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, NULL)) @@ -2196,7 +2260,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, NULL)) @@ -2563,8 +2627,14 @@ static const struct pid_entry proc_base_stuff[] = { */ static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); if (task) { put_task_struct(task); return 1; @@ -2615,7 +2685,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - dentry->d_op = &proc_base_dentry_operations; + d_set_d_op(dentry, &proc_base_dentry_operations); d_add(dentry, inode); error = NULL; out: @@ -2733,6 +2803,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, 
proc_pid_sched_operations), #endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), @@ -2926,7 +2999,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ @@ -3169,7 +3242,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c new file mode 100644 index 00000000000..b701eaa482b --- /dev/null +++ b/fs/proc/consoles.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2010 Werner Fink, Jiri Slaby + * + * Licensed under GPLv2 + */ + +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/tty_driver.h> + +/* + * This is handler for /proc/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + static const struct { + short flag; + char name; + } con_flags[] = { + { CON_ENABLED, 'E' }, + { CON_CONSDEV, 'C' }, + { CON_BOOT, 'B' }, + { CON_PRINTBUFFER, 'p' }, + { CON_BRL, 'b' }, + { CON_ANYTIME, 'a' }, + }; + char flags[ARRAY_SIZE(con_flags) + 1]; + struct console *con = v; + unsigned int a; + int len; + dev_t dev = 0; + + if (con->device) { + const struct tty_driver *driver; + int index; + driver = con->device(con, &index); + if (driver) { + dev = MKDEV(driver->major, driver->minor_start); + 
dev += index; + } + } + + for (a = 0; a < ARRAY_SIZE(con_flags); a++) + flags[a] = (con->flags & con_flags[a].flag) ? + con_flags[a].name : ' '; + flags[a] = 0; + + seq_printf(m, "%s%d%n", con->name, con->index, &len); + len = 21 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 'U' : '-', + flags); + if (dev) + seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); + + seq_printf(m, "\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + console_lock(); + for_each_console(con) + if (off++ == *pos) + break; + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con = v; + ++*pos; + return con->next; +} + +static void c_stop(struct seq_file *m, void *v) +{ + console_unlock(); +} + +static const struct seq_operations consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +static int consoles_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &consoles_op); +} + +static const struct file_operations proc_consoles_operations = { + .open = consoles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_consoles_init(void) +{ + proc_create("consoles", 0, NULL, &proc_consoles_operations); + return 0; +} +module_init(proc_consoles_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 59ee7da959c..b14347167c3 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v) if (i < CHRDEV_MAJOR_HASH_SIZE) { if (i == 0) - seq_printf(f, "Character devices:\n"); + seq_puts(f, "Character devices:\n"); chrdev_show(f, i); } #ifdef CONFIG_BLOCK else { i -= CHRDEV_MAJOR_HASH_SIZE; if (i == 0) - seq_printf(f, "\nBlock devices:\n"); + seq_puts(f, "\nBlock devices:\n"); blkdev_show(f, i); } 
#endif diff --git a/fs/proc/generic.c b/fs/proc/generic.c index dd29f033766..01e07f2a188 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = { * smarter: we could keep a "volatile" flag in the * inode to indicate which ones to keep. */ -static int proc_delete_dentry(struct dentry * dentry) +static int proc_delete_dentry(const struct dentry * dentry) { return 1; } @@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, if (de->namelen != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - unsigned int ino; - - ino = de->low_ino; pde_get(de); spin_unlock(&proc_subdir_lock); error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); + inode = proc_get_inode(dir->i_sb, de); goto out_unlock; } } @@ -439,7 +436,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, out_unlock: if (inode) { - dentry->d_op = &proc_dentry_operations; + d_set_d_op(dentry, &proc_dentry_operations); d_add(dentry, inode); return NULL; } @@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { - unsigned int ino = de->low_ino; - - if (ino < PROC_DYNAMIC_FIRST) - return; - - release_inode_number(ino); + release_inode_number(de->low_ino); if (S_ISLNK(de->mode)) kfree(de->data); @@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) wait_for_completion(de->pde_unload_completion); - goto continue_removing; + spin_lock(&de->pde_unload_lock); } - spin_unlock(&de->pde_unload_lock); -continue_removing: - spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3ddb6068177..176ce4cda68 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -65,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb) return 
inode; } -static void proc_destroy_inode(struct inode *inode) +static void proc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; @@ -409,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, - struct proc_dir_entry *de) +struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode * inode; - inode = iget_locked(sb, ino); + inode = iget_locked(sb, de->low_ino); if (!inode) return NULL; if (inode->i_state & I_NEW) { @@ -464,7 +470,7 @@ int proc_fill_super(struct super_block *s) s->s_time_gran = 1; pde_get(&proc_root); - root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); + root_inode = proc_get_inode(s, &proc_root); if (!root_inode) goto out_no_root; root_inode->i_uid = 0; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 1f24a3eddd1..9ad561ded40 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock; struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); unsigned long task_vsize(struct mm_struct *); -int task_statm(struct mm_struct *, int *, int *, int *, int *); +unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, unsigned long *, unsigned long *); void task_mem(struct seq_file *, struct mm_struct *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) @@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde); extern struct vfsmount *proc_mnt; int 
proc_fill_super(struct super_block *); -struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); +struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); /* * These are generic /proc routines that use the internal diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6f37c391468..d245cb23dd7 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp) static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, - .llseek = generic_file_llseek, + .llseek = default_llseek, }; #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239cfd97..ed257d14156 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif , K(i.totalram), K(i.freeram), @@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + @@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif ); hugetlb_report_meminfo(m); diff --git a/fs/proc/page.c b/fs/proc/page.c index 3b8b4566033..6d8e6a9e93a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -40,7 +40,7 @@ 
static ssize_t kpagecount_read(struct file *file, char __user *buf, ppage = pfn_to_page(pfn); else ppage = NULL; - if (!ppage) + if (!ppage || PageSlab(ppage)) pcount = 0; else pcount = page_mapcount(ppage); @@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page) if (PageHuge(page)) u |= 1 << KPF_HUGE; - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - /* - * Caveats on high order pages: - * PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b652cb00906..09a1f92a34e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -5,6 +5,7 @@ #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/security.h> +#include <linux/namei.h> #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -120,7 +121,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; err = NULL; - dentry->d_op = &proc_sys_dentry_operations; + d_set_d_op(dentry, &proc_sys_dentry_operations); d_add(dentry, inode); out: @@ -201,7 +202,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent, dput(child); return -ENOMEM; } else { - child->d_op = &proc_sys_dentry_operations; + d_set_d_op(child, &proc_sys_dentry_operations); d_add(child, inode); } } else { @@ -294,7 +295,7 @@ out: return ret; } -static int proc_sys_permission(struct inode *inode, int mask) +static int proc_sys_permission(struct inode *inode, int 
mask,unsigned int flags) { /* * sysctl entries that are not writeable, @@ -304,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask) struct ctl_table *table; int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; @@ -389,23 +393,30 @@ static const struct inode_operations proc_sys_dir_operations = { static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; return !PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_delete(struct dentry *dentry) +static int proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, - struct qstr *name) +static int proc_sys_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct dentry *dentry = container_of(qstr, struct dentry, d_name); - if (qstr->len != name->len) + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + if (!inode) + return 0; + if (name->len != len) return 1; - if (memcmp(qstr->name, name->name, name->len)) + if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); + return !sysctl_is_seen(PROC_I(inode)->sysctl); } static const struct dentry_operations proc_sys_dentry_operations = { diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 83adcc86943..cb761f01030 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p, } switch (p->type) { case TTY_DRIVER_TYPE_SYSTEM: - seq_printf(m, "system"); + seq_puts(m, "system"); if (p->subtype == SYSTEM_TYPE_TTY) 
- seq_printf(m, ":/dev/tty"); + seq_puts(m, ":/dev/tty"); else if (p->subtype == SYSTEM_TYPE_SYSCONS) - seq_printf(m, ":console"); + seq_puts(m, ":console"); else if (p->subtype == SYSTEM_TYPE_CONSOLE) - seq_printf(m, ":vtmaster"); + seq_puts(m, ":vtmaster"); break; case TTY_DRIVER_TYPE_CONSOLE: - seq_printf(m, "console"); + seq_puts(m, "console"); break; case TTY_DRIVER_TYPE_SERIAL: - seq_printf(m, "serial"); + seq_puts(m, "serial"); break; case TTY_DRIVER_TYPE_PTY: if (p->subtype == PTY_TYPE_MASTER) - seq_printf(m, "pty:master"); + seq_puts(m, "pty:master"); else if (p->subtype == PTY_TYPE_SLAVE) - seq_printf(m, "pty:slave"); + seq_puts(m, "pty:slave"); else - seq_printf(m, "pty"); + seq_puts(m, "pty"); break; default: seq_printf(m, "type:%d.%d", p->type, p->subtype); @@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v) /* pseudo-drivers first */ seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); - seq_printf(m, "system:/dev/tty\n"); + seq_puts(m, "system:/dev/tty\n"); seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); - seq_printf(m, "system:console\n"); + seq_puts(m, "system:console\n"); #ifdef CONFIG_UNIX98_PTYS seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); - seq_printf(m, "system\n"); + seq_puts(m, "system\n"); #endif #ifdef CONFIG_VT seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); - seq_printf(m, "system:vtmaster\n"); + seq_puts(m, "system:vtmaster\n"); #endif } diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 37994737c98..62604be9f58 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v) { int i, j; - seq_printf(p, " "); + seq_puts(p, " "); for_each_possible_cpu(i) seq_printf(p, "CPU%-8d", i); - seq_printf(p, "\n"); + seq_putc(p, '\n'); for (i = 0; 
i < NR_SOFTIRQS; i++) { seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); - seq_printf(p, "\n"); + seq_putc(p, '\n'); } return 0; } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index e15a19c93ba..1cffa2b8a2f 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -126,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v) for (i = 0; i < NR_SOFTIRQS; i++) seq_printf(p, " %u", per_softirq_sums[i]); - seq_printf(p, "\n"); + seq_putc(p, '\n'); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c126c83b9a4..60b914860f8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm) return PAGE_SIZE * mm->total_vm; } -int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) { *shared = get_mm_counter(mm, MM_FILEPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) @@ -417,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v) "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -429,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v) mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? 
vma->vm_start : 0; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index cb6306e6384..b535d3e5d5f 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm) return vsize; } -int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) { struct vm_area_struct *vma; struct vm_region *region; struct rb_node *p; - int size = kobjsize(mm); + unsigned long size = kobjsize(mm); down_read(&mm->mmap_sem); for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 2367fb3f70b..74802bc5ded 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void) /* Do some basic Verification. */ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || (ehdr.e_type != ET_CORE) || - !vmcore_elf_check_arch(&ehdr) || + !vmcore_elf64_check_arch(&ehdr) || ehdr.e_ident[EI_CLASS] != ELFCLASS64 || ehdr.e_ident[EI_VERSION] != EV_CURRENT || ehdr.e_version != EV_CURRENT || diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index fcada42f1aa..e63b4171d58 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -425,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void qnx4_destroy_inode(struct inode *inode) +static void qnx4_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } +static void qnx4_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, qnx4_i_callback); +} + static void init_once(void *foo) { struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0fed41e6efc..a2a622e079f 100644 --- 
a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); void __quota_error(struct super_block *sb, const char *func, - const char *fmt, ...) + const char *fmt, ...) { - va_list args; - if (printk_ratelimit()) { + va_list args; + struct va_format vaf; + va_start(args, fmt); - printk(KERN_ERR "Quota error (device %s): %s: ", - sb->s_id, func); - vprintk(fmt, args); - printk("\n"); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "Quota error (device %s): %s: %pV\n", + sb->s_id, func, &vaf); + va_end(args); } } @@ -2185,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type) } EXPORT_SYMBOL(dquot_resume); -int dquot_quota_on_path(struct super_block *sb, int type, int format_id, - struct path *path) +int dquot_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) { int error = security_quota_on(path->dentry); if (error) @@ -2200,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id, DQUOT_LIMITS_ENABLED); return error; } -EXPORT_SYMBOL(dquot_quota_on_path); - -int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name) -{ - struct path path; - int error; - - error = kern_path(name, LOOKUP_FOLLOW, &path); - if (!error) { - error = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - } - return error; -} EXPORT_SYMBOL(dquot_quota_on); /* diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b299961e1ed..b34bdb25490 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -64,18 +64,15 @@ static int quota_sync_all(int type) } static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr) + struct path *path) { - char *pathname; - int ret = -ENOSYS; - - pathname = getname(addr); - if (IS_ERR(pathname)) - return PTR_ERR(pathname); - if (sb->s_qcop->quota_on) - ret = sb->s_qcop->quota_on(sb, type, id, pathname); - putname(pathname); - 
return ret; + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) + return -ENOSYS; + if (sb->s_qcop->quota_on_meta) + return sb->s_qcop->quota_on_meta(sb, type, id); + if (IS_ERR(path)) + return PTR_ERR(path); + return sb->s_qcop->quota_on(sb, type, id, path); } static int quota_getfmt(struct super_block *sb, int type, void __user *addr) @@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr) + void __user *addr, struct path *path) { int ret; @@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, switch (cmd) { case Q_QUOTAON: - return quota_quotaon(sb, type, cmd, id, addr); + return quota_quotaon(sb, type, cmd, id, path); case Q_QUOTAOFF: if (!sb->s_qcop->quota_off) return -ENOSYS; @@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, { uint cmds, type; struct super_block *sb = NULL; + struct path path, *pathp = NULL; int ret; cmds = cmd >> SUBCMDSHIFT; @@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, return -ENODEV; } + /* + * Path for quotaon has to be resolved before grabbing superblock + * because that gets s_umount sem which is also possibly needed by path + * resolution (think about autofs) and thus deadlocks could arise. 
+ */ + if (cmds == Q_QUOTAON) { + ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path); + if (ret) + pathp = ERR_PTR(ret); + else + pathp = &path; + } + sb = quotactl_block(special); if (IS_ERR(sb)) return PTR_ERR(sb); - ret = do_quotactl(sb, type, cmds, id, addr); + ret = do_quotactl(sb, type, cmds, id, addr, pathp); drop_super(sb); + if (pathp && !IS_ERR(pathp)) + path_put(pathp); return ret; } diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 9e48874eabc..e41c1becf09 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, return -ENOMEM; ret = read_blk(info, *blk, buf); if (ret < 0) { - quota_error(dquot->dq_sb, "Can't read quota data " - "block %u", blk); + quota_error(dquot->dq_sb, "Can't read quota data block %u", + *blk); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); @@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = write_blk(info, *blk, buf); if (ret < 0) - quota_error(dquot->dq_sb, "Can't write quota " - "tree block %u", blk); + quota_error(dquot->dq_sb, + "Can't write quota tree block %u", + *blk); } } out_buf: diff --git a/fs/read_write.c b/fs/read_write.c index 5d431bacbea..5520f8ad550 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -30,18 +30,9 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -static int -__negative_fpos_check(struct file *file, loff_t pos, size_t count) +static inline int unsigned_offsets(struct file *file) { - /* - * pos or pos+count is negative here, check overflow. - * too big "count" will be caught in rw_verify_area(). 
- */ - if ((pos < 0) && (pos + count < pos)) - return -EOVERFLOW; - if (file->f_mode & FMODE_UNSIGNED_OFFSET) - return 0; - return -EINVAL; + return file->f_mode & FMODE_UNSIGNED_OFFSET; } /** @@ -75,7 +66,7 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) break; } - if (offset < 0 && __negative_fpos_check(file, offset, 0)) + if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > inode->i_sb->s_maxbytes) return -EINVAL; @@ -152,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) offset += file->f_pos; } retval = -EINVAL; - if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { + if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -252,9 +243,13 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count if (unlikely((ssize_t) count < 0)) return retval; pos = *ppos; - if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { - retval = __negative_fpos_check(file, pos, count); - if (retval) + if (unlikely(pos < 0)) { + if (!unsigned_offsets(file)) + return retval; + if (count >= -pos) /* both values are in 0..LLONG_MAX */ + return -EOVERFLOW; + } else if (unlikely((loff_t) (pos + count) < 0)) { + if (!unsigned_offsets(file)) return retval; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index d31bce1a9f9..3eea859e699 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2551,8 +2551,6 @@ static int release_journal_dev(struct super_block *super, result = 0; if (journal->j_dev_bd != NULL) { - if (journal->j_dev_bd->bd_dev != super->s_dev) - bd_release(journal->j_dev_bd); result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); journal->j_dev_bd = NULL; } @@ -2570,7 +2568,7 @@ static int journal_init_dev(struct super_block *super, { int result; dev_t jdev; - fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; char 
b[BDEVNAME_SIZE]; result = 0; @@ -2584,7 +2582,10 @@ static int journal_init_dev(struct super_block *super, /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { - journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); + if (jdev == super->s_dev) + blkdev_mode &= ~FMODE_EXCL; + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, + journal); journal->j_dev_mode = blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); @@ -2593,22 +2594,14 @@ static int journal_init_dev(struct super_block *super, "cannot init journal device '%s': %i", __bdevname(jdev, b), result); return result; - } else if (jdev != super->s_dev) { - result = bd_claim(journal->j_dev_bd, journal); - if (result) { - blkdev_put(journal->j_dev_bd, blkdev_mode); - return result; - } - + } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); - } return 0; } journal->j_dev_mode = blkdev_mode; - journal->j_dev_bd = open_bdev_exclusive(jdev_name, - blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index adbc6f53851..45de98b5946 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...) 
//int print_mode, int first, int l va_list args; int mode, first, last; - va_start(args, bh); - if (!bh) { printk("print_block: buffer is NULL\n"); return; } + va_start(args, bh); + mode = va_arg(args, int); first = va_arg(args, int); last = va_arg(args, int); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b243117b875..0aab04f4682 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -529,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void reiserfs_destroy_inode(struct inode *inode) +static void reiserfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } +static void reiserfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, reiserfs_i_callback); +} + static void init_once(void *foo) { struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; @@ -625,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, char *); +static int reiserfs_quota_on(struct super_block *, int, int, struct path *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -2041,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; struct inode *inode; struct reiserfs_transaction_handle th; if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; /* 
Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { + if (path->mnt->mnt_sb != sb) { err = -EXDEV; goto out; } - inode = path.dentry->d_inode; + inode = path->dentry->d_inode; /* We must not pack tails for quota files on reiserfs for quota IO to work */ if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { err = reiserfs_unpack(inode, NULL); @@ -2075,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, /* Journaling quota? */ if (REISERFS_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) reiserfs_warning(sb, "super-6521", "Quota file not on filesystem root. " "Journalled quota will not work."); @@ -2094,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = dquot_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on(sb, type, format_id, path); out: - path_put(&path); return err; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 5d04a7828e7..3cfb2e93364 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -870,11 +870,14 @@ out: return err; } -static int reiserfs_check_acl(struct inode *inode, int mask) +static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags) { struct posix_acl *acl; int error = -EAGAIN; /* do regular unix permission checks by default */ + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); if (acl) { @@ -951,8 +954,10 @@ static int xattr_mount_check(struct super_block *s) return 0; } -int reiserfs_permission(struct inode *inode, int mask) +int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; /* * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. 
@@ -965,13 +970,16 @@ int reiserfs_permission(struct inode *inode, int mask) * Stat data v1 doesn't support ACLs. */ if (get_inode_sd_version(inode) != STAT_DATA_V1) - return generic_permission(inode, mask, reiserfs_check_acl); + return generic_permission(inode, mask, flags, + reiserfs_check_acl); #endif - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; return -EPERM; } @@ -990,7 +998,7 @@ int reiserfs_lookup_privroot(struct super_block *s) strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; - dentry->d_op = &xattr_lookup_poison_ops; + d_set_d_op(dentry, &xattr_lookup_poison_ops); if (dentry->d_inode) dentry->d_inode->i_flags |= S_PRIVATE; } else diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 6647f90e55c..2305e3121cb 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -400,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) /* * return a spent inode to the slab cache */ -static void romfs_destroy_inode(struct inode *inode) +static void romfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } +static void romfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, romfs_i_callback); +} + /* * get filesystem statistics */ diff --git a/fs/select.c b/fs/select.c index b7b10aa3086..e56560d2b08 100644 --- a/fs/select.c +++ b/fs/select.c @@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, rts.tv_sec = rts.tv_nsec = 0; if (timeval) { + if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) + memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; diff --git a/fs/splice.c b/fs/splice.c 
index ce2f02579e3..50a5d978da1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; loff_t pos = sd->pos; - int ret, more; - - ret = buf->ops->confirm(pipe, buf); - if (!ret) { - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - if (file->f_op && file->f_op->sendpage) - ret = file->f_op->sendpage(file, buf->page, buf->offset, - sd->len, &pos, more); - else - ret = -EINVAL; - } + int more; - return ret; + if (!likely(file->f_op && file->f_op->sendpage)) + return -EINVAL; + + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; + return file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); } /* @@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *fsdata; int ret; - /* - * make sure the data in this buffer is uptodate - */ - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; - offset = sd->pos & ~PAGE_CACHE_MASK; this_len = sd->len; @@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, if (sd->len > sd->total_len) sd->len = sd->total_len; - ret = actor(pipe, buf, sd); - if (ret <= 0) { + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; return ret; } + + ret = actor(pipe, buf, sd); + if (ret <= 0) + return ret; + buf->offset += ret; buf->len -= ret; @@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, int ret; void *data; - ret = buf->ops->confirm(pipe, buf); - if (ret) - return ret; - data = buf->ops->map(pipe, buf, 0); ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); buf->ops->unmap(pipe, buf, data); @@ -1495,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, char *src; int ret; - ret = buf->ops->confirm(pipe, buf); - if (unlikely(ret)) - return ret; - /* * 
See if we can use the atomic maps, by prefaulting in the * pages and doing an atomic copy diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index e5f63da64d0..aa68a8a3151 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -29,7 +29,6 @@ config SQUASHFS config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS - default n help Saying Y here includes support for extended attributes (xattrs). Xattrs are name:value pairs associated with inodes by @@ -40,7 +39,6 @@ config SQUASHFS_XATTR config SQUASHFS_LZO bool "Include support for LZO compressed file systems" depends on SQUASHFS - default n select LZO_DECOMPRESS help Saying Y here includes support for reading Squashfs file systems @@ -53,10 +51,24 @@ config SQUASHFS_LZO If unsure, say N. +config SQUASHFS_XZ + bool "Include support for XZ compressed file systems" + depends on SQUASHFS + select XZ_DEC + help + Saying Y here includes support for reading Squashfs file systems + compressed with XZ compresssion. XZ gives better compression than + the default zlib compression, at the expense of greater CPU and + memory overhead. + + XZ is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" depends on SQUASHFS - default n help Saying Y here allows you to specify cache size. 
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7672bac8d32..cecf2bea07a 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o +squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 653c030eb84..8ab48bc2fa7 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -34,7 +34,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" @@ -64,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb, *length = (unsigned char) bh->b_data[*offset] | (unsigned char) bh->b_data[*offset + 1] << 8; *offset += 2; + + if (*offset == msblk->devblksize) { + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *offset = 0; + } } return bh; diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 57314bee905..26b15ae34d6 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -55,7 +55,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 24af9ce9722..a5940e54c4d 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -27,7 +27,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "decompressor.h" #include "squashfs.h" @@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { }; #ifndef CONFIG_SQUASHFS_LZO -static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { +static const struct squashfs_decompressor squashfs_lzo_comp_ops = { NULL, NULL, NULL, 
LZO_COMPRESSION, "lzo", 0 }; #endif +#ifndef CONFIG_SQUASHFS_XZ +static const struct squashfs_decompressor squashfs_xz_comp_ops = { + NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 +}; +#endif + static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, - &squashfs_lzma_unsupported_comp_ops, -#ifdef CONFIG_SQUASHFS_LZO &squashfs_lzo_comp_ops, -#else - &squashfs_lzo_unsupported_comp_ops, -#endif + &squashfs_xz_comp_ops, + &squashfs_lzma_unsupported_comp_ops, &squashfs_unknown_comp_ops }; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 7425f80783f..3b305a70f7a 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk, return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, length, srclength, pages); } + +#ifdef CONFIG_SQUASHFS_XZ +extern const struct squashfs_decompressor squashfs_xz_comp_ops; +#endif + +#ifdef CONFIG_SQUASHFS_LZO +extern const struct squashfs_decompressor squashfs_lzo_comp_ops; +#endif + #endif diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index 7c90bbd6879..7eef571443c 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -39,7 +39,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index b7f64bcd2b7..d8f32452638 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -37,7 +37,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 5d87789bf1c..7da759e34c5 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -29,7 +29,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include 
"squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 5d45569d5f7..ba729d80887 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -27,11 +27,6 @@ #define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) -static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) -{ - return list_entry(inode, struct squashfs_inode_info, vfs_inode); -} - /* block.c */ extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, int, int); @@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[]; /* zlib_wrapper.c */ extern const struct squashfs_decompressor squashfs_zlib_comp_ops; - -/* lzo_wrapper.c */ -extern const struct squashfs_decompressor squashfs_lzo_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index c5137fc9ab1..39533feffd6 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -238,6 +238,7 @@ struct meta_index { #define ZLIB_COMPRESSION 1 #define LZMA_COMPRESSION 2 #define LZO_COMPRESSION 3 +#define XZ_COMPRESSION 4 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index d3e3a37f28a..359baefc01f 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -45,4 +45,10 @@ struct squashfs_inode_info { }; struct inode vfs_inode; }; + + +static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) +{ + return list_entry(inode, struct squashfs_inode_info, vfs_inode); +} #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 24de30ba34c..20700b9f2b4 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -440,11 +440,18 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb) } -static void squashfs_destroy_inode(struct inode *inode) +static void squashfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, 
i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); } +static void squashfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, squashfs_i_callback); +} + static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index d33be5dd6c3..05385dbe146 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -32,7 +32,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c new file mode 100644 index 00000000000..c4eb4001825 --- /dev/null +++ b/fs/squashfs/xz_wrapper.c @@ -0,0 +1,147 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * xz_wrapper.c + */ + + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/xz.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" + +struct squashfs_xz { + struct xz_dec *state; + struct xz_buf buf; +}; + +static void *squashfs_xz_init(struct squashfs_sb_info *msblk) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + + struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + + stream->state = xz_dec_init(XZ_PREALLOC, block_size); + if (stream->state == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate xz workspace\n"); + kfree(stream); + return NULL; +} + + +static void squashfs_xz_free(void *strm) +{ + struct squashfs_xz *stream = strm; + + if (stream) { + xz_dec_end(stream->state); + kfree(stream); + } +} + + +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + enum xz_ret xz_err; + int avail, total = 0, k = 0, page = 0; + struct squashfs_xz *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + xz_dec_reset(stream->state); + stream->buf.in_pos = 0; + stream->buf.in_size = 0; + stream->buf.out_pos = 0; + stream->buf.out_size = PAGE_CACHE_SIZE; + stream->buf.out = buffer[page++]; + + do { + if (stream->buf.in_pos == stream->buf.in_size && k < b) { + avail = min(length, msblk->devblksize - offset); + length -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + stream->buf.in = bh[k]->b_data + offset; + stream->buf.in_size = avail; + stream->buf.in_pos = 0; + offset = 0; + } + + if (stream->buf.out_pos == stream->buf.out_size + && page < pages) { + stream->buf.out = buffer[page++]; + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } + + xz_err = 
xz_dec_run(stream->state, &stream->buf); + + if (stream->buf.in_pos == stream->buf.in_size && k < b) + put_bh(bh[k++]); + } while (xz_err == XZ_OK); + + if (xz_err != XZ_STREAM_END) { + ERROR("xz_dec_run error, data probably corrupt\n"); + goto release_mutex; + } + + if (k < b) { + ERROR("xz_uncompress error, input remaining\n"); + goto release_mutex; + } + + total += stream->buf.out_pos; + mutex_unlock(&msblk->read_data_mutex); + return total; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_xz_comp_ops = { + .init = squashfs_xz_init, + .free = squashfs_xz_free, + .decompress = squashfs_xz_uncompress, + .id = XZ_COMPRESSION, + .name = "xz", + .supported = 1 +}; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 7a603874e48..4661ae2b1ce 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -29,7 +29,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" @@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, struct buffer_head **bh, int b, int offset, int length, int srclength, int pages) { - int zlib_err = 0, zlib_init = 0; - int avail, bytes, k = 0, page = 0; + int zlib_err, zlib_init = 0; + int k = 0, page = 0; z_stream *stream = msblk->stream; mutex_lock(&msblk->read_data_mutex); @@ -75,21 +74,14 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, stream->avail_out = 0; stream->avail_in = 0; - bytes = length; do { if (stream->avail_in == 0 && k < b) { - avail = min(bytes, msblk->devblksize - offset); - bytes -= avail; + int avail = min(length, msblk->devblksize - offset); + length -= avail; wait_on_buffer(bh[k]); if (!buffer_uptodate(bh[k])) goto release_mutex; - if (avail == 0) { - offset = 0; - put_bh(bh[k++]); - continue; - } - stream->next_in = bh[k]->b_data + 
offset; stream->avail_in = avail; offset = 0; @@ -128,6 +120,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, goto release_mutex; } + if (k < b) { + ERROR("zlib_uncompress error, data remaining\n"); + goto release_mutex; + } + length = stream->total_out; mutex_unlock(&msblk->read_data_mutex); return length; diff --git a/fs/stat.c b/fs/stat.c index 12e90e21390..d5c61cf2b70 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flag & AT_NO_AUTOMOUNT) + lookup_flags |= LOOKUP_NO_AUTOMOUNT; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) diff --git a/fs/super.c b/fs/super.c index ca696155cd9..74e149efed8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -30,6 +30,7 @@ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> +#include <linux/rculist_bl.h> #include "internal.h" @@ -71,7 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_files); #endif INIT_LIST_HEAD(&s->s_instances); - INIT_HLIST_HEAD(&s->s_anon); + INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); init_rwsem(&s->s_umount); @@ -766,13 +767,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -801,13 +802,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, /* * s_umount nests inside 
bd_mutex during - * __invalidate_device(). close_bdev_exclusive() - * acquires bd_mutex and can't be called under - * s_umount. Drop s_umount temporarily. This is safe - * as we're holding an active reference. + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. */ up_write(&s->s_umount); - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; @@ -831,7 +832,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); error: return ERR_PTR(error); } @@ -862,7 +863,8 @@ void kill_block_super(struct super_block *sb) bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); - close_bdev_exclusive(bdev, mode); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); + blkdev_put(bdev, mode | FMODE_EXCL); } EXPORT_SYMBOL(kill_block_super); diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig index f4b67588b9d..8c41feacbac 100644 --- a/fs/sysfs/Kconfig +++ b/fs/sysfs/Kconfig @@ -1,5 +1,5 @@ config SYSFS - bool "sysfs file system support" if EMBEDDED + bool "sysfs file system support" if EXPERT default y help The sysfs filesystem is a virtual filesystem that the kernel uses to diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 7e54bac8c4b..ea9120a830d 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) goto repeat; } -static int sysfs_dentry_delete(struct dentry *dentry) +static int sysfs_dentry_delete(const struct dentry *dentry) { struct sysfs_dirent *sd = dentry->d_fsdata; return !!(sd->s_flags & SYSFS_FLAG_REMOVED); @@ -239,9 +239,13 @@ static int sysfs_dentry_delete(struct dentry *dentry) static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct sysfs_dirent *sd = 
dentry->d_fsdata; + struct sysfs_dirent *sd; int is_dir; + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + sd = dentry->d_fsdata; mutex_lock(&sysfs_mutex); /* The sysfs dirent has been deleted */ @@ -701,7 +705,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, /* instantiate and hash dentry */ ret = d_find_alias(inode); if (!ret) { - dentry->d_op = &sysfs_dentry_ops; + d_set_d_op(dentry, &sysfs_dentry_ops); dentry->d_fsdata = sysfs_get(sd); d_add(dentry, inode); } else { diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 442f34ff1af..c8769dc222d 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -165,10 +165,7 @@ int sysfs_merge_group(struct kobject *kobj, struct attribute *const *attr; int i; - if (grp) - dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); - else - dir_sd = sysfs_get(kobj->sd); + dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); if (!dir_sd) return -ENOENT; @@ -195,10 +192,7 @@ void sysfs_unmerge_group(struct kobject *kobj, struct sysfs_dirent *dir_sd; struct attribute *const *attr; - if (grp) - dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); - else - dir_sd = sysfs_get(kobj->sd); + dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); if (dir_sd) { for (attr = grp->attrs; *attr; ++attr) sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index cffb1fd8ba3..0a12eb89cd3 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/xattr.h> #include <linux/security.h> #include "sysfs.h" @@ -348,13 +349,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha return -ENOENT; } -int sysfs_permission(struct inode *inode, int mask) +int sysfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct sysfs_dirent *sd = inode->i_private; + struct sysfs_dirent *sd; + + if (flags & 
IPERM_FLAG_RCU) + return -ECHILD; + + sd = inode->i_private; mutex_lock(&sysfs_mutex); sysfs_refresh_inode(sd, inode); mutex_unlock(&sysfs_mutex); - return generic_permission(inode, mask, NULL); + return generic_permission(inode, mask, flags, NULL); } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d9be60a2e95..3d28af31d86 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -9,6 +9,7 @@ */ #include <linux/lockdep.h> +#include <linux/kobject_ns.h> #include <linux/fs.h> struct sysfs_open_dirent; @@ -200,7 +201,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); void sysfs_evict_inode(struct inode *inode); int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); -int sysfs_permission(struct inode *inode, int mask); +int sysfs_permission(struct inode *inode, int mask, unsigned int flags); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index de44d067b9e..0630eb969a2 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) return &si->vfs_inode; } -static void sysv_destroy_inode(struct inode *inode) +static void sysv_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } +static void sysv_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, sysv_i_callback); +} + static void init_once(void *p) { struct sysv_inode_info *si = (struct sysv_inode_info *)p; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 11e7f7d11cd..b427b1208c2 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -27,7 +27,8 @@ static int 
add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static int sysv_hash(struct dentry *dentry, struct qstr *qstr) +static int sysv_hash(const struct dentry *dentry, const struct inode *inode, + struct qstr *qstr) { /* Truncate the name in place, avoids having to define a compare function. */ @@ -47,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st struct inode * inode = NULL; ino_t ino; - dentry->d_op = dir->i_sb->s_root->d_op; if (dentry->d_name.len > SYSV_NAMELEN) return ERR_PTR(-ENAMETOOLONG); ino = sysv_inode_by_name(dentry); diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 3d9c62be0c1..f60c196913e 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size) sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; /* set up enough so that it can read an inode */ sb->s_op = &sysv_sops; + if (sbi->s_forced_ro) + sb->s_flags |= MS_RDONLY; + if (sbi->s_truncate) + sb->s_d_op = &sysv_dentry_operations; root_inode = sysv_iget(sb, SYSV_ROOT_INO); if (IS_ERR(root_inode)) { printk("SysV FS: get root inode failed\n"); @@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size) printk("SysV FS: get root dentry failed\n"); return 0; } - if (sbi->s_forced_ro) - sb->s_flags |= MS_RDONLY; - if (sbi->s_truncate) - sb->s_root->d_op = &sysv_dentry_operations; return 1; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 91fac54c70e..6e11c2975dc 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb) return &ui->vfs_inode; }; +static void ubifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ubifs_inode *ui = ubifs_inode(inode); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ubifs_inode_slab, ui); +} + static void ubifs_destroy_inode(struct inode 
*inode) { struct ubifs_inode *ui = ubifs_inode(inode); kfree(ui->data); - kmem_cache_free(ubifs_inode_slab, inode); + call_rcu(&inode->i_rcu, ubifs_i_callback); } /* diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index f8def3c8ea4..0e0e99bd6bc 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,5 @@ config UDF_FS tristate "UDF file system support" - depends on BKL # needs serious work to remove select CRC_ITU_T help This is the new file system used on some CD-ROMs and DVDs. Say Y if diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index b608efaa4ce..306ee39ef2c 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb, udf_debug("bit %ld already set\n", bit + i); udf_debug("byte=%2x\n", ((char *)bh->b_data)[(bit + i) >> 3]); - } else { - udf_add_free_space(sb, sbi->s_partition, 1); } } + udf_add_free_space(sb, sbi->s_partition, count); mark_buffer_dirty(bh); if (overflow) { block += count; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 51552bf5022..eb8bfe2b89a 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -30,7 +30,6 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include "udf_i.h" @@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *dir = filp->f_path.dentry->d_inode; int result; - lock_kernel(); - if (filp->f_pos == 0) { if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { - unlock_kernel(); return 0; } filp->f_pos++; } result = do_udf_readdir(dir, filp, filldir, dirent); - unlock_kernel(); return result; } diff --git a/fs/udf/file.c b/fs/udf/file.c index 66b9e7e7e4c..89c78486cbb 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -32,7 +32,6 @@ #include <linux/string.h> /* memset */ #include <linux/capability.h> #include <linux/errno.h> -#include <linux/smp_lock.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include 
<linux/aio.h> @@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, size_t count = iocb->ki_left; struct udf_inode_info *iinfo = UDF_I(inode); + down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { if (file->f_flags & O_APPEND) pos = inode->i_size; @@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, udf_expand_file_adinicb(inode, pos + count, &err); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { udf_debug("udf_expand_adinicb: err=%d\n", err); + up_write(&iinfo->i_data_sem); return err; } } else { @@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, iinfo->i_lenAlloc = inode->i_size; } } + up_write(&iinfo->i_data_sem); retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); if (retval > 0) @@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) long old_block, new_block; int result = -EINVAL; - lock_kernel(); - if (file_permission(filp, MAY_READ) != 0) { udf_debug("no permission to access inode %lu\n", inode->i_ino); result = -EPERM; @@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } out: - unlock_kernel(); return result; } @@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) { mutex_lock(&inode->i_mutex); - lock_kernel(); + down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); - unlock_kernel(); + up_write(&UDF_I(inode)->i_data_sem); mutex_unlock(&inode->i_mutex); } return 0; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 75d9304d0dc..6fb7e0adcda 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) return NULL; } - mutex_lock(&sbi->s_alloc_mutex); if (sbi->s_lvid_bh) { - struct logicalVolIntegrityDesc *lvid = - 
(struct logicalVolIntegrityDesc *) - sbi->s_lvid_bh->b_data; - struct logicalVolIntegrityDescImpUse *lvidiu = - udf_sb_lvidiu(sbi); - struct logicalVolHeaderDesc *lvhd; - uint64_t uniqueID; - lvhd = (struct logicalVolHeaderDesc *) - (lvid->logicalVolContentsUse); + struct logicalVolIntegrityDescImpUse *lvidiu; + + iinfo->i_unique = lvid_get_unique_id(sb); + mutex_lock(&sbi->s_alloc_mutex); + lvidiu = udf_sb_lvidiu(sbi); if (S_ISDIR(mode)) le32_add_cpu(&lvidiu->numDirs, 1); else le32_add_cpu(&lvidiu->numFiles, 1); - iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID); - if (!(++uniqueID & 0x00000000FFFFFFFFUL)) - uniqueID += 16; - lvhd->uniqueID = cpu_to_le64(uniqueID); udf_updated_lvid(sb); + mutex_unlock(&sbi->s_alloc_mutex); } - mutex_unlock(&sbi->s_alloc_mutex); inode_init_owner(inode, dir, mode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index fc48f37aa2d..c6a2e782b97 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -31,7 +31,6 @@ #include "udfdecl.h" #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> @@ -51,6 +50,7 @@ MODULE_LICENSE("GPL"); static mode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static void udf_fill_inode(struct inode *, struct buffer_head *); +static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, sector_t *, int *); @@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode) want_delete = 1; inode->i_size = 0; udf_truncate(inode); - lock_kernel(); udf_update_inode(inode, IS_SYNC(inode)); - unlock_kernel(); } invalidate_inode_buffers(inode); end_writeback(inode); @@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode) kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; if (want_delete) { - lock_kernel(); udf_free_inode(inode); - unlock_kernel(); } } @@ -302,10 
+298,9 @@ static int udf_get_block(struct inode *inode, sector_t block, err = -EIO; new = 0; bh = NULL; - - lock_kernel(); - iinfo = UDF_I(inode); + + down_write(&iinfo->i_data_sem); if (block == iinfo->i_next_alloc_block + 1) { iinfo->i_next_alloc_block++; iinfo->i_next_alloc_goal++; @@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block, map_bh(bh_result, inode->i_sb, phys); abort: - unlock_kernel(); + up_write(&iinfo->i_data_sem); return err; } @@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - lock_kernel(); iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + inode->i_size)) { udf_expand_file_adinicb(inode, inode->i_size, &err); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { inode->i_size = iinfo->i_lenAlloc; - unlock_kernel(); + up_write(&iinfo->i_data_sem); return; } else udf_truncate_extents(inode); @@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode) offset - udf_file_entry_alloc_offset(inode)); iinfo->i_lenAlloc = inode->i_size; } + up_write(&iinfo->i_data_sem); } else { block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block); + down_write(&iinfo->i_data_sem); udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); } inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); @@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode) udf_sync_inode(inode); else mark_inode_dirty(inode); - unlock_kernel(); } static void __udf_read_inode(struct inode *inode) @@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) return; } + read_lock(&sbi->s_cred_lock); inode->i_uid = le32_to_cpu(fe->uid); if (inode->i_uid == -1 || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || @@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) 
UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; - inode->i_nlink = le16_to_cpu(fe->fileLinkCount); - if (!inode->i_nlink) - inode->i_nlink = 1; - - inode->i_size = le64_to_cpu(fe->informationLength); - iinfo->i_lenExtents = inode->i_size; - if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_fmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_fmode; @@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) else inode->i_mode = udf_convert_permissions(fe); inode->i_mode &= ~sbi->s_umask; + read_unlock(&sbi->s_cred_lock); + + inode->i_nlink = le16_to_cpu(fe->fileLinkCount); + if (!inode->i_nlink) + inode->i_nlink = 1; + + inode->i_size = le64_to_cpu(fe->informationLength); + iinfo->i_lenExtents = inode->i_size; if (iinfo->i_efe == 0) { inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << @@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe) int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - - lock_kernel(); - ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); - unlock_kernel(); - - return ret; + return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int udf_sync_inode(struct inode *inode) +static int udf_sync_inode(struct inode *inode) { return udf_update_inode(inode, 1); } @@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block) struct extent_position epos = {}; int ret; - lock_kernel(); + down_read(&UDF_I(inode)->i_data_sem); if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) @@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block) else ret = 0; - unlock_kernel(); + up_read(&UDF_I(inode)->i_data_sem); brelse(epos.bh); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 6d8dc02baeb..2be0f9eb86d 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -27,7 
+27,6 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/sched.h> #include <linux/crc-itu-t.h> @@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && - isdotdot) { - brelse(epos.bh); - return fi; - } + isdotdot) + goto out_ok; if (!lfi) continue; @@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > UDF_NAME_LEN - 2) return ERR_PTR(-ENAMETOOLONG); - lock_kernel(); #ifdef UDF_RECOVERY /* temporary shorthand for specifying files by inode number */ if (!strncmp(dentry->d_name.name, ".B=", 3)) { @@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, }; inode = udf_iget(dir->i_sb, lb); if (!inode) { - unlock_kernel(); return ERR_PTR(-EACCES); } } else @@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, loc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(dir->i_sb, &loc); if (!inode) { - unlock_kernel(); return ERR_PTR(-EACCES); } } - unlock_kernel(); return d_splice_alias(inode, dentry); } @@ -476,15 +469,19 @@ add: f_pos >> dir->i_sb->s_blocksize_bits, 1, err); if (!fibh->ebh) goto out_err; + /* Extents could have been merged, invalidate our position */ + brelse(epos.bh); + epos.bh = NULL; + epos.block = dinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(dir); if (!fibh->soffset) { - if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == - (EXT_RECORDED_ALLOCATED >> 30)) { - block = eloc.logicalBlockNum + ((elen - 1) >> + /* Find the freshly allocated block */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + (EXT_RECORDED_ALLOCATED >> 30)) + ; + block = eloc.logicalBlockNum + ((elen - 1) >> dir->i_sb->s_blocksize_bits); - } else - block++; - brelse(fibh->sbh); fibh->sbh = fibh->ebh; fi = (struct fileIdentDesc 
*)(fibh->sbh->b_data); @@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; - lock_kernel(); inode = udf_new_inode(dir, mode, &err); if (!inode) { - unlock_kernel(); return err; } @@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_nlink--; mark_inode_dirty(inode); iput(inode); - unlock_kernel(); return err; } cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); - unlock_kernel(); d_instantiate(dentry, inode); return 0; @@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; - lock_kernel(); err = -EIO; inode = udf_new_inode(dir, mode, &err); if (!inode) @@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, inode->i_nlink--; mark_inode_dirty(inode); iput(inode); - unlock_kernel(); return err; } cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, err = 0; out: - unlock_kernel(); return err; } @@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - lock_kernel(); err = -EMLINK; if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) goto out; @@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) err = 0; out: - unlock_kernel(); return err; } @@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct kernel_lb_addr tloc; retval = -ENOENT; - lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) goto out; @@ -826,7 +813,6 @@ end_rmdir: brelse(fibh.sbh); out: - 
unlock_kernel(); return retval; } @@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct kernel_lb_addr tloc; retval = -ENOENT; - lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) goto out; @@ -870,7 +855,6 @@ end_unlink: brelse(fibh.sbh); out: - unlock_kernel(); return retval; } @@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, int block; unsigned char *name = NULL; int namelen; - struct buffer_head *bh; struct udf_inode_info *iinfo; + struct super_block *sb = dir->i_sb; - lock_kernel(); inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); if (!inode) goto out; + iinfo = UDF_I(inode); + down_write(&iinfo->i_data_sem); name = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!name) { err = -ENOMEM; goto out_no_entry; } - iinfo = UDF_I(inode); inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; @@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct kernel_lb_addr eloc; uint32_t bsize; - block = udf_new_block(inode->i_sb, inode, + block = udf_new_block(sb, inode, iinfo->i_location.partitionReferenceNum, iinfo->i_location.logicalBlockNum, &err); if (!block) @@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, eloc.logicalBlockNum = block; eloc.partitionReferenceNum = iinfo->i_location.partitionReferenceNum; - bsize = inode->i_sb->s_blocksize; + bsize = sb->s_blocksize; iinfo->i_lenExtents = bsize; udf_add_aext(inode, &epos, &eloc, bsize, 0); brelse(epos.bh); - block = udf_get_pblock(inode->i_sb, block, + block = udf_get_pblock(sb, block, iinfo->i_location.partitionReferenceNum, 0); - epos.bh = udf_tgetblk(inode->i_sb, block); + epos.bh = udf_tgetblk(sb, block); lock_buffer(epos.bh); - memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); + memset(epos.bh->b_data, 0x00, bsize); set_buffer_uptodate(epos.bh); unlock_buffer(epos.bh); mark_buffer_dirty_inode(epos.bh, inode); 
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } else ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; - eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode); + eoffset = sb->s_blocksize - udf_ext0_offset(inode); pc = (struct pathComponent *)ea; if (*symname == '/') { @@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } if (pc->componentType == 5) { - namelen = udf_put_filename(inode->i_sb, compstart, name, + namelen = udf_put_filename(sb, compstart, name, symname - compstart); if (!namelen) goto out_no_entry; @@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) goto out_no_entry; - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - bh = UDF_SB(inode->i_sb)->s_lvid_bh; - if (bh) { - struct logicalVolIntegrityDesc *lvid = - (struct logicalVolIntegrityDesc *)bh->b_data; - struct logicalVolHeaderDesc *lvhd; - uint64_t uniqueID; - lvhd = (struct logicalVolHeaderDesc *) - lvid->logicalVolContentsUse; - uniqueID = le64_to_cpu(lvhd->uniqueID); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); - if (!(++uniqueID & 0x00000000FFFFFFFFUL)) - uniqueID += 16; - lvhd->uniqueID = cpu_to_le64(uniqueID); - mark_buffer_dirty(bh); + cpu_to_le32(lvid_get_unique_id(sb)); } udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) mark_inode_dirty(dir); + up_write(&iinfo->i_data_sem); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); @@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, out: kfree(name); - unlock_kernel(); return err; out_no_entry: + up_write(&iinfo->i_data_sem); inode_dec_link_count(inode); 
iput(inode); goto out; @@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, struct udf_fileident_bh fibh; struct fileIdentDesc cfi, *fi; int err; - struct buffer_head *bh; - lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { - unlock_kernel(); return -EMLINK; } fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - unlock_kernel(); return err; } cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); - bh = UDF_SB(inode->i_sb)->s_lvid_bh; - if (bh) { - struct logicalVolIntegrityDesc *lvid = - (struct logicalVolIntegrityDesc *)bh->b_data; - struct logicalVolHeaderDesc *lvhd; - uint64_t uniqueID; - lvhd = (struct logicalVolHeaderDesc *) - (lvid->logicalVolContentsUse); - uniqueID = le64_to_cpu(lvhd->uniqueID); + if (UDF_SB(inode->i_sb)->s_lvid_bh) { *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); - if (!(++uniqueID & 0x00000000FFFFFFFFUL)) - uniqueID += 16; - lvhd->uniqueID = cpu_to_le64(uniqueID); - mark_buffer_dirty(bh); + cpu_to_le32(lvid_get_unique_id(inode->i_sb)); } udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) @@ -1103,7 +1060,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, mark_inode_dirty(inode); ihold(inode); d_instantiate(dentry, inode); - unlock_kernel(); return 0; } @@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { if (ofibh.sbh != ofibh.ebh) @@ -1248,7 +1203,6 @@ end_rename: brelse(nfibh.ebh); brelse(nfibh.sbh); } - unlock_kernel(); return retval; } @@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child) struct fileIdentDesc cfi; struct 
udf_fileident_bh fibh; - lock_kernel(); if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) goto out_unlock; @@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child) inode = udf_iget(child->d_inode->i_sb, &tloc); if (!inode) goto out_unlock; - unlock_kernel(); return d_obtain_alias(inode); out_unlock: - unlock_kernel(); return ERR_PTR(-EACCES); } diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 745eb209be0..a71090ea0e0 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/buffer_head.h> +#include <linux/mutex.h> uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, uint16_t partition, uint32_t offset) @@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) struct udf_sb_info *sbi = UDF_SB(sb); u16 reallocationTableLen; struct buffer_head *bh; + int ret = 0; + mutex_lock(&sbi->s_alloc_mutex); for (i = 0; i < sbi->s_partitions; i++) { struct udf_part_map *map = &sbi->s_partmaps[i]; if (old_block > map->s_partition_root && @@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) break; } - if (!st) - return 1; + if (!st) { + ret = 1; + goto out; + } reallocationTableLen = le16_to_cpu(st->reallocationTableLen); @@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) ((old_block - map->s_partition_root) & (sdata->s_packet_len - 1)); - return 0; + ret = 0; + goto out; } else if (origLoc == packet) { *new_block = le32_to_cpu( entry->mappedLocation) + ((old_block - map->s_partition_root) & (sdata->s_packet_len - 1)); - return 0; + ret = 0; + goto out; } else if (origLoc > packet) break; } @@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) st->mapEntry[k].mappedLocation) + ((old_block - map->s_partition_root) & (sdata->s_packet_len - 1)); - return 0; + 
ret = 0; + goto out; } - return 1; + ret = 1; + goto out; } /* if old_block */ } if (i == sbi->s_partitions) { /* outside of partitions */ /* for now, fail =) */ - return 1; + ret = 1; } - return 0; +out: + mutex_unlock(&sbi->s_alloc_mutex); + return ret; } static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, diff --git a/fs/udf/super.c b/fs/udf/super.c index 4a5c7c61836..7b27b063ff6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -48,7 +48,6 @@ #include <linux/stat.h> #include <linux/cdrom.h> #include <linux/nls.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/vmalloc.h> @@ -135,15 +134,23 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei->i_next_alloc_block = 0; ei->i_next_alloc_goal = 0; ei->i_strat4096 = 0; + init_rwsem(&ei->i_data_sem); return &ei->vfs_inode; } -static void udf_destroy_inode(struct inode *inode) +static void udf_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } +static void udf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, udf_i_callback); +} + static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; @@ -567,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) if (!udf_parse_options(options, &uopt, true)) return -EINVAL; - lock_kernel(); + write_lock(&sbi->s_cred_lock); sbi->s_flags = uopt.flags; sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; sbi->s_umask = uopt.umask; sbi->s_fmode = uopt.fmode; sbi->s_dmode = uopt.dmode; + write_unlock(&sbi->s_cred_lock); if (sbi->s_lvid_bh) { int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); @@ -590,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) udf_open_lvid(sb); out_unlock: - unlock_kernel(); return error; } @@ -959,9 +966,9 @@ static 
struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) (sizeof(struct buffer_head *) * nr_groups); if (size <= PAGE_SIZE) - bitmap = kmalloc(size, GFP_KERNEL); + bitmap = kzalloc(size, GFP_KERNEL); else - bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ + bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ if (bitmap == NULL) { udf_error(sb, __func__, @@ -970,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) return NULL; } - memset(bitmap, 0x00, size); bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); bitmap->s_nr_groups = nr_groups; return bitmap; @@ -1774,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb) if (!bh) return; + + mutex_lock(&sbi->s_alloc_mutex); lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvidiu = udf_sb_lvidiu(sbi); @@ -1790,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb) lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); } static void udf_close_lvid(struct super_block *sb) @@ -1802,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb) if (!bh) return; + mutex_lock(&sbi->s_alloc_mutex); lvid = (struct logicalVolIntegrityDesc *)bh->b_data; lvidiu = udf_sb_lvidiu(sbi); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; @@ -1822,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb) lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; + mutex_unlock(&sbi->s_alloc_mutex); +} + +u64 lvid_get_unique_id(struct super_block *sb) +{ + struct buffer_head *bh; + struct udf_sb_info *sbi = UDF_SB(sb); + struct logicalVolIntegrityDesc *lvid; + struct logicalVolHeaderDesc *lvhd; + u64 uniqueID; + u64 ret; + + bh = sbi->s_lvid_bh; + if (!bh) + return 0; + + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse; + + 
mutex_lock(&sbi->s_alloc_mutex); + ret = uniqueID = le64_to_cpu(lvhd->uniqueID); + if (!(++uniqueID & 0xFFFFFFFF)) + uniqueID += 16; + lvhd->uniqueID = cpu_to_le64(uniqueID); + mutex_unlock(&sbi->s_alloc_mutex); + mark_buffer_dirty(bh); + + return ret; } static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) @@ -1879,8 +1917,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) struct kernel_lb_addr rootdir, fileset; struct udf_sb_info *sbi; - lock_kernel(); - uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); uopt.uid = -1; uopt.gid = -1; @@ -1889,10 +1925,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) uopt.dmode = UDF_INVALID_MODE; sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); - if (!sbi) { - unlock_kernel(); + if (!sbi) return -ENOMEM; - } sb->s_fs_info = sbi; @@ -1929,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sbi->s_fmode = uopt.fmode; sbi->s_dmode = uopt.dmode; sbi->s_nls_map = uopt.nls_map; + rwlock_init(&sbi->s_cred_lock); if (uopt.session == 0xFFFFFFFF) sbi->s_session = udf_get_last_session(sb); @@ -2038,7 +2073,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) goto error_out; } sb->s_maxbytes = MAX_LFS_FILESIZE; - unlock_kernel(); return 0; error_out: @@ -2059,7 +2093,6 @@ error_out: kfree(sbi); sb->s_fs_info = NULL; - unlock_kernel(); return -EINVAL; } @@ -2098,8 +2131,6 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); - lock_kernel(); - if (sbi->s_vat_inode) iput(sbi->s_vat_inode); if (sbi->s_partitions) @@ -2115,8 +2146,6 @@ static void udf_put_super(struct super_block *sb) kfree(sbi->s_partmaps); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } static int udf_sync_fs(struct super_block *sb, int wait) @@ -2179,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, uint16_t ident; struct spaceBitmapDesc *bm; - 
lock_kernel(); - loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; bh = udf_read_ptagged(sb, &loc, 0, &ident); @@ -2217,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, } } brelse(bh); - out: - unlock_kernel(); - return accum; } @@ -2233,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb, int8_t etype; struct extent_position epos; - lock_kernel(); - + mutex_lock(&UDF_SB(sb)->s_alloc_mutex); epos.block = UDF_I(table)->i_location; epos.offset = sizeof(struct unallocSpaceEntry); epos.bh = NULL; @@ -2243,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb, accum += (elen >> table->i_sb->s_blocksize_bits); brelse(epos.bh); - - unlock_kernel(); + mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); return accum; } diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 16064787d2b..b1d4488b0f1 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -27,7 +27,6 @@ #include <linux/mm.h> #include <linux/stat.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include "udf_i.h" @@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page) int err = -EIO; unsigned char *p = kmap(page); struct udf_inode_info *iinfo; + uint32_t pos; - lock_kernel(); iinfo = UDF_I(inode); + pos = udf_block_map(inode, 0); + + down_read(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; } else { - bh = sb_bread(inode->i_sb, udf_block_map(inode, 0)); + bh = sb_bread(inode->i_sb, pos); if (!bh) goto out; @@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page) udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); brelse(bh); - unlock_kernel(); + up_read(&iinfo->i_data_sem); SetPageUptodate(page); kunmap(page); unlock_page(page); return 0; out: - unlock_kernel(); + up_read(&iinfo->i_data_sem); SetPageError(page); 
kunmap(page); unlock_page(page); diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index e58d1de4107..d1bd31ea724 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -1,6 +1,18 @@ #ifndef _UDF_I_H #define _UDF_I_H +/* + * The i_data_sem and i_mutex serve for protection of allocation information + * of a regular files and symlinks. This includes all extents belonging to + * the file/symlink, a fact whether data are in-inode or in external data + * blocks, preallocation, goal block information... When extents are read, + * i_mutex or i_data_sem must be held (for reading is enough in case of + * i_data_sem). When extents are changed, i_data_sem must be held for writing + * and also i_mutex must be held. + * + * For directories i_mutex is used for all the necessary protection. + */ + struct udf_inode_info { struct timespec i_crtime; /* Physical address of inode */ @@ -21,6 +33,7 @@ struct udf_inode_info { struct long_ad *i_lad; __u8 *i_data; } i_ext; + struct rw_semaphore i_data_sem; struct inode vfs_inode; }; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index d113b72c276..4858c191242 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -2,6 +2,7 @@ #define __LINUX_UDF_SB_H #include <linux/mutex.h> +#include <linux/bitops.h> /* Since UDF 2.01 is ISO 13346 based... 
*/ #define UDF_SUPER_MAGIC 0x15013346 @@ -128,6 +129,8 @@ struct udf_sb_info { uid_t s_uid; mode_t s_fmode; mode_t s_dmode; + /* Lock protecting consistency of above permission settings */ + rwlock_t s_cred_lock; /* Root Info */ struct timespec s_record_time; @@ -139,7 +142,7 @@ struct udf_sb_info { __u16 s_udfrev; /* Miscellaneous flags */ - __u32 s_flags; + unsigned long s_flags; /* Encoding info */ struct nls_table *s_nls_map; @@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi); int udf_compute_nr_groups(struct super_block *sb, u32 partition); -#define UDF_QUERY_FLAG(X,Y) ( UDF_SB(X)->s_flags & ( 1 << (Y) ) ) -#define UDF_SET_FLAG(X,Y) ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) ) -#define UDF_CLEAR_FLAG(X,Y) ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) ) +static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag) +{ + return test_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_SET_FLAG(struct super_block *sb, int flag) +{ + set_bit(flag, &UDF_SB(sb)->s_flags); +} + +static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag) +{ + clear_bit(flag, &UDF_SB(sb)->s_flags); +} #endif /* __LINUX_UDF_SB_H */ diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 6995ab1f430..eba48209f9f 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -111,6 +111,8 @@ struct extent_position { }; /* super.c */ + +__attribute__((format(printf, 3, 4))) extern void udf_warning(struct super_block *, const char *, const char *, ...); static inline void udf_updated_lvid(struct super_block *sb) { @@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb) sb->s_dirt = 1; UDF_SB(sb)->s_lvid_dirty = 1; } +extern u64 lvid_get_unique_id(struct super_block *sb); /* namei.c */ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, @@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, extern long udf_ioctl(struct file *, unsigned int, unsigned long); /* inode.c */ 
extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); -extern int udf_sync_inode(struct inode *); extern void udf_expand_file_adinicb(struct inode *, int, int *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); extern struct buffer_head *udf_bread(struct inode *, int, int, int *); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 2c47daed56d..2c61ac5d4e4 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1412,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ufs_destroy_inode(struct inode *inode) +static void ufs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } +static void ufs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ufs_i_callback); +} + static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0dce969d6ca..faca4499709 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ kmem.o \ xfs_aops.o \ xfs_buf.o \ + xfs_discard.o \ xfs_export.o \ xfs_file.o \ xfs_fs_subr.o \ diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h deleted file mode 100644 index 4dfc7c37081..00000000000 --- a/fs/xfs/linux-2.6/sv.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SV_H__ -#define __XFS_SUPPORT_SV_H__ - -#include <linux/wait.h> -#include <linux/sched.h> -#include <linux/spinlock.h> - -/* - * Synchronisation variables. - * - * (Parameters "pri", "svf" and "rts" are not implemented) - */ - -typedef struct sv_s { - wait_queue_head_t waiters; -} sv_t; - -static inline void _sv_wait(sv_t *sv, spinlock_t *lock) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&sv->waiters, &wait); - __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(lock); - - schedule(); - - remove_wait_queue(&sv->waiters, &wait); -} - -#define sv_init(sv,flag,name) \ - init_waitqueue_head(&(sv)->waiters) -#define sv_destroy(sv) \ - /*NOTHING*/ -#define sv_wait(sv, pri, lock, s) \ - _sv_wait(sv, lock) -#define sv_signal(sv) \ - wake_up(&(sv)->waiters) -#define sv_broadcast(sv) \ - wake_up_all(&(sv)->waiters) - -#endif /* __XFS_SUPPORT_SV_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index b2771862fd3..39f4f809bb6 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } int -xfs_check_acl(struct inode *inode, int mask) +xfs_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip; struct posix_acl *acl; int error = -EAGAIN; + ip = XFS_I(inode); trace_xfs_check_acl(ip); /* @@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask) if (!XFS_IFORK_Q(ip)) return -EAGAIN; + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); if 
(IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 691f61223ed..ec7bbb5645b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -38,15 +38,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> -/* - * Types of I/O for bmap clustering and I/O completion tracking. - */ -enum { - IO_READ, /* mapping for a read */ - IO_DELAY, /* mapping covers delalloc region */ - IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ - IO_NEW /* just allocated */ -}; /* * Prime number of hash buckets since address is used as the key. @@ -182,9 +173,6 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(ioend->io_type != IO_READ); - if (unlikely(ioend->io_error)) return 0; @@ -244,10 +232,8 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IO_READ) { - error = xfs_setfilesize(ioend); - ASSERT(!error || error == EAGAIN); - } + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); /* * If we didn't complete processing of the ioend, requeue it to the @@ -318,14 +304,63 @@ STATIC int xfs_map_blocks( struct inode *inode, loff_t offset, - ssize_t count, struct xfs_bmbt_irec *imap, - int flags) + int type, + int nonblocking) { - int nmaps = 1; - int new = 0; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t count = 1 << inode->i_blkbits; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (type == IO_UNWRITTEN) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (nonblocking) + return -XFS_ERROR(EAGAIN); + xfs_ilock(ip, XFS_ILOCK_SHARED); + } - return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); + 
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(offset <= mp->m_maxioffset); + + if (offset + count > mp->m_maxioffset) + count = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + bmapi_flags, NULL, 0, imap, &nimaps, NULL); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (error) + return -XFS_ERROR(error); + + if (type == IO_DELALLOC && + (!nimaps || isnullstartblock(imap->br_startblock))) { + error = xfs_iomap_write_allocate(ip, offset, count, imap); + if (!error) + trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); + return -XFS_ERROR(error); + } + +#ifdef DEBUG + if (type == IO_UNWRITTEN) { + ASSERT(nimaps); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + } +#endif + if (nimaps) + trace_xfs_map_blocks_found(ip, offset, count, type, imap); + return 0; } STATIC int @@ -380,26 +415,18 @@ xfs_submit_ioend_bio( submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC_PLUG : WRITE, bio); - ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); - bio_put(bio); } STATIC struct bio * xfs_alloc_ioend_bio( struct buffer_head *bh) { - struct bio *bio; int nvecs = bio_get_nr_vecs(bh->b_bdev); - - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (!bio); + struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio_get(bio); return bio; } @@ -470,9 +497,8 @@ xfs_submit_ioend( /* Pass 1 - start writeback */ do { next = ioend->io_list; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) xfs_start_buffer_writeback(bh); - } } while ((ioend = next) != NULL); /* Pass 2 - submit I/O */ @@ -600,117 +626,13 @@ xfs_map_at_offset( ASSERT(imap->br_startblock != HOLESTARTBLOCK); ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - lock_buffer(bh); xfs_map_buffer(inode, bh, imap, offset); - bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); } /* - * Look for a page at index that is suitable for clustering. 
- */ -STATIC unsigned int -xfs_probe_page( - struct page *page, - unsigned int pg_offset) -{ - struct buffer_head *bh, *head; - int ret = 0; - - if (PageWriteback(page)) - return 0; - if (!PageDirty(page)) - return 0; - if (!page->mapping) - return 0; - if (!page_has_buffers(page)) - return 0; - - bh = head = page_buffers(page); - do { - if (!buffer_uptodate(bh)) - break; - if (!buffer_mapped(bh)) - break; - ret += bh->b_size; - if (ret >= pg_offset) - break; - } while ((bh = bh->b_this_page) != head); - - return ret; -} - -STATIC size_t -xfs_probe_cluster( - struct inode *inode, - struct page *startpage, - struct buffer_head *bh, - struct buffer_head *head) -{ - struct pagevec pvec; - pgoff_t tindex, tlast, tloff; - size_t total = 0; - int done = 0, i; - - /* First sum forwards in this page */ - do { - if (!buffer_uptodate(bh) || !buffer_mapped(bh)) - return total; - total += bh->b_size; - } while ((bh = bh->b_this_page) != head); - - /* if we reached the end of the page, sum forwards in following pages */ - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; - tindex = startpage->index + 1; - - /* Prune this back to avoid pathological behavior */ - tloff = min(tlast, startpage->index + 64); - - pagevec_init(&pvec, 0); - while (!done && tindex <= tloff) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - size_t pg_offset, pg_len = 0; - - if (tindex == tlast) { - pg_offset = - i_size_read(inode) & (PAGE_CACHE_SIZE - 1); - if (!pg_offset) { - done = 1; - break; - } - } else - pg_offset = PAGE_CACHE_SIZE; - - if (page->index == tindex && trylock_page(page)) { - pg_len = xfs_probe_page(page, pg_offset); - unlock_page(page); - } - - if (!pg_len) { - done = 1; - break; - } - - total += pg_len; - tindex++; - } - - pagevec_release(&pvec); - cond_resched(); - } - - return total; -} - -/* * Test if a 
given page is suitable for writing as part of an unwritten * or delayed allocate extent. */ @@ -731,9 +653,9 @@ xfs_is_delayed_page( if (buffer_unwritten(bh)) acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IO_DELAY); + acceptable = (type == IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IO_NEW); + acceptable = (type == IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -758,8 +680,7 @@ xfs_convert_page( loff_t tindex, struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, - struct writeback_control *wbc, - int all_bh) + struct writeback_control *wbc) { struct buffer_head *bh, *head; xfs_off_t end_offset; @@ -814,37 +735,30 @@ xfs_convert_page( continue; } - if (buffer_unwritten(bh) || buffer_delay(bh)) { + if (buffer_unwritten(bh) || buffer_delay(bh) || + buffer_mapped(bh)) { if (buffer_unwritten(bh)) type = IO_UNWRITTEN; + else if (buffer_delay(bh)) + type = IO_DELALLOC; else - type = IO_DELAY; + type = IO_OVERWRITE; if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - ASSERT(imap->br_startblock != HOLESTARTBLOCK); - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - - xfs_map_at_offset(inode, bh, imap, offset); + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, imap, offset); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); page_dirty--; count++; } else { - type = IO_NEW; - if (buffer_mapped(bh) && all_bh) { - lock_buffer(bh); - xfs_add_to_ioend(inode, bh, offset, - type, ioendp, done); - count++; - page_dirty--; - } else { - done = 1; - } + done = 1; } } while (offset += len, (bh = bh->b_this_page) != head); @@ -876,7 +790,6 @@ xfs_cluster_write( struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int all_bh, pgoff_t tlast) { struct pagevec pvec; @@ -891,7 +804,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - 
imap, ioendp, wbc, all_bh); + imap, ioendp, wbc); if (done) break; } @@ -935,7 +848,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_is_delayed_page(page, IO_DELAY)) + if (!xfs_is_delayed_page(page, IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1002,10 +915,10 @@ xfs_vm_writepage( unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; - ssize_t size, len; - int flags, err, imap_valid = 0, uptodate = 1; + ssize_t len; + int err, imap_valid = 0, uptodate = 1; int count = 0; - int all_bh = 0; + int nonblocking = 0; trace_xfs_writepage(inode, page, 0); @@ -1056,10 +969,14 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - flags = BMAPI_READ; - type = IO_NEW; + type = IO_OVERWRITE; + + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + nonblocking = 1; do { + int new_ioend = 0; + if (offset >= end_offset) break; if (!buffer_uptodate(bh)) @@ -1076,90 +993,54 @@ xfs_vm_writepage( continue; } - if (imap_valid) - imap_valid = xfs_imap_valid(inode, &imap, offset); - - if (buffer_unwritten(bh) || buffer_delay(bh)) { - int new_ioend = 0; - - /* - * Make sure we don't use a read-only iomap - */ - if (flags == BMAPI_READ) - imap_valid = 0; - - if (buffer_unwritten(bh)) { + if (buffer_unwritten(bh)) { + if (type != IO_UNWRITTEN) { type = IO_UNWRITTEN; - flags = BMAPI_WRITE | BMAPI_IGNSTATE; - } else if (buffer_delay(bh)) { - type = IO_DELAY; - flags = BMAPI_ALLOCATE; - - if (wbc->sync_mode == WB_SYNC_NONE) - flags |= BMAPI_TRYLOCK; - } - - if (!imap_valid) { - /* - * If we didn't have a valid mapping then we - * need to ensure that we put the new mapping - * in a new ioend structure. This needs to be - * done to ensure that the ioends correctly - * reflect the block mappings at io completion - * for unwritten extent conversion. 
- */ - new_ioend = 1; - err = xfs_map_blocks(inode, offset, len, - &imap, flags); - if (err) - goto error; - imap_valid = xfs_imap_valid(inode, &imap, - offset); + imap_valid = 0; } - if (imap_valid) { - xfs_map_at_offset(inode, bh, &imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, - &ioend, new_ioend); - count++; + } else if (buffer_delay(bh)) { + if (type != IO_DELALLOC) { + type = IO_DELALLOC; + imap_valid = 0; } } else if (buffer_uptodate(bh)) { - /* - * we got here because the buffer is already mapped. - * That means it must already have extents allocated - * underneath it. Map the extent by reading it. - */ - if (!imap_valid || flags != BMAPI_READ) { - flags = BMAPI_READ; - size = xfs_probe_cluster(inode, page, bh, head); - err = xfs_map_blocks(inode, offset, size, - &imap, flags); - if (err) - goto error; - imap_valid = xfs_imap_valid(inode, &imap, - offset); + if (type != IO_OVERWRITE) { + type = IO_OVERWRITE; + imap_valid = 0; } + } else { + if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); + imap_valid = 0; + } + continue; + } + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); + if (!imap_valid) { /* - * We set the type to IO_NEW in case we are doing a - * small write at EOF that is extending the file but - * without needing an allocation. We need to update the - * file size on I/O completion in this case so it is - * the same case as having just allocated a new extent - * that we are writing into for the first time. + * If we didn't have a valid mapping then we need to + * put the new mapping into a separate ioend structure. + * This ensures non-contiguous extents always have + * separate ioends, which is particularly important + * for unwritten extent conversion at I/O completion + * time. 
*/ - type = IO_NEW; - if (trylock_buffer(bh)) { - if (imap_valid) - all_bh = 1; - xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !imap_valid); - count++; - } else { - imap_valid = 0; - } - } else if (PageUptodate(page)) { - ASSERT(buffer_mapped(bh)); - imap_valid = 0; + new_ioend = 1; + err = xfs_map_blocks(inode, offset, &imap, type, + nonblocking); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, offset); + } + if (imap_valid) { + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, &ioend, + new_ioend); + count++; } if (!iohead) @@ -1188,7 +1069,7 @@ xfs_vm_writepage( end_index = last_index; xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, all_bh, end_index); + wbc, end_index); } if (iohead) @@ -1257,13 +1138,19 @@ __xfs_get_blocks( int create, int direct) { - int flags = create ? BMAPI_WRITE : BMAPI_READ; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; struct xfs_bmbt_irec imap; + int nimaps = 1; xfs_off_t offset; ssize_t size; - int nimap = 1; int new = 0; - int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); @@ -1272,15 +1159,45 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - if (direct && create) - flags |= BMAPI_DIRECT; + if (create) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); + } else { + lockmode = xfs_ilock_map_shared(ip); + } + + ASSERT(offset <= mp->m_maxioffset); + if (offset + size > mp->m_maxioffset) + size = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); + offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, - &new); + error = xfs_bmapi(NULL, ip, offset_fsb, 
end_fsb - offset_fsb, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); if (error) - return -error; - if (nimap == 0) - return 0; + goto out_unlock; + + if (create && + (!nimaps || + (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK))) { + if (direct) { + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, size, &imap); + } + if (error) + goto out_unlock; + + trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + } else if (nimaps) { + trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + } else { + trace_xfs_get_blocks_notfound(ip, offset, size); + goto out_unlock; + } + xfs_iunlock(ip, lockmode); if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK) { @@ -1347,6 +1264,10 @@ __xfs_get_blocks( } return 0; + +out_unlock: + xfs_iunlock(ip, lockmode); + return -error; } int @@ -1434,7 +1355,7 @@ xfs_vm_direct_IO( ssize_t ret; if (rw & WRITE) { - iocb->private = xfs_alloc_ioend(inode, IO_NEW); + iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index c5057fb6237..71f721e1a71 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; /* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_DIRECT = 0, /* special case for direct I/O ioends */ + IO_DELALLOC, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_OVERWRITE, /* mapping covers already allocated extent */ +}; + +#define XFS_IO_TYPES \ + { 0, "" }, \ + { IO_DELALLOC, "delalloc" }, \ + { IO_UNWRITTEN, "unwritten" }, \ + { IO_OVERWRITE, "overwrite" } + +/* * xfs_ioend struct manages large extent writes for XFS. 
* It can manage several multi-page bio's at once. */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 4c5deb6e9e3..ac1c7e8378d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -44,12 +44,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); -static struct shrinker xfs_buf_shake = { - .shrink = xfsbufd_wakeup, - .seeks = DEFAULT_SEEKS, -}; static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -168,8 +163,79 @@ test_page_region( } /* - * Internal xfs_buf_t object manipulation + * xfs_buf_lru_add - add a buffer to the LRU. + * + * The LRU takes a new reference to the buffer so that it will only be freed + * once the shrinker takes the buffer off the LRU. */ +STATIC void +xfs_buf_lru_add( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (list_empty(&bp->b_lru)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_lru, &btp->bt_lru); + btp->bt_lru_nr++; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * xfs_buf_lru_del - remove a buffer from the LRU + * + * The unlocked check is safe here because it only occurs when there are not + * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there + * to optimise the shrinker removing the buffer from the LRU and calling + * xfs_buf_free(). i.e. it removes an unneccessary round trip on the + * bt_lru_lock. 
+ */ +STATIC void +xfs_buf_lru_del( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + if (list_empty(&bp->b_lru)) + return; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * When we mark a buffer stale, we remove the buffer from the LRU and clear the + * b_lru_ref count so that the buffer is freed immediately when the buffer + * reference count falls to zero. If the buffer is already on the LRU, we need + * to remove the reference that LRU holds on the buffer. + * + * This prevents build-up of stale buffers on the LRU. + */ +void +xfs_buf_stale( + struct xfs_buf *bp) +{ + bp->b_flags |= XBF_STALE; + atomic_set(&(bp)->b_lru_ref, 0); + if (!list_empty(&bp->b_lru)) { + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + atomic_dec(&bp->b_hold); + } + spin_unlock(&btp->bt_lru_lock); + } + ASSERT(atomic_read(&bp->b_hold) >= 1); +} STATIC void _xfs_buf_initialize( @@ -186,7 +252,9 @@ _xfs_buf_initialize( memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); + atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ @@ -262,6 +330,8 @@ xfs_buf_free( { trace_xfs_buf_free(bp, _RET_IP_); + ASSERT(list_empty(&bp->b_lru)); + if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; @@ -337,7 +407,6 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -827,7 +896,7 @@ xfs_buf_rele( trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { - ASSERT(!bp->b_relse); + ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) 
xfs_buf_free(bp); @@ -835,13 +904,15 @@ xfs_buf_rele( } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); + ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (bp->b_relse) { - atomic_inc(&bp->b_hold); + if (!(bp->b_flags & XBF_STALE) && + atomic_read(&bp->b_lru_ref)) { + xfs_buf_lru_add(bp); spin_unlock(&pag->pag_buf_lock); - bp->b_relse(bp); } else { + xfs_buf_lru_del(bp); ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -1438,51 +1509,84 @@ xfs_buf_iomove( */ /* - * Wait for any bufs with callbacks that have been submitted but - * have not yet returned... walk the hash list for the target. + * Wait for any bufs with callbacks that have been submitted but have not yet + * returned. These buffers will have an elevated hold count, so wait on those + * while freeing all the buffers only held by the LRU. */ void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_perag *pag; - uint i; + struct xfs_buf *bp; - for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { - pag = xfs_perag_get(btp->bt_mount, i); - spin_lock(&pag->pag_buf_lock); - while (rb_first(&pag->pag_buf_tree)) { - spin_unlock(&pag->pag_buf_lock); +restart: + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + if (atomic_read(&bp->b_hold) > 1) { + spin_unlock(&btp->bt_lru_lock); delay(100); - spin_lock(&pag->pag_buf_lock); + goto restart; } - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). 
+ */ + atomic_set(&bp->b_lru_ref, 0); + spin_unlock(&btp->bt_lru_lock); + xfs_buf_rele(bp); + spin_lock(&btp->bt_lru_lock); } + spin_unlock(&btp->bt_lru_lock); } -/* - * buftarg list for delwrite queue processing - */ -static LIST_HEAD(xfs_buftarg_list); -static DEFINE_SPINLOCK(xfs_buftarg_lock); - -STATIC void -xfs_register_buftarg( - xfs_buftarg_t *btp) +int +xfs_buftarg_shrink( + struct shrinker *shrink, + int nr_to_scan, + gfp_t mask) { - spin_lock(&xfs_buftarg_lock); - list_add(&btp->bt_list, &xfs_buftarg_list); - spin_unlock(&xfs_buftarg_lock); -} + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + struct xfs_buf *bp; + LIST_HEAD(dispose); -STATIC void -xfs_unregister_buftarg( - xfs_buftarg_t *btp) -{ - spin_lock(&xfs_buftarg_lock); - list_del(&btp->bt_list); - spin_unlock(&xfs_buftarg_lock); + if (!nr_to_scan) + return btp->bt_lru_nr; + + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + if (nr_to_scan-- <= 0) + break; + + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + list_move_tail(&bp->b_lru, &btp->bt_lru); + continue; + } + + /* + * remove the buffer from the LRU now to avoid needing another + * lock round trip inside xfs_buf_rele(). 
+ */ + list_move(&bp->b_lru, &dispose); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); + + while (!list_empty(&dispose)) { + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); + } + + return btp->bt_lru_nr; } void @@ -1490,17 +1594,14 @@ xfs_free_buftarg( struct xfs_mount *mp, struct xfs_buftarg *btp) { + unregister_shrinker(&btp->bt_shrinker); + xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); iput(btp->bt_mapping->host); - /* Unregister the buftarg first so that we don't get a - * wakeup finding a non-existent task - */ - xfs_unregister_buftarg(btp); kthread_stop(btp->bt_task); - kmem_free(btp); } @@ -1597,20 +1698,13 @@ xfs_alloc_delwrite_queue( xfs_buftarg_t *btp, const char *fsname) { - int error = 0; - - INIT_LIST_HEAD(&btp->bt_list); INIT_LIST_HEAD(&btp->bt_delwrite_queue); spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) { - error = PTR_ERR(btp->bt_task); - goto out_error; - } - xfs_register_buftarg(btp); -out_error: - return error; + if (IS_ERR(btp->bt_task)) + return PTR_ERR(btp->bt_task); + return 0; } xfs_buftarg_t * @@ -1627,12 +1721,17 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; + INIT_LIST_HEAD(&btp->bt_lru); + spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; + btp->bt_shrinker.shrink = xfs_buftarg_shrink; + btp->bt_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&btp->bt_shrinker); return btp; error: @@ -1737,27 +1836,6 @@ xfs_buf_runall_queues( flush_workqueue(queue); } -STATIC int -xfsbufd_wakeup( - struct shrinker *shrink, - int priority, - gfp_t mask) -{ - xfs_buftarg_t *btp; - - spin_lock(&xfs_buftarg_lock); - list_for_each_entry(btp, 
&xfs_buftarg_list, bt_list) { - if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) - continue; - if (list_empty(&btp->bt_delwrite_queue)) - continue; - set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); - wake_up_process(btp->bt_task); - } - spin_unlock(&xfs_buftarg_lock); - return 0; -} - /* * Move as many buffers as specified to the supplied list * idicating if we skipped any buffers to prevent deadlocks. @@ -1952,7 +2030,6 @@ xfs_buf_init(void) if (!xfsconvertd_workqueue) goto out_destroy_xfsdatad_workqueue; - register_shrinker(&xfs_buf_shake); return 0; out_destroy_xfsdatad_workqueue: @@ -1968,7 +2045,6 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - unregister_shrinker(&xfs_buf_shake); destroy_workqueue(xfsconvertd_workqueue); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 383a3f37cf9..cbe65950e52 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -128,10 +128,15 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; - struct list_head bt_list; struct list_head bt_delwrite_queue; spinlock_t bt_delwrite_lock; unsigned long bt_flags; + + /* LRU control structures */ + struct shrinker bt_shrinker; + struct list_head bt_lru; + spinlock_t bt_lru_lock; + unsigned int bt_lru_nr; } xfs_buftarg_t; /* @@ -147,8 +152,6 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); -typedef void (*xfs_buf_relse_t)(struct xfs_buf *); -typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 @@ -164,9 +167,11 @@ typedef struct xfs_buf { xfs_off_t b_file_offset; /* offset in file */ size_t b_buffer_length;/* size of buffer in bytes */ atomic_t b_hold; /* reference count */ + atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ + struct list_head b_lru; /* lru list */ 
wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ @@ -176,7 +181,6 @@ typedef struct xfs_buf { void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ - xfs_buf_relse_t b_relse; /* releasing function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; @@ -264,7 +268,8 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) -#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) +void xfs_buf_stale(struct xfs_buf *bp); +#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) #define XFS_BUF_SUPER_STALE(bp) do { \ @@ -315,7 +320,6 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) #define XFS_BUF_SET_START(bp) do { } while (0) -#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) @@ -328,9 +332,15 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) +static inline void +xfs_buf_set_ref( + struct xfs_buf *bp, + int lru_ref) +{ + atomic_set(&bp->b_lru_ref, lru_ref); +} +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) -#define XFS_BUF_SET_REF(bp, ref) do { } while (0) #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) @@ -346,8 +356,7 @@ extern void xfs_buf_terminate(void); 
static inline void xfs_buf_relse(xfs_buf_t *bp) { - if (!bp->b_relse) - xfs_buf_unlock(bp); + xfs_buf_unlock(bp); xfs_buf_rele(bp); } diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 00000000000..05201ae719e --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_discard.h" +#include "xfs_trace.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_fsblock_t start, + xfs_fsblock_t len, + xfs_fsblock_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, 
XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_le(cur, 0, + XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + + /* + * Too small? Give up. + */ + if (flen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || + XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, + XFS_AGB_TO_DADDR(mp, agno, fbno), + XFS_FSB_TO_BB(mp, flen), + GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_fsblock_t start, len, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * XFS_B_TO_FSB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. + */ + start = XFS_B_TO_FSBT(mp, range.start); + len = XFS_B_TO_FSBT(mp, range.len); + minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + + start_agno = XFS_FSB_TO_AGNO(mp, start); + if (start_agno >= mp->m_sb.sb_agcount) + return -XFS_ERROR(EINVAL); + + end_agno = XFS_FSB_TO_AGNO(mp, start + len); + if (end_agno >= mp->m_sb.sb_agcount) + end_agno = mp->m_sb.sb_agcount - 1; + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, len, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 00000000000..e82b6dd3e12 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -0,0 +1,8 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct 
fstrim_range; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); + +#endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 3764d74790e..fc0114da7fd 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -70,8 +70,16 @@ xfs_fs_encode_fh( else fileid_type = FILEID_INO32_GEN_PARENT; - /* filesystem may contain 64bit inode numbers */ - if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) + /* + * If the filesystem may contain 64bit inode numbers, we need + * to use larger file handles that can represent them. + * + * While we only allocate inodes that do not fit into 32 bits any + * large enough filesystem may contain them, thus the slightly + * confusing looking conditional below. + */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || + (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) fileid_type |= XFS_FILEID_TYPE_64FLAG; /* diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ba8ad422a16..a55c1b46b21 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,10 +37,45 @@ #include "xfs_trace.h" #include <linux/dcache.h> +#include <linux/falloc.h> static const struct vm_operations_struct xfs_file_vm_ops; /* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. 
+ */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* * xfs_iozero * * xfs_iozero clears the specified range of buffer supplied, @@ -262,22 +297,21 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + if (inode->i_mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (iocb->ki_pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } } - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } else + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -285,7 +319,7 @@ xfs_file_aio_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -309,7 +343,7 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_splice_read(ip, count, *ppos, ioflags); @@ -317,10 +351,61 @@ xfs_file_splice_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + 
ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occurred. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip) +{ + if (ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * xfs_file_splice_write() does not use xfs_rw_ilock() because + * generic_file_splice_write() takes the i_mutex itself. This, in theory, + * could cause lock inversions between the aio_write path and the splice path + * if someone is doing concurrent splice(2) based writes and write(2) based + * writes to the same inode. The only real way to fix this is to re-implement + * the generic code here with correct locking orders. 
+ */ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, @@ -331,7 +416,7 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize, new_size; + xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -355,27 +440,9 @@ xfs_file_splice_write( trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -562,247 +629,314 @@ out_lock: return error; } +/* + * Common pre-write limit and setup checks. + * + * Returns with iolock held according to @iolock. 
+ */ STATIC ssize_t -xfs_file_aio_write( - struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0, error = 0; - int ioflags = 0; - xfs_fsize_t isize, new_size; - int iolock; - size_t ocount = 0, count; - int need_i_mutex; + xfs_fsize_t new_size; + int error = 0; - XFS_STATS_INC(xs_write_calls); + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return error; + } - BUG_ON(iocb->ki_pos != pos); + new_size = *pos + *count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. + */ + if (*pos > ip->i_size) + error = -xfs_zero_eof(ip, *pos, ip->i_size); - error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; - count = ocount; - if (count == 0) - return 0; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. 
+ */ + return file_remove_suid(file); - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; +} -relock: - if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. 
+ */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + size_t count = ocount; + int unaligned_io = 0; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return -XFS_ERROR(EINVAL); + + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + if (unaligned_io || mapping->nrpages || pos > ip->i_size) + *iolock = XFS_IOLOCK_EXCL; + else + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + if (mapping->nrpages) { + WARN_ON(*iolock != XFS_IOLOCK_EXCL); + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (ret) + return ret; } - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + xfs_ioend_wait(ip); + else if (*iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + *iolock = XFS_IOLOCK_SHARED; } - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} - if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + size_t count = ocount; - new_size = pos + count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - if (likely(!(ioflags & IO_INVIS))) - file_update_time(file); + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. 
+ * if we just got an ENOSPC, flush the inode now we aren't holding any + * page locks and retry *once* */ - - if (pos > ip->i_size) { - error = xfs_zero_eof(ip, pos, ip->i_size); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_unlock_internal; - } + if (ret == -ENOSPC && !enospc) { + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) + return ret; + enospc = 1; + goto write_retry; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + current->backing_dev_info = NULL; + return ret; +} - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - error = -file_remove_suid(file); - if (unlikely(error)) - goto out_unlock_internal; +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int iolock; + size_t ocount = 0; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + XFS_STATS_INC(xs_write_calls); - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(ip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } + BUG_ON(iocb->ki_pos != pos); - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } + if (ocount == 0) + return 0; - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, 
count, ocount); + xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; - pos += ret; - count -= ret; + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + else + ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(ip, iolock); - goto relock; - } - } else { - int enospc = 0; - ssize_t ret2 = 0; + xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); -write_retry: - trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (error) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } - ret = ret2; - } + if (ret <= 0) + goto out_unlock; - current->backing_dev_info = NULL; + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error, error2; - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) - iocb->ki_pos = isize; + xfs_rw_iunlock(ip, iolock); + error = filemap_write_and_wait_range(mapping, pos, end); + xfs_rw_ilock(ip, iolock); - if (iocb->ki_pos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (iocb->ki_pos > ip->i_size) - ip->i_size = iocb->ki_pos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error2 = -xfs_file_fsync(file, + (file->f_flags & __O_SYNC) ? 
0 : 1); + if (error) + ret = error; + else if (error2) + ret = error2; } - error = -ret; - if (ret <= 0) - goto out_unlock_internal; +out_unlock: + xfs_aio_write_newsize_update(ip); + xfs_rw_iunlock(ip, iolock); + return ret; +} - XFS_STATS_ADD(xs_write_bytes, ret); +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error2; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; - xfs_iunlock(ip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, iolock); + xfs_ilock(ip, XFS_IOLOCK_EXCL); - error2 = -xfs_file_fsync(file, - (file->f_flags & __O_SYNC) ? 0 : 1); - if (!error) - error = error2; + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; } - out_unlock_internal: - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. 
- */ - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } - xfs_iunlock(ip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - return -error; + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; } + STATIC int xfs_file_open( struct inode *inode, @@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index ad442d9e392..b06ede1d0be 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -39,6 +39,7 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" #include "xfs_vnodeops.h" +#include "xfs_discard.h" #include "xfs_quota.h" #include "xfs_inode_item.h" #include "xfs_export.h" @@ -1294,6 +1295,8 @@ xfs_file_ioctl( trace_xfs_file_ioctl(ip); switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 94d5fd6a297..bd5727852fd 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -46,7 +46,6 @@ #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/falloc.h> #include <linux/fiemap.h> #include <linux/slab.h> @@ -505,58 +504,6 @@ xfs_vn_setattr( return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } -STATIC long -xfs_vn_fallocate( - struct inode *inode, - int mode, - loff_t offset, - loff_t len) -{ - long 
error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - - /* preallocation on directories not yet supported */ - error = -ENODEV; - if (S_ISDIR(inode->i_mode)) - goto out_error; - - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, - 0, XFS_ATTR_NOLOCK); - if (error) - goto out_unlock; - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); -out_error: - return error; -} - #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -650,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .fallocate = xfs_vn_fallocate, .fiemap = xfs_vn_fiemap, }; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 214ddd71ff7..09649499774 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -37,7 +37,6 @@ #include <kmem.h> #include <mrlock.h> -#include <sv.h> #include <time.h> #include <support/debug.h> diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 064f964d4f3..9731898083a 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -606,7 +606,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); printk("XFS: Invalid 
device [%s], error=%d\n", name, error); @@ -620,7 +621,7 @@ xfs_blkdev_put( struct block_device *bdev) { if (bdev) - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } /* @@ -834,8 +835,11 @@ xfsaild_wakeup( struct xfs_ail *ailp, xfs_lsn_t threshold_lsn) { - ailp->xa_target = threshold_lsn; - wake_up_process(ailp->xa_task); + /* only ever move the target forwards */ + if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) { + ailp->xa_target = threshold_lsn; + wake_up_process(ailp->xa_task); + } } STATIC int @@ -847,8 +851,17 @@ xfsaild( long tout = 0; /* milliseconds */ while (!kthread_should_stop()) { - schedule_timeout_interruptible(tout ? - msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + /* + * for short sleeps indicating congestion, don't allow us to + * get woken early. Otherwise all we do is bang on the AIL lock + * without making progress. + */ + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); /* swsusp */ try_to_freeze(); @@ -935,7 +948,7 @@ out_reclaim: * Slab object creation initialisation for the XFS inode. * This covers only the idempotent fields in the XFS inode; * all other fields need to be initialised on allocation - * from the slab. This avoids the need to repeatedly intialise + * from the slab. This avoids the need to repeatedly initialise * fields in the xfs inode that left in the initialise state * when freeing the inode. 
*/ @@ -1118,6 +1131,8 @@ xfs_fs_evict_inode( */ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); xfs_inactive(ip); } @@ -1399,7 +1414,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp, SYNC_WAIT); + return -xfs_fs_log_dummy(mp); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index afb0d7cfad1..e22f0057d21 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab( { struct inode *inode = VFS_I(ip); + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return EFSCORRUPTED; - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) - return ENOENT; - /* If we can't grab the inode, it must on it's way to reclaim. 
*/ if (!igrab(inode)) return ENOENT; @@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab( /* inode is valid */ return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return ENOENT; } STATIC int @@ -98,12 +118,12 @@ restart: int error = 0; int i; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); if (!nr_found) { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); break; } @@ -118,18 +138,26 @@ restart: batch[i] = NULL; /* - * Update the index for the next lookup. Catch overflows - * into the next AG range which can occur if we have inodes - * in the last block of the AG and we are currently - * pointing to the last inode. + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; } /* unlock now we've grabbed the inodes. 
*/ - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); for (i = 0; i < nr_found; i++) { if (!batch[i]) @@ -334,7 +362,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -475,13 +503,14 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ - error = xfs_qm_sync(mp, SYNC_TRYLOCK); if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp, 0); + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + xfs_reclaim_inodes(mp, 0); + error = xfs_qm_sync(mp, SYNC_TRYLOCK); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); @@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } @@ -639,9 +668,14 @@ xfs_reclaim_inode_grab( struct xfs_inode *ip, int flags) { + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; /* - * do some unlocked checks first to avoid unnecceary lock traffic. + * do some unlocked checks first to avoid unnecessary lock traffic. * The first is a flush lock check, the second is a already in reclaim * check. Only do these checks if we are not going to block on locks. */ @@ -654,11 +688,16 @@ xfs_reclaim_inode_grab( * The radix tree lock here protects a thread in xfs_iget from racing * with us starting reclaim on the inode. Once we have the * XFS_IRECLAIM flag set it will not touch us. 
+ * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check the + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. */ spin_lock(&ip->i_flags_lock); - ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { - /* ignore as it is already under reclaim */ + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ spin_unlock(&ip->i_flags_lock); return 1; } @@ -795,12 +834,12 @@ reclaim: * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate @@ -864,14 +903,14 @@ restart: struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int i; - write_lock(&pag->pag_ici_lock); + rcu_read_lock(); nr_found = radix_tree_gang_lookup_tag( &pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH, XFS_ICI_RECLAIM_TAG); if (!nr_found) { - write_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); break; } @@ -891,14 +930,24 @@ restart: * occur if we have inodes in the last block of * the AG and we are currently pointing to the * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that lead us + * to see this inode, so another lookup from + * the same index will not find it again. 
*/ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; } /* unlock now we've grabbed the inodes. */ - write_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); for (i = 0; i < nr_found; i++) { if (!batch[i]) diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 7bb5092d6ae..ee3cee097e7 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -18,6 +18,7 @@ #include "xfs.h" #include <linux/sysctl.h> #include <linux/proc_fs.h> +#include "xfs_error.h" static struct ctl_table_header *xfs_table_header; @@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler( return ret; } + +STATIC int +xfs_panic_mask_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} #endif /* CONFIG_PROC_FS */ static ctl_table xfs_table[] = { @@ -77,7 +98,7 @@ static ctl_table xfs_table[] = { .data = &xfs_params.panic_mask.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_panic_mask_proc_handler, .extra1 = &xfs_params.panic_mask.min, .extra2 = &xfs_params.panic_mask.max }, diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index acef2e98c59..2d0bcb47907 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __field(int, curr_res) __field(int, unit_res) __field(unsigned int, flags) - __field(void *, reserve_headq) - __field(void *, write_headq) + __field(int, reserveq) + __field(int, writeq) __field(int, grant_reserve_cycle) __field(int, grant_reserve_bytes) __field(int, 
grant_write_cycle) @@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res = tic->t_curr_res; __entry->unit_res = tic->t_unit_res; __entry->flags = tic->t_flags; - __entry->reserve_headq = log->l_reserve_headq; - __entry->write_headq = log->l_write_headq; - __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; - __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; - __entry->grant_write_cycle = log->l_grant_write_cycle; - __entry->grant_write_bytes = log->l_grant_write_bytes; + __entry->reserveq = list_empty(&log->l_reserveq); + __entry->writeq = list_empty(&log->l_writeq); + xlog_crack_grant_head(&log->l_grant_reserve_head, + &__entry->grant_reserve_cycle, + &__entry->grant_reserve_bytes); + xlog_crack_grant_head(&log->l_grant_write_head, + &__entry->grant_write_cycle, + &__entry->grant_write_bytes); __entry->curr_cycle = log->l_curr_cycle; __entry->curr_block = log->l_curr_block; - __entry->tail_lsn = log->l_tail_lsn; + __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " - "t_unit_res %u t_flags %s reserve_headq 0x%p " - "write_headq 0x%p grant_reserve_cycle %d " + "t_unit_res %u t_flags %s reserveq %s " + "writeq %s grant_reserve_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d " "grant_write_bytes %d curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", @@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res, __entry->unit_res, __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), - __entry->reserve_headq, - __entry->write_headq, + __entry->reserveq ? "empty" : "active", + __entry->writeq ? 
"empty" : "active", __entry->grant_reserve_cycle, __entry->grant_reserve_bytes, __entry->grant_write_cycle, @@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); @@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); @@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); -DECLARE_EVENT_CLASS(xfs_iomap_class, +DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int flags, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, flags, irec), + int type, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, type, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) + __field(int, type) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; + __entry->type = type; __entry->startoff = irec ? 
irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " - "offset 0x%llx count %zd flags %s " + "offset 0x%llx count %zd type %s " "startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __entry->new_size, __entry->offset, __entry->count, - __print_flags(__entry->flags, "|", BMAPI_FLAGS), + __print_symbolic(__entry->type, XFS_IO_TYPES), __entry->startoff, (__int64_t)__entry->startblock, __entry->blockcount) ) #define DEFINE_IOMAP_EVENT(name) \ -DEFINE_EVENT(xfs_iomap_class, name, \ +DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int flags, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, flags, irec)) -DEFINE_IOMAP_EVENT(xfs_iomap_enter); -DEFINE_IOMAP_EVENT(xfs_iomap_found); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); + int type, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, type, irec)) +DEFINE_IOMAP_EVENT(xfs_map_blocks_found); +DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_get_blocks_found); +DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \ TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); +DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); TRACE_EVENT(xfs_itruncate_start, @@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \ TP_PROTO(struct xfs_alloc_arg *args), \ TP_ARGS(args)) DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 
DEFINE_ALLOC_EVENT(xfs_alloc_near_first); @@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index faf8e1a83a1..d22aa310310 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -149,7 +149,6 @@ xfs_qm_dqdestroy( ASSERT(list_empty(&dqp->q_freelist)); mutex_destroy(&dqp->q_qlock); - sv_destroy(&dqp->q_pinwait); kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); atomic_dec(&xfs_Gqm->qm_totaldquots); diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 975aa10e1a4..0df88897ef8 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -25,86 +25,78 @@ #include "xfs_mount.h" #include "xfs_error.h" -static char message[1024]; /* keep it off the stack */ -static DEFINE_SPINLOCK(xfs_err_lock); - -/* Translate from CE_FOO 
to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ -#define XFS_MAX_ERR_LEVEL 7 -#define XFS_ERR_MASK ((1 << 3) - 1) -static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = - {KERN_EMERG, KERN_ALERT, KERN_CRIT, - KERN_ERR, KERN_WARNING, KERN_NOTICE, - KERN_INFO, KERN_DEBUG}; - void -cmn_err(register int level, char *fmt, ...) +cmn_err( + const char *lvl, + const char *fmt, + ...) { - char *fp = fmt; - int len; - ulong flags; - va_list ap; - - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; - spin_lock_irqsave(&xfs_err_lock,flags); - va_start(ap, fmt); - if (*fmt == '!') fp++; - len = vsnprintf(message, sizeof(message), fp, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; - printk("%s%s\n", err_level[level], message); - va_end(ap); - spin_unlock_irqrestore(&xfs_err_lock,flags); - BUG_ON(level == CE_PANIC); + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%pV", lvl, &vaf); + va_end(args); + + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); } void -xfs_fs_vcmn_err( - int level, +xfs_fs_cmn_err( + const char *lvl, struct xfs_mount *mp, - char *fmt, - va_list ap) + const char *fmt, + ...) { - unsigned long flags; - int len = 0; + struct va_format vaf; + va_list args; - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - spin_lock_irqsave(&xfs_err_lock,flags); + printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf); + va_end(args); - if (mp) { - len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); +} + +/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */ +void +xfs_cmn_err( + int panic_tag, + const char *lvl, + struct xfs_mount *mp, + const char *fmt, + ...) 
+{ + struct va_format vaf; + va_list args; + int do_panic = 0; - /* - * Skip the printk if we can't print anything useful - * due to an over-long device name. - */ - if (len >= sizeof(message)) - goto out; + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + printk(KERN_ALERT "XFS: Transforming an alert into a BUG."); + do_panic = 1; } - len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - printk("%s%s\n", err_level[level], message); - out: - spin_unlock_irqrestore(&xfs_err_lock,flags); + printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf); + va_end(args); - BUG_ON(level == CE_PANIC); + BUG_ON(do_panic); } void assfail(char *expr, char *file, int line) { - printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); + printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr, + file, line); BUG(); } diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index d2d20462fd4..05699f67d47 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -20,15 +20,22 @@ #include <stdarg.h> -#define CE_DEBUG 7 /* debug */ -#define CE_CONT 6 /* continuation */ -#define CE_NOTE 5 /* notice */ -#define CE_WARN 4 /* warning */ -#define CE_ALERT 1 /* alert */ -#define CE_PANIC 0 /* panic */ - -extern void cmn_err(int, char *, ...) - __attribute__ ((format (printf, 2, 3))); +struct xfs_mount; + +#define CE_DEBUG KERN_DEBUG +#define CE_CONT KERN_INFO +#define CE_NOTE KERN_NOTICE +#define CE_WARN KERN_WARNING +#define CE_ALERT KERN_ALERT +#define CE_PANIC KERN_EMERG + +void cmn_err(const char *lvl, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) 
__attribute__ ((format (printf, 3, 4))); +void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 4, 5))); + extern void assfail(char *expr, char *f, int l); #define ASSERT_ALWAYS(expr) \ diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 0135e2a669d..11dd72070cb 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -42,7 +42,7 @@ struct xfs_acl { #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) #ifdef CONFIG_XFS_POSIX_ACL -extern int xfs_check_acl(struct inode *inode, int mask); +extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags); extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); extern int xfs_acl_chmod(struct inode *inode); diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 63c7a1a6c02..58632cc17f2 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -227,7 +227,7 @@ typedef struct xfs_perag { atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - rwlock_t pag_ici_lock; /* incore inode lock */ + spinlock_t pag_ici_lock; /* incore inode cache lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ struct mutex pag_ici_reclaim_lock; /* serialisation point */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 112abc439ca..f3227984a9b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -41,10 +41,6 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -static int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - /* * Prototypes for per-ag allocation routines */ @@ -94,7 +90,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. 
*/ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -127,7 +123,7 @@ xfs_alloc_update( /* * Get the data from the pointed-to record. */ -STATIC int /* error */ +int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t *bno, /* output: starting block of extent */ @@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact( xfs_extlen_t rlen; /* length of returned extent */ ASSERT(args->alignment == 1); + /* * Allocate/initialize a cursor for the by-number freespace btree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->agno, XFS_BTNUM_BNO); + /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). * Look for the closest free block <= bno, it must contain bno * if any free block does. */ - if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) + error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); + if (error) goto error0; - if (!i) { - /* - * Didn't find it, return null. - */ - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - return 0; - } + if (!i) + goto not_found; + /* * Grab the freespace record. */ - if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) + error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); + if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); ASSERT(fbno <= args->agbno); minend = args->agbno + args->minlen; maxend = args->agbno + args->maxlen; fend = fbno + flen; + /* * Give up if the freespace isn't long enough for the minimum request. */ - if (fend < minend) { - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - return 0; - } + if (fend < minend) + goto not_found; + /* * End of extent will be smaller of the freespace end and the * maximal requested end. 
- */ - end = XFS_AGBLOCK_MIN(fend, maxend); - /* + * * Fix the length according to mod and prod if given. */ + end = XFS_AGBLOCK_MIN(fend, maxend); args->len = end - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - return 0; - } + if (!xfs_alloc_fix_minleft(args)) + goto not_found; + rlen = args->len; ASSERT(args->agbno + rlen <= fend); end = args->agbno + rlen; + /* * We are allocating agbno for rlen [agbno .. end] * Allocate/initialize a cursor for the by-size btree. @@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact( args->agno, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, - args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { + error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, + args->len, XFSA_FIXUP_BNO_OK); + if (error) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); goto error0; } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_exact_done(args); args->wasfromfl = 0; + trace_xfs_alloc_exact_done(args); + return 0; + +not_found: + /* Didn't find it, return null. */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_exact_notfound(args); return 0; error0: @@ -659,6 +661,95 @@ error0: } /* + * Search the btree in a given direction via the search cursor and compare + * the records found against the good extent we've already found. 
+ */ +STATIC int +xfs_alloc_find_best_extent( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur **gcur, /* good cursor */ + struct xfs_btree_cur **scur, /* searching cursor */ + xfs_agblock_t gdiff, /* difference for search comparison */ + xfs_agblock_t *sbno, /* extent found by search */ + xfs_extlen_t *slen, + xfs_extlen_t *slena, /* aligned length */ + int dir) /* 0 = search right, 1 = search left */ +{ + xfs_agblock_t bno; + xfs_agblock_t new; + xfs_agblock_t sdiff; + int error; + int i; + + /* The good extent is perfect, no need to search. */ + if (!gdiff) + goto out_use_good; + + /* + * Look until we find a better one, run out of space or run off the end. + */ + do { + error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(*sbno, *slen, args->alignment, + args->minlen, &bno, slena); + + /* + * The good extent is closer than this one. + */ + if (!dir) { + if (bno >= args->agbno + gdiff) + goto out_use_good; + } else { + if (bno <= args->agbno - gdiff) + goto out_use_good; + } + + /* + * Same distance, compare length and pick the best. + */ + if (*slena >= args->minlen) { + args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); + xfs_alloc_fix_len(args); + + sdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, *sbno, + *slen, &new); + + /* + * Choose closer size and invalidate other cursor. 
+ */ + if (sdiff < gdiff) + goto out_use_search; + goto out_use_good; + } + + if (!dir) + error = xfs_btree_increment(*scur, 0, &i); + else + error = xfs_btree_decrement(*scur, 0, &i); + if (error) + goto error0; + } while (i); + +out_use_good: + xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); + *scur = NULL; + return 0; + +out_use_search: + xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); + *gcur = NULL; + return 0; + +error0: + /* caller invalidates cursors */ + return error; +} + +/* * Allocate a variable extent near bno in the allocation group agno. * Extent's length (returned in len) will be between minlen and maxlen, * and of the form k * prod + mod unless there's nothing that large. @@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near( } } } while (bno_cur_lt || bno_cur_gt); + /* * Got both cursors still active, need to find better entry. */ if (bno_cur_lt && bno_cur_gt) { - /* - * Left side is long enough, look for a right side entry. - */ if (ltlena >= args->minlen) { /* - * Fix up the length. + * Left side is good, look for a right side entry. */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - rlen = args->len; - ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, ltbno, ltlen, &ltnew); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_lt, &bno_cur_gt, + ltdiff, &gtbno, &gtlen, &gtlena, + 0 /* search right */); + } else { + ASSERT(gtlena >= args->minlen); + /* - * Not perfect. - */ - if (ltdiff) { - /* - * Look until we find a better one, run out of - * space, or run off the end. - */ - while (bno_cur_lt && bno_cur_gt) { - if ((error = xfs_alloc_get_rec( - bno_cur_gt, &gtbno, - &gtlen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(gtbno, gtlen, - args->alignment, args->minlen, - &gtbnoa, &gtlena); - /* - * The left one is clearly better. 
- */ - if (gtbnoa >= args->agbno + ltdiff) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - break; - } - /* - * If we reach a big enough entry, - * compare the two and pick the best. - */ - if (gtlena >= args->minlen) { - args->len = - XFS_EXTLEN_MIN(gtlena, - args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - gtdiff = xfs_alloc_compute_diff( - args->agbno, rlen, - args->alignment, - gtbno, gtlen, &gtnew); - /* - * Right side is better. - */ - if (gtdiff < ltdiff) { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - /* - * Left side is better. - */ - else { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - break; - } - /* - * Fell off the right end. - */ - if ((error = xfs_btree_increment( - bno_cur_gt, 0, &i))) - goto error0; - if (!i) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - break; - } - } - } - /* - * The left side is perfect, trash the right side. - */ - else { - xfs_btree_del_cursor(bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - } - /* - * It's the right side that was found first, look left. - */ - else { - /* - * Fix up the length. + * Right side is good, look for a left side entry. */ args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); - rlen = args->len; - gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, + gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, gtbno, gtlen, &gtnew); - /* - * Right side entry isn't perfect. - */ - if (gtdiff) { - /* - * Look until we find a better one, run out of - * space, or run off the end. - */ - while (bno_cur_lt && bno_cur_gt) { - if ((error = xfs_alloc_get_rec( - bno_cur_lt, &ltbno, - &ltlen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - &ltbnoa, &ltlena); - /* - * The right one is clearly better. 
- */ - if (ltbnoa <= args->agbno - gtdiff) { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - break; - } - /* - * If we reach a big enough entry, - * compare the two and pick the best. - */ - if (ltlena >= args->minlen) { - args->len = XFS_EXTLEN_MIN( - ltlena, args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - ltdiff = xfs_alloc_compute_diff( - args->agbno, rlen, - args->alignment, - ltbno, ltlen, &ltnew); - /* - * Left side is better. - */ - if (ltdiff < gtdiff) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - /* - * Right side is better. - */ - else { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - break; - } - /* - * Fell off the left end. - */ - if ((error = xfs_btree_decrement( - bno_cur_lt, 0, &i))) - goto error0; - if (!i) { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - break; - } - } - } - /* - * The right side is perfect, trash the left side. - */ - else { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } + + error = xfs_alloc_find_best_extent(args, + &bno_cur_gt, &bno_cur_lt, + gtdiff, &ltbno, &ltlen, &ltlena, + 1 /* search left */); } + + if (error) + goto error0; } + /* * If we couldn't get anything, give up. */ @@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near( args->agbno = NULLAGBLOCK; return 0; } + /* * At this point we have selected a freespace entry, either to the * left or to the right. If it's on the right, copy all the @@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near( j = 1; } else j = 0; + /* * Fix up the length and compute the useful address. */ @@ -2676,7 +2611,7 @@ restart: * will require a synchronous transaction, but it can still be * used to distinguish between a partial or exact match. 
*/ -static int +int xfs_alloc_busy_search( struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 895009a9727..0ab56b32c7e 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -19,6 +19,7 @@ #define __XFS_ALLOC_H__ struct xfs_buf; +struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; @@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); #ifdef __KERNEL__ - void -xfs_alloc_busy_insert(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); void xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); +int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); #endif /* __KERNEL__ */ /* @@ -205,4 +206,18 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index a6cff8edcdb..71e90dc2aeb 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * It didn't all fit, so we have to sort everything on hashval. 
*/ sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); /* * Scan the attribute list for the rest of the entries, storing @@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP); + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 04f9cca8da7..2f9e97c128a 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -634,9 +634,8 @@ xfs_btree_read_bufl( return error; } ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp != NULL) { + if (bp) XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); - } *bpp = bp; return 0; } @@ -944,13 +943,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2686d0d54c5..98c6f73b675 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -141,8 +141,7 @@ xfs_buf_item_log_check( #define xfs_buf_item_log_check(x) #endif -STATIC void xfs_buf_error_relse(xfs_buf_t *bp); -STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* * This returns the number of log iovecs needed to log 
the @@ -450,7 +449,7 @@ xfs_buf_item_unpin( * xfs_trans_ail_delete() drops the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); + xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); } else { @@ -918,15 +917,26 @@ xfs_buf_attach_iodone( XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); } +/* + * We can have many callbacks on a buffer. Running the callbacks individually + * can cause a lot of contention on the AIL lock, so we allow for a single + * callback to be able to scan the remaining lip->li_bio_list for other items + * of the same type and callback to be processed in the first call. + * + * As a result, the loop walking the callback list below will also modify the + * list. it removes the first item from the list and then runs the callback. + * The loop then restarts from the new head of the list. This allows the + * callback to scan and modify the list attached to the buffer and we don't + * have to care about maintaining a next item pointer. 
+ */ STATIC void xfs_buf_do_callbacks( - xfs_buf_t *bp, - xfs_log_item_t *lip) + struct xfs_buf *bp) { - xfs_log_item_t *nlip; + struct xfs_log_item *lip; - while (lip != NULL) { - nlip = lip->li_bio_list; + while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { + XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); ASSERT(lip->li_cb != NULL); /* * Clear the next pointer so we don't have any @@ -936,7 +946,6 @@ xfs_buf_do_callbacks( */ lip->li_bio_list = NULL; lip->li_cb(bp, lip); - lip = nlip; } } @@ -949,128 +958,76 @@ xfs_buf_do_callbacks( */ void xfs_buf_iodone_callbacks( - xfs_buf_t *bp) + struct xfs_buf *bp) { - xfs_log_item_t *lip; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - xfs_mount_t *mp; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (likely(!XFS_BUF_GETERROR(bp))) + goto do_callbacks; - if (XFS_BUF_GETERROR(bp) != 0) { - /* - * If we've already decided to shutdown the filesystem - * because of IO errors, there's no point in giving this - * a retry. - */ - mp = lip->li_mountp; - if (XFS_FORCED_SHUTDOWN(mp)) { - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - XFS_BUF_SUPER_STALE(bp); - trace_xfs_buf_item_iodone(bp, _RET_IP_); - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); - return; - } + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. 
+ */ + if (XFS_FORCED_SHUTDOWN(mp)) { + XFS_BUF_SUPER_STALE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } - if ((XFS_BUF_TARGET(bp) != lasttarg) || - (time_after(jiffies, (lasttime + 5*HZ)))) { - lasttime = jiffies; - cmn_err(CE_ALERT, "Device %s, XFS metadata write error" - " block 0x%llx in %s", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); - } - lasttarg = XFS_BUF_TARGET(bp); + if (XFS_BUF_TARGET(bp) != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + cmn_err(CE_ALERT, "Device %s, XFS metadata write error" + " block 0x%llx in %s", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); + } + lasttarg = XFS_BUF_TARGET(bp); - if (XFS_BUF_ISASYNC(bp)) { - /* - * If the write was asynchronous then noone will be - * looking for the error. Clear the error state - * and write the buffer out again delayed write. - * - * XXXsup This is OK, so long as we catch these - * before we start the umount; we don't want these - * DELWRI metadata bufs to be hanging around. - */ - XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ - - if (!(XFS_BUF_ISSTALE(bp))) { - XFS_BUF_DELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); - } - ASSERT(XFS_BUF_IODONE_FUNC(bp)); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); - } else { - /* - * If the write of the buffer was not asynchronous, - * then we want to make sure to return the error - * to the caller of bwrite(). Because of this we - * cannot clear the B_ERROR state at this point. - * Instead we install a callback function that - * will be called when the buffer is released, and - * that routine will clear the error state and - * set the buffer to be written out again after - * some delay. - */ - /* We actually overwrite the existing b-relse - function at times, but we're gonna be shutting down - anyway. 
*/ - XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); + /* + * If the write was asynchronous then noone will be looking for the + * error. Clear the error state and write the buffer out again. + * + * During sync or umount we'll write all pending buffers again + * synchronous, which will catch these errors if they keep hanging + * around. + */ + if (XFS_BUF_ISASYNC(bp)) { + XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + + if (!XFS_BUF_ISSTALE(bp)) { + XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_FINISH_IOWAIT(bp); + XFS_BUF_SET_START(bp); } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_relse(bp); return; } - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); -} - -/* - * This is a callback routine attached to a buffer which gets an error - * when being written out synchronously. - */ -STATIC void -xfs_buf_error_relse( - xfs_buf_t *bp) -{ - xfs_log_item_t *lip; - xfs_mount_t *mp; - - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - mp = (xfs_mount_t *)lip->li_mountp; - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ XFS_BUF_STALE(bp); XFS_BUF_DONE(bp); XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_ERROR(bp,0); trace_xfs_buf_error_relse(bp, _RET_IP_); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - if (! XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - /* - * We have to unpin the pinned buffers so do the - * callbacks. - */ - xfs_buf_do_callbacks(bp, lip); +do_callbacks: + xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_SET_BRELSE_FUNC(bp,NULL); - xfs_buf_relse(bp); + xfs_buf_ioend(bp, 0); } - /* * This is the iodone() function for buffers which have been * logged. It is called when they are eventually flushed out. 
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 0e2ed43f16c..b6ecd2061e7 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item { xfs_buf_log_format_t bli_format; /* in-log header */ } xfs_buf_log_item_t; -/* - * This structure is used during recovery to record the buf log - * items which have been canceled and should not be replayed. - */ -typedef struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; - uint bc_len; - int bc_refcount; - struct xfs_buf_cancel *bc_next; -} xfs_buf_cancel_t; - void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c78cc6a3d87..4c7db74a05f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) } #endif /* DEBUG */ - -void -xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - -void -xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) 
-{ - va_list ap; - -#ifdef DEBUG - xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); -#endif - - if (xfs_panic_mask && (xfs_panic_mask & panic_tag) - && (level & CE_ALERT)) { - level &= ~CE_ALERT; - level |= CE_PANIC; - cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); - } - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - void xfs_error_report( const char *tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index f338847f80b..10dce5475f0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define xfs_errortag_add(tag, mp) (ENOSYS) @@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); struct xfs_mount; -extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp, - char *fmt, va_list ap) - __attribute__ ((format (printf, 3, 0))); -extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...) - __attribute__ ((format (printf, 4, 5))); -extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); - extern void xfs_hex_dump(void *p, int length); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) #define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? 
(void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) + do { \ + if (!(f & XFS_MFSI_QUIET)) \ + cmn_err(CE_WARN, "XFS: " fmt, ## args); \ + } while (0) #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a55e687bf56..75f2ef60e57 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -48,6 +48,28 @@ xfs_efi_item_free( } /* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the + * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees + * the EFI. + */ +STATIC void +__xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + struct xfs_ail *ailp = efip->efi_item.li_ailp; + + if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_efi_item_free(efip); + } +} + +/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. @@ -74,7 +96,8 @@ xfs_efi_item_format( struct xfs_efi_log_item *efip = EFI_ITEM(lip); uint size; - ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); + ASSERT(atomic_read(&efip->efi_next_extent) == + efip->efi_format.efi_nextents); efip->efi_format.efi_type = XFS_LI_EFI; @@ -99,10 +122,12 @@ xfs_efi_item_pin( } /* - * While EFIs cannot really be pinned, the unpin operation is the - * last place at which the EFI is manipulated during a transaction. - * Here we coordinate with xfs_efi_cancel() to determine who gets to - * free the EFI. + * While EFIs cannot really be pinned, the unpin operation is the last place at + * which the EFI is manipulated during a transaction. 
If we are being asked to + * remove the EFI it's because the transaction has been cancelled and by + * definition that means the EFI cannot be in the AIL so remove it from the + * transaction and free it. Otherwise coordinate with xfs_efi_release() (via + * XFS_EFI_COMMITTED) to determine who gets to free the EFI. */ STATIC void xfs_efi_item_unpin( @@ -110,20 +135,14 @@ xfs_efi_item_unpin( int remove) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_ail *ailp = lip->li_ailp; - - spin_lock(&ailp->xa_lock); - if (efip->efi_flags & XFS_EFI_CANCELED) { - if (remove) - xfs_trans_del_item(lip); - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + if (remove) { + ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); + xfs_trans_del_item(lip); xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_COMMITTED; - spin_unlock(&ailp->xa_lock); + return; } + __xfs_efi_release(efip); } /* @@ -152,16 +171,20 @@ xfs_efi_item_unlock( } /* - * The EFI is logged only once and cannot be moved in the log, so - * simply return the lsn at which it's been logged. The canceled - * flag is not paid any attention here. Checking for that is delayed - * until the EFI is unpinned. + * The EFI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. For bulk transaction committed + * processing, the EFI may be processed but not yet unpinned prior to the EFD + * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected + * when processing the EFD. 
*/ STATIC xfs_lsn_t xfs_efi_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); return lsn; } @@ -230,6 +253,7 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; + atomic_set(&efip->efi_next_extent, 0); return efip; } @@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to - * the given efi item. Each efd calls this with the number of - * extents that it has logged, and when the sum of these reaches - * the total number of extents logged by this efi item we can free - * the efi item. - * - * Freeing the efi item requires that we remove it from the AIL. - * We'll use the AIL lock to protect our counters as well as - * the removal from the AIL. + * This is called by the efd item code below to release references to the given + * efi item. Each efd calls this with the number of extents that it has + * logged, and when the sum of these reaches the total number of extents logged + * by this efi item we can free the efi item. */ void xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { - struct xfs_ail *ailp = efip->efi_item.li_ailp; - int extents_left; - - ASSERT(efip->efi_next_extent > 0); - ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); - - spin_lock(&ailp->xa_lock); - ASSERT(efip->efi_next_extent >= nextents); - efip->efi_next_extent -= nextents; - extents_left = efip->efi_next_extent; - if (extents_left == 0) { - /* xfs_trans_ail_delete() drops the AIL lock. 
*/ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); - xfs_efi_item_free(efip); - } else { - spin_unlock(&ailp->xa_lock); - } + ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + __xfs_efi_release(efip); } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0d22c56fdf6..375f68e4253 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 { #define XFS_EFI_MAX_FAST_EXTENTS 16 /* - * Define EFI flags. + * Define EFI flag bits. Manipulated by set/clear/test_bit operators. */ -#define XFS_EFI_RECOVERED 0x1 -#define XFS_EFI_COMMITTED 0x2 -#define XFS_EFI_CANCELED 0x4 +#define XFS_EFI_RECOVERED 1 +#define XFS_EFI_COMMITTED 2 /* * This is the "extent free intention" log item. It is used @@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 { */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; - uint efi_flags; /* misc flags */ - uint efi_next_extent; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; } xfs_efi_log_item_t; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a7c116e814a..cec89dd5d7d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -374,6 +374,7 @@ xfs_growfs_data_private( mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else mp->m_maxicount = 0; + xfs_set_low_space_thresholds(mp); /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { @@ -611,12 +612,13 @@ out: * * We cannot use an inode here for this - that will push dirty state back up * into the VFS and then periodic inode flushing will prevent log covering from - * making progress. Hence we log a field in the superblock instead. + * making progress. 
Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. */ int xfs_fs_log_dummy( - xfs_mount_t *mp, - int flags) + xfs_mount_t *mp) { xfs_trans_t *tp; int error; @@ -631,8 +633,7 @@ xfs_fs_log_dummy( /* log the UUID because it is an unchanging field */ xfs_mod_sb(tp, XFS_SB_UUID); - if (flags & SYNC_WAIT) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index a786c5212c1..1b6a98b6688 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0cdd26932d8..cb9b6d1469f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -43,6 +43,17 @@ /* + * Define xfs inode iolock lockdep classes. We need to ensure that all active + * inodes are considered the same for lockdep purposes, including inodes that + * are recycled through the XFS_IRECLAIMABLE state. This is the only way to + * guarantee the locks are considered the same when there are multiple lock + * initialisation sites. Also, define a reclaimable inode class so it is + * obvious in lockdep reports which class the report is against. + */ +static struct lock_class_key xfs_iolock_active; +struct lock_class_key xfs_iolock_reclaimable; + +/* * Allocate and initialise an xfs_inode. 
*/ STATIC struct xfs_inode * @@ -69,8 +80,11 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); + ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); /* initialise the xfs inode */ ip->i_ino = ino; @@ -85,12 +99,20 @@ xfs_inode_alloc( ip->i_size = 0; ip->i_new_size = 0; - /* prevent anyone from using this yet */ - VFS_I(ip)->i_state = I_NEW; - return ip; } +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + INIT_LIST_HEAD(&inode->i_dentry); + kmem_zone_free(xfs_inode_zone, ip); +} + void xfs_inode_free( struct xfs_inode *ip) @@ -134,7 +156,18 @@ xfs_inode_free( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - kmem_zone_free(xfs_inode_zone, ip); + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } /* @@ -144,14 +177,29 @@ static int xfs_iget_cache_hit( struct xfs_perag *pag, struct xfs_inode *ip, + xfs_ino_t ino, int flags, - int lock_flags) __releases(pag->pag_ici_lock) + int lock_flags) __releases(RCU) { struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; int error; + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. 
+ * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + /* * If we are racing with another cache hit that is currently @@ -194,7 +242,7 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_IRECLAIM; spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); error = -inode_init_always(mp->m_super, inode); if (error) { @@ -202,7 +250,7 @@ xfs_iget_cache_hit( * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~XFS_INEW; @@ -212,14 +260,20 @@ xfs_iget_cache_hit( goto out_error; } - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); ip->i_flags |= XFS_INEW; __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); + spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -230,7 +284,7 @@ xfs_iget_cache_hit( /* We've got a live one. 
*/ spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); trace_xfs_iget_hit(ip); } @@ -244,7 +298,7 @@ xfs_iget_cache_hit( out_error: spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); return error; } @@ -297,7 +351,7 @@ xfs_iget_cache_miss( BUG(); } - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); /* insert the new inode */ error = radix_tree_insert(&pag->pag_ici_root, agino, ip); @@ -312,14 +366,14 @@ xfs_iget_cache_miss( ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); *ipp = ip; return 0; out_preload_end: - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); if (lock_flags) xfs_iunlock(ip, lock_flags); @@ -366,7 +420,7 @@ xfs_iget( xfs_agino_t agino; /* reject inode numbers outside existing AGs */ - if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return EINVAL; /* get the perag structure and ensure that it's inode capable */ @@ -375,15 +429,15 @@ xfs_iget( again: error = 0; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); if (ip) { - error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); if (error) goto out_error_or_again; } else { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); XFS_STATS_INC(xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 108c7a085f9..be7cf625421 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -887,7 +887,7 @@ xfs_iread( * around for a while. This helps to keep recently accessed * meta-data in-core longer. 
*/ - XFS_BUF_SET_REF(bp, XFS_INO_REF); + xfs_buf_set_ref(bp, XFS_INO_REF); /* * Use xfs_trans_brelse() to release the buffer containing the @@ -2000,17 +2000,33 @@ xfs_ifree_cluster( */ for (i = 0; i < ninodes; i++) { retry: - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); - /* Inode not in memory or stale, nothing to do */ - if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { - read_unlock(&pag->pag_ici_lock); + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); continue; } /* + * because this is an RCU protected lookup, we could + * find a recently freed or even reallocated inode + * during the lookup. We need to check under the + * i_flags_lock for a valid inode here. Skip it if it + * is not valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum + i || + __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* * Don't try to lock/unlock the current inode, but we * _cannot_ skip the other inodes that we did not find * in the list attached to the buffer and are not @@ -2019,11 +2035,11 @@ retry: */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); delay(1); goto retry; } - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); @@ -2629,7 +2645,7 @@ xfs_iflush_cluster( mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); /* really need a gang lookup range call here */ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, first_index, inodes_per_cluster); @@ -2640,9 +2656,21 @@ xfs_iflush_cluster( iq = ilist[i]; if (iq == ip) continue; - /* if the inode lies outside this cluster, we're done. 
*/ - if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) - break; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino || + (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); + /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated @@ -2692,7 +2720,7 @@ xfs_iflush_cluster( } out_free: - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); kmem_free(ilist); out_put: xfs_perag_put(pag); @@ -2704,7 +2732,7 @@ cluster_corrupt_out: * Corruption detected in the clustering loop. Invalidate the * inode buffer and shut down the filesystem. */ - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); /* * Clean up the buffer. If it was B_DELWRI, just release it -- * brelse can handle it with no problems. If not, shut down the diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fb2ca2e4cdc..5c95fa8ec11 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) /* * In-core inode flags. 
*/ -#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ -#define XFS_ISTALE 0x0002 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ -#define XFS_INEW 0x0008 /* inode has just been allocated */ -#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ -#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ +#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ +#define XFS_ISTALE 0x0002 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ +#define XFS_INEW 0x0008 /* inode has just been allocated */ +#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ +#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ /* * Flags for inode locking. @@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) +extern struct lock_class_key xfs_iolock_reclaimable; + /* * Flags for xfs_itruncate_start(). */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7c8d30c453c..fd4f398bd6f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -842,15 +842,64 @@ xfs_inode_item_destroy( * flushed to disk. It is responsible for removing the inode item * from the AIL if it has not been re-logged, and unlocking the inode's * flush lock. + * + * To reduce AIL lock traffic as much as possible, we scan the buffer log item + * list for other inodes that will run this function. We remove them from the + * buffer list so we can process all the inode IO completions in one AIL lock + * traversal. 
*/ void xfs_iflush_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - xfs_inode_t *ip = iip->ili_inode; + struct xfs_inode_log_item *iip; + struct xfs_log_item *blip; + struct xfs_log_item *next; + struct xfs_log_item *prev; struct xfs_ail *ailp = lip->li_ailp; + int need_ail = 0; + + /* + * Scan the buffer IO completions for other inodes being completed and + * attach them to the current inode log item. + */ + blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + prev = NULL; + while (blip != NULL) { + if (lip->li_cb != xfs_iflush_done) { + prev = blip; + blip = blip->li_bio_list; + continue; + } + + /* remove from list */ + next = blip->li_bio_list; + if (!prev) { + XFS_BUF_SET_FSPRIVATE(bp, next); + } else { + prev->li_bio_list = next; + } + + /* add to current list */ + blip->li_bio_list = lip->li_bio_list; + lip->li_bio_list = blip; + + /* + * while we have the item, do the unlocked check for needing + * the AIL lock. + */ + iip = INODE_ITEM(blip); + if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + blip = next; + } + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + need_ail++; /* * We only want to pull the item from the AIL if it is @@ -861,28 +910,37 @@ xfs_iflush_done( * the lock since it's cheaper, and then we recheck while * holding the lock before removing the inode from the AIL. */ - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { + if (need_ail) { + struct xfs_log_item *log_items[need_ail]; + int i = 0; spin_lock(&ailp->xa_lock); - if (lip->li_lsn == iip->ili_flush_lsn) { - /* xfs_trans_ail_delete() drops the AIL lock. 
*/ - xfs_trans_ail_delete(ailp, lip); - } else { - spin_unlock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { + iip = INODE_ITEM(blip); + if (iip->ili_logged && + blip->li_lsn == iip->ili_flush_lsn) { + log_items[i++] = blip; + } + ASSERT(i <= need_ail); } + /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ + xfs_trans_ail_delete_bulk(ailp, log_items, i); } - iip->ili_logged = 0; /* - * Clear the ili_last_fields bits now that we know that the - * data corresponding to them is safely on disk. + * clean up and unlock the flush lock now we are done. We can clear the + * ili_last_fields bits now that we know that the data corresponding to + * them is safely on disk. */ - iip->ili_last_fields = 0; + for (blip = lip; blip; blip = next) { + next = blip->li_bio_list; + blip->li_bio_list = NULL; - /* - * Release the inode's flush lock since we're done with it. - */ - xfs_ifunlock(ip); + iip = INODE_ITEM(blip); + iip->ili_logged = 0; + iip->ili_last_fields = 0; + xfs_ifunlock(iip->ili_inode); + } } /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 20576146369..55582bd6665 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -47,127 +47,8 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -#define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - int, struct xfs_bmbt_irec *, int *); -STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); - -int -xfs_iomap( - struct xfs_inode *ip, - xfs_off_t offset, - ssize_t count, - int flags, - struct xfs_bmbt_irec *imap, - int *nimaps, - int *new) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - int bmapi_flags = 0; - - 
ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - - *new = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - trace_xfs_iomap_enter(ip, offset, count, flags, NULL); - - switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { - case BMAPI_READ: - lockmode = xfs_ilock_map_shared(ip); - bmapi_flags = XFS_BMAPI_ENTIRE; - break; - case BMAPI_WRITE: - lockmode = XFS_ILOCK_EXCL; - if (flags & BMAPI_IGNSTATE) - bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; - xfs_ilock(ip, lockmode); - break; - case BMAPI_ALLOCATE: - lockmode = XFS_ILOCK_SHARED; - bmapi_flags = XFS_BMAPI_ENTIRE; - - /* Attempt non-blocking lock */ - if (flags & BMAPI_TRYLOCK) { - if (!xfs_ilock_nowait(ip, lockmode)) - return XFS_ERROR(EAGAIN); - } else { - xfs_ilock(ip, lockmode); - } - break; - default: - BUG(); - } - - ASSERT(offset <= mp->m_maxioffset); - if ((xfs_fsize_t)offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, imap, - nimaps, NULL); - - if (error) - goto out; - - switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { - case BMAPI_WRITE: - /* If we found an extent, return it */ - if (*nimaps && - (imap->br_startblock != HOLESTARTBLOCK) && - (imap->br_startblock != DELAYSTARTBLOCK)) { - trace_xfs_iomap_found(ip, offset, count, flags, imap); - break; - } - - if (flags & BMAPI_DIRECT) { - error = xfs_iomap_write_direct(ip, offset, count, flags, - imap, nimaps); - } else { - error = xfs_iomap_write_delay(ip, offset, count, flags, - imap, nimaps); - } - if (!error) { - trace_xfs_iomap_alloc(ip, offset, count, flags, imap); - } - *new = 1; - break; - case BMAPI_ALLOCATE: - /* If we found an extent, return it */ - xfs_iunlock(ip, lockmode); - lockmode = 0; - - if (*nimaps && !isnullstartblock(imap->br_startblock)) { - trace_xfs_iomap_found(ip, offset, 
count, flags, imap); - break; - } - - error = xfs_iomap_write_allocate(ip, offset, count, - imap, nimaps); - break; - } - - ASSERT(*nimaps <= 1); - -out: - if (lockmode) - xfs_iunlock(ip, lockmode); - return XFS_ERROR(error); -} - STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, @@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero( return EFSCORRUPTED; } -STATIC int +int xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int flags, xfs_bmbt_irec_t *imap, - int *nmaps) + int nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -279,7 +159,7 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) + if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) imap->br_blockcount + imap->br_startoff); @@ -331,7 +211,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip); bmapi_flag = XFS_BMAPI_WRITE; - if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) + if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -370,7 +250,6 @@ xfs_iomap_write_direct( goto error_out; } - *nmaps = 1; return 0; error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ @@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - *nmaps = 0; /* nothing set-up here */ error_out: return XFS_ERROR(error); @@ -389,6 +267,9 @@ error_out: * If the caller is doing a write at the end of the file, then extend the * allocation out to the file system's write iosize. We clean up any extra * space left over when the file is closed in xfs_inactive(). + * + * If we find we already have delalloc preallocation beyond EOF, don't do more + * preallocation as it is not needed.
*/ STATIC int xfs_iomap_eof_want_preallocate( @@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int ioflag, xfs_bmbt_irec_t *imap, int nimaps, int *prealloc) @@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate( xfs_filblks_t count_fsb; xfs_fsblock_t firstblock; int n, error, imaps; + int found_delalloc = 0; *prealloc = 0; if ((offset + count) <= ip->i_size) @@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate( return 0; start_fsb += imap[n].br_blockcount; count_fsb -= imap[n].br_blockcount; + + if (imap[n].br_startblock == DELAYSTARTBLOCK) + found_delalloc = 1; } } - *prealloc = 1; + if (!found_delalloc) + *prealloc = 1; return 0; } -STATIC int +/* + * If we don't have a user specified preallocation size, dynamically increase + * the preallocation size as the size of the file grows. Cap the maximum size + * at a single extent or less if the filesystem is near full. The closer the + * filesystem is to full, the smaller the maximum preallocation.
+ */ +STATIC xfs_fsblock_t +xfs_iomap_prealloc_size( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + xfs_fsblock_t alloc_blocks = 0; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + int shift = 0; + int64_t freesp; + + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size); + alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, + rounddown_pow_of_two(alloc_blocks)); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + if (shift) + alloc_blocks >>= shift; + } + + if (alloc_blocks < mp->m_writeio_blocks) + alloc_blocks = mp->m_writeio_blocks; + + return alloc_blocks; +} + +int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int ioflag, - xfs_bmbt_irec_t *ret_imap, - int *nmaps) + xfs_bmbt_irec_t *ret_imap) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -469,16 +396,19 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, - ioflag, imap, XFS_WRITE_IMAPS, &prealloc); + imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; retry: if (prealloc) { + xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + mp->m_writeio_blocks; + last_fsb = ioalign + alloc_blocks; } else { last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); } @@ -496,22 +426,31 @@ retry: XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, &nimaps, NULL); - if (error && (error != ENOSPC)) + switch (error) { + case 0: + case ENOSPC: + case EDQUOT: + break; + 
default: return XFS_ERROR(error); + } /* - * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space - flush all other inodes with - * delalloc blocks and retry without EOF preallocation. + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For + * ENOSPC, flush all other inodes with delalloc blocks to free up + * some of the excess reserved metadata space. For both cases, retry + * without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); if (flushed) - return XFS_ERROR(ENOSPC); + return XFS_ERROR(error ? error : ENOSPC); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error == ENOSPC) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + } flushed = 1; error = 0; @@ -523,8 +462,6 @@ retry: return xfs_cmn_err_fsblock_zero(ip, &imap[0]); *ret_imap = imap[0]; - *nmaps = 1; - return 0; } @@ -538,13 +475,12 @@ retry: * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range. */ -STATIC int +int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - xfs_bmbt_irec_t *imap, - int *retmap) + xfs_bmbt_irec_t *imap) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb, last_block; @@ -557,8 +493,6 @@ xfs_iomap_write_allocate( int error = 0; int nres; - *retmap = 0; - /* * Make sure that the dquots are there.
*/ @@ -680,7 +614,6 @@ xfs_iomap_write_allocate( if ((offset_fsb >= imap->br_startoff) && (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { - *retmap = 1; XFS_STATS_INC(xs_xstrat_quick); return 0; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 7748a430f50..80615760959 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,30 +18,15 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -/* base extent manipulation calls */ -#define BMAPI_READ (1 << 0) /* read extents */ -#define BMAPI_WRITE (1 << 1) /* create extents */ -#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */ - -/* modifiers */ -#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */ -#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */ -#define BMAPI_MMA (1 << 6) /* allocate for mmap write */ -#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */ - -#define BMAPI_FLAGS \ - { BMAPI_READ, "READ" }, \ - { BMAPI_WRITE, "WRITE" }, \ - { BMAPI_ALLOCATE, "ALLOCATE" }, \ - { BMAPI_IGNSTATE, "IGNSTATE" }, \ - { BMAPI_DIRECT, "DIRECT" }, \ - { BMAPI_TRYLOCK, "TRYLOCK" } - struct xfs_inode; struct xfs_bmbt_irec; -extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, - struct xfs_bmbt_irec *, int *, int *); +extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int); +extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index cee4ab9f8a9..ae6fef1ff56 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int xlog_space_left(xlog_t *log, int 
cycle, int bytes); +STATIC int xlog_space_left(struct log *log, atomic64_t *head); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); @@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); /* local functions to manipulate grant head */ STATIC int xlog_grant_log_space(xlog_t *log, xlog_ticket_t *xtic); -STATIC void xlog_grant_push_ail(xfs_mount_t *mp, +STATIC void xlog_grant_push_ail(struct log *log, int need_bytes); STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket); @@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); -STATIC void xlog_verify_grant_head(xlog_t *log, int equals); +STATIC void xlog_verify_grant_tail(struct log *log); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) -#define xlog_verify_grant_head(a,b) +#define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c,d) #define xlog_verify_tail_lsn(a,b,c) #endif STATIC int xlog_iclogs_empty(xlog_t *log); - static void -xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +xlog_grant_sub_space( + struct log *log, + atomic64_t *head, + int bytes) { - if (*qp) { - tic->t_next = (*qp); - tic->t_prev = (*qp)->t_prev; - (*qp)->t_prev->t_next = tic; - (*qp)->t_prev = tic; - } else { - tic->t_prev = tic->t_next = tic; - *qp = tic; - } + int64_t head_val = atomic64_read(head); + int64_t new, old; - tic->t_flags |= XLOG_TIC_IN_Q; -} + do { + int cycle, space; -static void -xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) -{ - if (tic == tic->t_next) { - *qp = NULL; - } else { - *qp = tic->t_next; - tic->t_next->t_prev = tic->t_prev; - tic->t_prev->t_next = tic->t_next; - } + 
xlog_crack_grant_head_val(head_val, &cycle, &space); - tic->t_next = tic->t_prev = NULL; - tic->t_flags &= ~XLOG_TIC_IN_Q; + space -= bytes; + if (space < 0) { + space += log->l_logsize; + cycle--; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); } static void -xlog_grant_sub_space(struct log *log, int bytes) +xlog_grant_add_space( + struct log *log, + atomic64_t *head, + int bytes) { - log->l_grant_write_bytes -= bytes; - if (log->l_grant_write_bytes < 0) { - log->l_grant_write_bytes += log->l_logsize; - log->l_grant_write_cycle--; - } - - log->l_grant_reserve_bytes -= bytes; - if ((log)->l_grant_reserve_bytes < 0) { - log->l_grant_reserve_bytes += log->l_logsize; - log->l_grant_reserve_cycle--; - } + int64_t head_val = atomic64_read(head); + int64_t new, old; -} + do { + int tmp; + int cycle, space; -static void -xlog_grant_add_space_write(struct log *log, int bytes) -{ - int tmp = log->l_logsize - log->l_grant_write_bytes; - if (tmp > bytes) - log->l_grant_write_bytes += bytes; - else { - log->l_grant_write_cycle++; - log->l_grant_write_bytes = bytes - tmp; - } -} + xlog_crack_grant_head_val(head_val, &cycle, &space); -static void -xlog_grant_add_space_reserve(struct log *log, int bytes) -{ - int tmp = log->l_logsize - log->l_grant_reserve_bytes; - if (tmp > bytes) - log->l_grant_reserve_bytes += bytes; - else { - log->l_grant_reserve_cycle++; - log->l_grant_reserve_bytes = bytes - tmp; - } -} + tmp = log->l_logsize - space; + if (tmp > bytes) + space += bytes; + else { + space = bytes - tmp; + cycle++; + } -static inline void -xlog_grant_add_space(struct log *log, int bytes) -{ - xlog_grant_add_space_write(log, bytes); - xlog_grant_add_space_reserve(log, bytes); + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); } static void @@ -355,7 +330,7 @@ xfs_log_reserve( 
trace_xfs_log_reserve(log, internal_ticket); - xlog_grant_push_ail(mp, internal_ticket->t_unit_res); + xlog_grant_push_ail(log, internal_ticket->t_unit_res); retval = xlog_regrant_write_log_space(log, internal_ticket); } else { /* may sleep if need to allocate more tickets */ @@ -369,7 +344,7 @@ xfs_log_reserve( trace_xfs_log_reserve(log, internal_ticket); - xlog_grant_push_ail(mp, + xlog_grant_push_ail(log, (internal_ticket->t_unit_res * internal_ticket->t_cnt)); retval = xlog_grant_log_space(log, internal_ticket); @@ -402,7 +377,7 @@ xfs_log_mount( cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); else { cmn_err(CE_NOTE, - "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", + "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", mp->m_fsname); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (!(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY)) { if (!XLOG_FORCED_SHUTDOWN(log)) { - sv_wait(&iclog->ic_force_wait, PMEM, - &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); } else { spin_unlock(&log->l_icloglock); } @@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) || iclog->ic_state == XLOG_STATE_DIRTY || iclog->ic_state == XLOG_STATE_IOERROR) ) { - sv_wait(&iclog->ic_force_wait, PMEM, - &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); } else { spin_unlock(&log->l_icloglock); } @@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp, { xlog_ticket_t *tic; xlog_t *log = mp->m_log; - int need_bytes, free_bytes, cycle, bytes; + int need_bytes, free_bytes; if (XLOG_FORCED_SHUTDOWN(log)) return; - if (tail_lsn == 0) { - /* needed since sync_lsn is 64 bits */ - spin_lock(&log->l_icloglock); - tail_lsn = log->l_last_sync_lsn; - spin_unlock(&log->l_icloglock); - } - - spin_lock(&log->l_grant_lock); + if (tail_lsn == 0) + tail_lsn = 
atomic64_read(&log->l_last_sync_lsn); - /* Also an invalid lsn. 1 implies that we aren't passing in a valid - * tail_lsn. - */ - if (tail_lsn != 1) { - log->l_tail_lsn = tail_lsn; - } + /* tail_lsn == 1 implies that we weren't passed a valid value. */ + if (tail_lsn != 1) + atomic64_set(&log->l_tail_lsn, tail_lsn); - if ((tic = log->l_write_headq)) { + if (!list_empty_careful(&log->l_writeq)) { #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("Recovery problem"); #endif - cycle = log->l_grant_write_cycle; - bytes = log->l_grant_write_bytes; - free_bytes = xlog_space_left(log, cycle, bytes); - do { + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(tic, &log->l_writeq, t_queue) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < tic->t_unit_res && tail_lsn != 1) break; tail_lsn = 0; free_bytes -= tic->t_unit_res; - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_write_headq); + trace_xfs_log_regrant_write_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_write_lock); } - if ((tic = log->l_reserve_headq)) { + + if (!list_empty_careful(&log->l_reserveq)) { #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("Recovery problem"); #endif - cycle = log->l_grant_reserve_cycle; - bytes = log->l_grant_reserve_bytes; - free_bytes = xlog_space_left(log, cycle, bytes); - do { + spin_lock(&log->l_grant_reserve_lock); + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + list_for_each_entry(tic, &log->l_reserveq, t_queue) { if (tic->t_flags & XLOG_TIC_PERM_RESERV) need_bytes = tic->t_unit_res*tic->t_cnt; else @@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= need_bytes; - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_reserve_headq); + trace_xfs_log_grant_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_reserve_lock); } - 
spin_unlock(&log->l_grant_lock); -} /* xfs_log_move_tail */ +} /* * Determine if we have a transaction that has gone to disk @@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp) * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t -xlog_assign_tail_lsn(xfs_mount_t *mp) +xlog_assign_tail_lsn( + struct xfs_mount *mp) { - xfs_lsn_t tail_lsn; - xlog_t *log = mp->m_log; + xfs_lsn_t tail_lsn; + struct log *log = mp->m_log; tail_lsn = xfs_trans_ail_tail(mp->m_ail); - spin_lock(&log->l_grant_lock); - if (tail_lsn != 0) { - log->l_tail_lsn = tail_lsn; - } else { - tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn; - } - spin_unlock(&log->l_grant_lock); + if (!tail_lsn) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; -} /* xlog_assign_tail_lsn */ - +} /* * Return the space in the log between the tail and the head. The head @@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp) * result is that we return the size of the log as the amount of space left. 
*/ STATIC int -xlog_space_left(xlog_t *log, int cycle, int bytes) -{ - int free_bytes; - int tail_bytes; - int tail_cycle; - - tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); - tail_cycle = CYCLE_LSN(log->l_tail_lsn); - if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { - free_bytes = log->l_logsize - (bytes - tail_bytes); - } else if ((tail_cycle + 1) < cycle) { +xlog_space_left( + struct log *log, + atomic64_t *head) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + free_bytes = log->l_logsize - (head_bytes - tail_bytes); + else if (tail_cycle + 1 < head_cycle) return 0; - } else if (tail_cycle < cycle) { - ASSERT(tail_cycle == (cycle - 1)); - free_bytes = tail_bytes - bytes; + else if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + free_bytes = tail_bytes - head_bytes; } else { /* * The reservation head is behind the tail. 
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes) "xlog_space_left: head behind tail\n" " tail_cycle = %d, tail_bytes = %d\n" " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, cycle, bytes); + tail_cycle, tail_bytes, head_cycle, head_bytes); ASSERT(0); free_bytes = log->l_logsize; } return free_bytes; -} /* xlog_space_left */ +} /* @@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_flags |= XLOG_ACTIVE_RECOVERY; log->l_prev_block = -1; - log->l_tail_lsn = xlog_assign_lsn(1, 0); /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ - log->l_last_sync_lsn = log->l_tail_lsn; + xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - log->l_grant_reserve_cycle = 1; - log->l_grant_write_cycle = 1; + xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); + xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); + INIT_LIST_HEAD(&log->l_reserveq); + INIT_LIST_HEAD(&log->l_writeq); + spin_lock_init(&log->l_grant_reserve_lock); + spin_lock_init(&log->l_grant_write_lock); error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { @@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); - spin_lock_init(&log->l_grant_lock); - sv_init(&log->l_flush_wait, 0, "flush_wait"); + init_waitqueue_head(&log->l_flush_wait); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp, ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); - sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); - sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + init_waitqueue_head(&iclog->ic_force_wait); + init_waitqueue_head(&iclog->ic_write_wait); iclogp = &iclog->ic_next; } @@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t 
*mp, out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - if (iclog->ic_bp) { - sv_destroy(&iclog->ic_force_wait); - sv_destroy(&iclog->ic_write_wait); + if (iclog->ic_bp) xfs_buf_free(iclog->ic_bp); - } kmem_free(iclog); } spinlock_destroy(&log->l_icloglock); - spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); @@ -1223,61 +1189,60 @@ xlog_commit_record( * water mark. In this manner, we would be creating a low water mark. */ STATIC void -xlog_grant_push_ail(xfs_mount_t *mp, - int need_bytes) +xlog_grant_push_ail( + struct log *log, + int need_bytes) { - xlog_t *log = mp->m_log; /* pointer to the log */ - xfs_lsn_t tail_lsn; /* lsn of the log tail */ - xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ - int free_blocks; /* free blocks left to write to */ - int free_bytes; /* free bytes left to write to */ - int threshold_block; /* block in lsn we'd like to be at */ - int threshold_cycle; /* lsn cycle we'd like to be at */ - int free_threshold; - - ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - - spin_lock(&log->l_grant_lock); - free_bytes = xlog_space_left(log, - log->l_grant_reserve_cycle, - log->l_grant_reserve_bytes); - tail_lsn = log->l_tail_lsn; - free_blocks = BTOBBT(free_bytes); - - /* - * Set the threshold for the minimum number of free blocks in the - * log to the maximum of what the caller needs, one quarter of the - * log, and 256 blocks. 
- */ - free_threshold = BTOBB(need_bytes); - free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); - free_threshold = MAX(free_threshold, 256); - if (free_blocks < free_threshold) { - threshold_block = BLOCK_LSN(tail_lsn) + free_threshold; - threshold_cycle = CYCLE_LSN(tail_lsn); + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks >= free_threshold) + return; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; if (threshold_block >= log->l_logBBsize) { - threshold_block -= log->l_logBBsize; - threshold_cycle += 1; + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; } - threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; - /* Don't pass in an lsn greater than the lsn of the last - * log record known to be on disk. + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. 
No point in trying to do this if + * the filesystem is shutting down. */ - if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) - threshold_lsn = log->l_last_sync_lsn; - } - spin_unlock(&log->l_grant_lock); - - /* - * Get the transaction layer to kick the dirty buffers out to - * disk asynchronously. No point in trying to do this if - * the filesystem is shutting down. - */ - if (threshold_lsn && - !XLOG_FORCED_SHUTDOWN(log)) - xfs_trans_ail_push(log->l_ailp, threshold_lsn); -} /* xlog_grant_push_ail */ + if (!XLOG_FORCED_SHUTDOWN(log)) + xfs_trans_ail_push(log->l_ailp, threshold_lsn); +} /* * The bdstrat callback function for log bufs. This gives us a central @@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log, roundoff < BBTOB(1))); /* move grant heads by roundoff in sync */ - spin_lock(&log->l_grant_lock); - xlog_grant_add_space(log, roundoff); - spin_unlock(&log->l_grant_lock); + xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); + xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); @@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log) iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { - sv_destroy(&iclog->ic_force_wait); - sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; } spinlock_destroy(&log->l_icloglock); - spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); log->l_mp->m_log = NULL; @@ -2232,7 +2193,7 @@ xlog_state_do_callback( lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { iclog = iclog->ic_next; continue; /* Leave this iclog for * another thread */ @@ -2240,23 +2201,21 @@ xlog_state_do_callback( iclog->ic_state = XLOG_STATE_CALLBACK; - spin_unlock(&log->l_icloglock); - /* l_last_sync_lsn field protected by - * l_grant_lock. 
Don't worry about iclog's lsn. - * No one else can be here except us. + /* + * update the last_sync_lsn before we drop the + * icloglock to ensure we are the only one that + * can update it. */ - spin_lock(&log->l_grant_lock); - ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - log->l_last_sync_lsn = - be64_to_cpu(iclog->ic_header.h_lsn); - spin_unlock(&log->l_grant_lock); + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); - } else { - spin_unlock(&log->l_icloglock); + } else ioerrors++; - } + + spin_unlock(&log->l_icloglock); /* * Keep processing entries in the callback list until @@ -2297,7 +2256,7 @@ xlog_state_do_callback( xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ - sv_broadcast(&iclog->ic_force_wait); + wake_up_all(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2344,7 +2303,7 @@ xlog_state_do_callback( spin_unlock(&log->l_icloglock); if (wake) - sv_broadcast(&log->l_flush_wait); + wake_up_all(&log->l_flush_wait); } @@ -2395,7 +2354,7 @@ xlog_state_done_syncing( * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ - sv_broadcast(&iclog->ic_write_wait); + wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ @@ -2444,7 +2403,7 @@ restart: XFS_STATS_INC(xs_log_noiclogs); /* Wait for log writes to have flushed */ - sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); + xlog_wait(&log->l_flush_wait, &log->l_icloglock); goto restart; } @@ -2527,6 +2486,18 @@ restart: * * Once a ticket gets put onto the reserveq, it will only return after * the needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. 
This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off the reserveq under the + * l_grant_reserve_lock, we only need to take that lock if we are going + * to add the ticket to the queue and sleep. We can avoid taking the lock if the + * ticket was never added to the reserveq because the t_queue list head will be + * empty and we hold the only reference to it so it can safely be checked + * unlocked. */ STATIC int xlog_grant_log_space(xlog_t *log, @@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log, { int free_bytes; int need_bytes; -#ifdef DEBUG - xfs_lsn_t tail_lsn; -#endif - #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("grant Recovery problem"); #endif - /* Is there space or do we need to sleep? */ - spin_lock(&log->l_grant_lock); - trace_xfs_log_grant_enter(log, tic); + need_bytes = tic->t_unit_res; + if (tic->t_flags & XFS_LOG_PERM_RESERV) + need_bytes *= tic->t_ocnt; + /* something is already sleeping; insert new transaction at end */ - if (log->l_reserve_headq) { - xlog_ins_ticketq(&log->l_reserve_headq, tic); + if (!list_empty_careful(&log->l_reserveq)) { + spin_lock(&log->l_grant_reserve_lock); + /* recheck the queue now we are locked */ + if (list_empty(&log->l_reserveq)) { + spin_unlock(&log->l_grant_reserve_lock); + goto redo; + } + list_add_tail(&tic->t_queue, &log->l_reserveq); trace_xfs_log_grant_sleep1(log, tic); @@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log, goto error_return; XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + /* * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... 
*/ trace_xfs_log_grant_wake1(log, tic); - spin_lock(&log->l_grant_lock); } - if (tic->t_flags & XFS_LOG_PERM_RESERV) - need_bytes = tic->t_unit_res*tic->t_ocnt; - else - need_bytes = tic->t_unit_res; redo: if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; - free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, - log->l_grant_reserve_bytes); + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); if (free_bytes < need_bytes) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_reserve_headq, tic); + spin_lock(&log->l_grant_reserve_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_reserveq); trace_xfs_log_grant_sleep2(log, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); - - XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); - - spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - trace_xfs_log_grant_wake2(log, tic); + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + trace_xfs_log_grant_wake2(log, tic); goto redo; - } else if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); + } - /* we've got enough space */ - xlog_grant_add_space(log, need_bytes); -#ifdef DEBUG - tail_lsn = log->l_tail_lsn; - /* - * Check to make sure the grant write head didn't just over lap the - * tail. If the cycles are the same, we can't be overlapping. - * Otherwise, make sure that the cycles differ by exactly one and - * check the byte count. 
- */ - if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { - ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); - ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_reserve_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); } -#endif + + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_grant_exit(log, tic); - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); + xlog_verify_grant_tail(log); return 0; - error_return: - if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - +error_return_unlocked: + spin_lock(&log->l_grant_reserve_lock); +error_return: + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); trace_xfs_log_grant_error(log, tic); /* @@ -2638,7 +2597,6 @@ redo: */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_grant_log_space */ @@ -2646,17 +2604,14 @@ redo: /* * Replenish the byte reservation required by moving the grant write head. * - * + * Similar to xlog_grant_log_space, the function is structured to have a lock + * free fast path. */ STATIC int xlog_regrant_write_log_space(xlog_t *log, xlog_ticket_t *tic) { int free_bytes, need_bytes; - xlog_ticket_t *ntic; -#ifdef DEBUG - xfs_lsn_t tail_lsn; -#endif tic->t_curr_res = tic->t_unit_res; xlog_tic_reset_res(tic); @@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log, panic("regrant Recovery problem"); #endif - spin_lock(&log->l_grant_lock); - trace_xfs_log_regrant_write_enter(log, tic); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; /* If there are other waiters on the queue then give them a * chance at logspace before us. 
Wake up the first waiters, @@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log, * this transaction. */ need_bytes = tic->t_unit_res; - if ((ntic = log->l_write_headq)) { - free_bytes = xlog_space_left(log, log->l_grant_write_cycle, - log->l_grant_write_bytes); - do { + if (!list_empty_careful(&log->l_writeq)) { + struct xlog_ticket *ntic; + + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(ntic, &log->l_writeq, t_queue) { ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < ntic->t_unit_res) break; free_bytes -= ntic->t_unit_res; - sv_signal(&ntic->t_wait); - ntic = ntic->t_next; - } while (ntic != log->l_write_headq); - - if (ntic != log->l_write_headq) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_write_headq, tic); + wake_up(&ntic->t_wait); + } + if (ntic != list_first_entry(&log->l_writeq, + struct xlog_ticket, t_queue)) { + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); trace_xfs_log_regrant_write_sleep1(log, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); + xlog_grant_push_ail(log, need_bytes); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, - &log->l_grant_lock, s); - - /* If we're shutting down, this tic is already - * off the queue */ - spin_lock(&log->l_grant_lock); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); trace_xfs_log_regrant_write_wake1(log, tic); - } + } else + spin_unlock(&log->l_grant_write_lock); } redo: if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; - free_bytes = xlog_space_left(log, log->l_grant_write_cycle, - log->l_grant_write_bytes); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); if (free_bytes < need_bytes) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - 
xlog_ins_ticketq(&log->l_write_headq, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); - - XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_regrant_write_sleep2(log, tic); - - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); + spin_lock(&log->l_grant_write_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); - /* If we're shutting down, this tic is already off the queue */ - spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_regrant_write_sleep2(log, tic); + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + trace_xfs_log_regrant_write_wake2(log, tic); goto redo; - } else if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_write_headq, tic); + } - /* we've got enough space */ - xlog_grant_add_space_write(log, need_bytes); -#ifdef DEBUG - tail_lsn = log->l_tail_lsn; - if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { - ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); - ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_write_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); } -#endif + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_regrant_write_exit(log, tic); - - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); + xlog_verify_grant_tail(log); return 0; + error_return_unlocked: + spin_lock(&log->l_grant_write_lock); error_return: - if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); trace_xfs_log_regrant_write_error(log, tic); /* @@ -2778,7 +2714,6 @@ redo: */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give 
back unit_res * t_cnt. */ - spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_regrant_write_log_space */ @@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log, if (ticket->t_cnt > 0) ticket->t_cnt--; - spin_lock(&log->l_grant_lock); - xlog_grant_sub_space(log, ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_grant_reserve_head, + ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_grant_write_head, + ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); trace_xfs_log_regrant_reserve_sub(log, ticket); - xlog_verify_grant_head(log, 1); - /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) { - spin_unlock(&log->l_grant_lock); + if (ticket->t_cnt > 0) return; - } - xlog_grant_add_space_reserve(log, ticket->t_unit_res); + xlog_grant_add_space(log, &log->l_grant_reserve_head, + ticket->t_unit_res); trace_xfs_log_regrant_reserve_exit(log, ticket); - xlog_verify_grant_head(log, 0); - spin_unlock(&log->l_grant_lock); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); } /* xlog_regrant_reserve_log_space */ @@ -2843,28 +2775,29 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket) { + int bytes; + if (ticket->t_cnt > 0) ticket->t_cnt--; - spin_lock(&log->l_grant_lock); trace_xfs_log_ungrant_enter(log, ticket); - - xlog_grant_sub_space(log, ticket->t_curr_res); - trace_xfs_log_ungrant_sub(log, ticket); - /* If this is a permanent reservation ticket, we may be able to free + /* + * If this is a permanent reservation ticket, we may be able to free * up more space based on the remaining count. 
*/ + bytes = ticket->t_curr_res; if (ticket->t_cnt > 0) { ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); - xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); + bytes += ticket->t_unit_res*ticket->t_cnt; } + xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); + xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); + trace_xfs_log_ungrant_exit(log, ticket); - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); xfs_log_move_tail(log->l_mp, 1); } /* xlog_ungrant_log_space */ @@ -2901,11 +2834,11 @@ xlog_state_release_iclog( if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { /* update tail before writing to iclog */ - xlog_assign_tail_lsn(log->l_mp); + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); sync++; iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); - xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); /* cycle incremented when incrementing curr_block */ } spin_unlock(&log->l_icloglock); @@ -3088,7 +3021,7 @@ maybe_sleep: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3206,8 +3139,8 @@ try_again: XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_write_wait, - PSWP, &log->l_icloglock, s); + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); if (log_flushed) *log_flushed = 1; already_slept = 1; @@ -3235,7 +3168,7 @@ try_again: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3310,10 +3243,8 @@ 
xfs_log_ticket_put( xlog_ticket_t *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); - if (atomic_dec_and_test(&ticket->t_ref)) { - sv_destroy(&ticket->t_wait); + if (atomic_dec_and_test(&ticket->t_ref)) kmem_zone_free(xfs_log_ticket_zone, ticket); - } } xlog_ticket_t * @@ -3435,6 +3366,7 @@ xlog_ticket_alloc( } atomic_set(&tic->t_ref, 1); + INIT_LIST_HEAD(&tic->t_queue); tic->t_unit_res = unit_bytes; tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; @@ -3445,7 +3377,7 @@ xlog_ticket_alloc( tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); + init_waitqueue_head(&tic->t_wait); xlog_tic_reset_res(tic); @@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr( } STATIC void -xlog_verify_grant_head(xlog_t *log, int equals) +xlog_verify_grant_tail( + struct log *log) { - if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { - if (equals) - ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); - else - ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); - } else { - ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); - ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); - } -} /* xlog_verify_grant_head */ + int tail_cycle, tail_blocks; + int cycle, space; + + /* + * Check to make sure the grant write head didn't just over lap the + * tail. If the cycles are the same, we can't be overlapping. + * Otherwise, make sure that the cycles differ by exactly one and + * check the byte count. 
+ */ + xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); + if (tail_cycle != cycle) { + ASSERT(cycle - 1 == tail_cycle); + ASSERT(space <= BBTOB(tail_blocks)); + } +} /* check if it will fit */ STATIC void @@ -3716,12 +3655,10 @@ xfs_log_force_umount( xlog_cil_force(log); /* - * We must hold both the GRANT lock and the LOG lock, - * before we mark the filesystem SHUTDOWN and wake - * everybody up to tell the bad news. + * mark the filesystem and the log as in a shutdown state and wake + * everybody up to tell them the bad news. */ spin_lock(&log->l_icloglock); - spin_lock(&log->l_grant_lock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) XFS_BUF_DONE(mp->m_sb_bp); @@ -3742,27 +3679,21 @@ xfs_log_force_umount( spin_unlock(&log->l_icloglock); /* - * We don't want anybody waiting for log reservations - * after this. That means we have to wake up everybody - * queued up on reserve_headq as well as write_headq. - * In addition, we make sure in xlog_{re}grant_log_space - * that we don't enqueue anything once the SHUTDOWN flag - * is set, and this action is protected by the GRANTLOCK. + * We don't want anybody waiting for log reservations after this. That + * means we have to wake up everybody queued up on reserveq as well as + * writeq. In addition, we make sure in xlog_{re}grant_log_space that + * we don't enqueue anything once the SHUTDOWN flag is set, and this + * action is protected by the grant locks.
*/ - if ((tic = log->l_reserve_headq)) { - do { - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_reserve_headq); - } - - if ((tic = log->l_write_headq)) { - do { - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_write_headq); - } - spin_unlock(&log->l_grant_lock); + spin_lock(&log->l_grant_reserve_lock); + list_for_each_entry(tic, &log->l_reserveq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_reserve_lock); + + spin_lock(&log->l_grant_write_lock); + list_for_each_entry(tic, &log->l_writeq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_write_lock); if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 23d6ceb5e97..9dc8125d04e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -61,7 +61,7 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); init_rwsem(&cil->xc_ctx_lock); - sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); + init_waitqueue_head(&cil->xc_commit_wait); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); @@ -361,15 +361,10 @@ xlog_cil_committed( int abort) { struct xfs_cil_ctx *ctx = args; - struct xfs_log_vec *lv; - int abortflag = abort ? XFS_LI_ABORTED : 0; struct xfs_busy_extent *busyp, *n; - /* unpin all the log items */ - for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { - xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, - abortflag); - } + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + ctx->start_lsn, abort); list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); @@ -568,7 +563,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. 
*/ - sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); goto restart; } } @@ -592,7 +587,7 @@ restart: */ spin_lock(&cil->xc_cil_lock); ctx->commit_lsn = commit_lsn; - sv_broadcast(&cil->xc_commit_wait); + wake_up_all(&cil->xc_commit_wait); spin_unlock(&cil->xc_cil_lock); /* release the hounds! */ @@ -757,7 +752,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); goto restart; } if (ctx->sequence != sequence) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index edcdfe01617..d5f8be8f4bf 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -21,7 +21,6 @@ struct xfs_buf; struct log; struct xlog_ticket; -struct xfs_buf_cancel; struct xfs_mount; /* @@ -54,7 +53,6 @@ struct xfs_mount; BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 
\ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) - static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) { return ((xfs_lsn_t)cycle << 32) | block; @@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i) */ #define XLOG_TIC_INITED 0x1 /* has been initialized */ #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ -#define XLOG_TIC_IN_Q 0x4 #define XLOG_TIC_FLAGS \ { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ - { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ - { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" } + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } #endif /* __KERNEL__ */ @@ -244,9 +240,8 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - sv_t t_wait; /* ticket wait queue : 20 */ - struct xlog_ticket *t_next; /* :4|8 */ - struct xlog_ticket *t_prev; /* :4|8 */ + wait_queue_head_t t_wait; /* ticket wait queue */ + struct list_head t_queue; /* reserve/write queue */ xlog_tid_t t_tid; /* transaction identifier : 4 */ atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ @@ -353,8 +348,8 @@ typedef union xlog_in_core2 { * and move everything else out to subsequent cachelines. 
*/ typedef struct xlog_in_core { - sv_t ic_force_wait; - sv_t ic_write_wait; + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; @@ -421,7 +416,7 @@ struct xfs_cil { struct xfs_cil_ctx *xc_ctx; struct rw_semaphore xc_ctx_lock; struct list_head xc_committing; - sv_t xc_commit_wait; + wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; }; @@ -491,7 +486,7 @@ typedef struct log { struct xfs_buftarg *l_targ; /* buftarg of log */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ - struct xfs_buf_cancel **l_buf_cancel_table; + struct list_head *l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ @@ -503,29 +498,40 @@ typedef struct log { int l_logBBsize; /* size of log in BB chunks */ /* The following block of fields are changed while holding icloglock */ - sv_t l_flush_wait ____cacheline_aligned_in_smp; + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ xlog_in_core_t *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ - xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed - * buffers */ - xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ - /* The following block of fields are changed while holding grant_lock */ - spinlock_t l_grant_lock ____cacheline_aligned_in_smp; - xlog_ticket_t *l_reserve_headq; - xlog_ticket_t *l_write_headq; - int l_grant_reserve_cycle; - int l_grant_reserve_bytes; - int l_grant_write_cycle; - int l_grant_write_bytes; + /* + * l_last_sync_lsn 
and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + /* + * ticket grant locks, queues and accounting have their own cachelines + * as these are quite hot and can be operated on concurrently. + */ + spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; + struct list_head l_reserveq; + atomic64_t l_grant_reserve_head; + + spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; + struct list_head l_writeq; + atomic64_t l_grant_write_head; /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG @@ -534,6 +540,9 @@ typedef struct log { } xlog_t; +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ @@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector, xlog_in_core_t **commit_iclog, uint flags); /* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* * Committed Item List interfaces */ int xlog_cil_init(struct log *log); @@ -585,6 +649,21 @@ xlog_cil_force(struct log *log) */ #define XLOG_UNMOUNT_REC_TYPE (-1U) +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} #endif /* __KERNEL__ */ #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 966d3f97458..aa0ebb77690 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *); #endif /* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. 
+ */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +/* * Sector aligned buffer routines for buffer create/read/write/access */ @@ -925,12 +936,12 @@ xlog_find_tail( log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (found == 2) log->l_curr_cycle++; - log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); - log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); - log->l_grant_reserve_cycle = log->l_curr_cycle; - log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); - log->l_grant_write_cycle = log->l_curr_cycle; - log->l_grant_write_bytes = BBTOB(log->l_curr_block); + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); /* * Look for unmount record. If we find it, then we know there @@ -960,7 +971,7 @@ xlog_find_tail( } after_umount_blk = (i + hblks + (int) BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; - tail_lsn = log->l_tail_lsn; + tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; @@ -975,12 +986,10 @@ xlog_find_tail( * log records will point recovery to after the * current unmount record. */ - log->l_tail_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); - log->l_last_sync_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); *tail_blk = after_umount_blk; /* @@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans( * record in the table to tell us how many times we expect to see this * record during the second pass. 
*/ -STATIC void -xlog_recover_do_buffer_pass1( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +STATIC int +xlog_recover_buffer_pass1( + struct log *log, + xlog_recover_item_t *item) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *nextp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; - xfs_daddr_t blkno = 0; - uint len = 0; - ushort flags = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - } + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLF_CANCEL)) { + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_buf_not_cancel(log, buf_f); - return; - } - - /* - * Insert an xfs_buf_cancel record into the hash table of - * them. If there is already an identical record, bump - * its reference count. - */ - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - /* - * If the hash bucket is empty then just insert a new record into - * the bucket. - */ - if (*bucket == NULL) { - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; - bcp->bc_refcount = 1; - bcp->bc_next = NULL; - *bucket = bcp; - return; + return 0; } /* - * The hash bucket is not empty, so search for duplicates of our - * record. If we find one them just bump its refcount. If not - * then add us at the end of the list. + * Insert an xfs_buf_cancel record into the hash table of them. + * If there is already an identical record, bump its reference count. 
*/ - prevp = NULL; - nextp = *bucket; - while (nextp != NULL) { - if (nextp->bc_blkno == blkno && nextp->bc_len == len) { - nextp->bc_refcount++; + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); - return; + return 0; } - prevp = nextp; - nextp = nextp->bc_next; - } - ASSERT(prevp != NULL); - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; - bcp->bc_next = NULL; - prevp->bc_next = bcp; + list_add_tail(&bcp->bc_list, bucket); + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; } /* @@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1( */ STATIC int xlog_check_buffer_cancelled( - xlog_t *log, + struct log *log, xfs_daddr_t blkno, uint len, ushort flags) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; if (log->l_buf_cancel_table == NULL) { /* @@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled( return 0; } - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - bcp = *bucket; - if (bcp == NULL) { - /* - * There is no corresponding entry in the table built - * in pass one, so this buffer has not been cancelled. - */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; - } - /* - * Search for an entry in the buffer cancel table that - * matches our buffer. + * Search for an entry in the cancel table that matches our buffer. */ - prevp = NULL; - while (bcp != NULL) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) { - /* - * We've go a match, so return 1 so that the - * recovery of this buffer is cancelled. 
- * If this buffer is actually a buffer cancel - * log item, then decrement the refcount on the - * one in the table and remove it if this is the - * last reference. - */ - if (flags & XFS_BLF_CANCEL) { - bcp->bc_refcount--; - if (bcp->bc_refcount == 0) { - if (prevp == NULL) { - *bucket = bcp->bc_next; - } else { - prevp->bc_next = bcp->bc_next; - } - kmem_free(bcp); - } - } - return 1; - } - prevp = bcp; - bcp = bcp->bc_next; + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + goto found; } + /* - * We didn't find a corresponding entry in the table, so - * return 0 so that the buffer is NOT cancelled. + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; -} -STATIC int -xlog_recover_do_buffer_pass2( - xlog_t *log, - xfs_buf_log_format_t *buf_f) -{ - xfs_daddr_t blkno = 0; - ushort flags = 0; - uint len = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - flags = buf_f->blf_flags; - len = buf_f->blf_len; - break; +found: + /* + * We've got a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } } - - return xlog_check_buffer_cancelled(log, blkno, len, flags); + return 1; } /* - * Perform recovery for a buffer full of inodes. In these buffers, - * the only data which should be recovered is that which corresponds - * to the di_next_unlinked pointers in the on disk inode structures. 
- * The rest of the data for the inodes is always logged through the - * inodes themselves rather than the inode buffer and is recovered - * in xlog_recover_do_inode_trans(). + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). * - * The only time when buffers full of inodes are fully recovered is - * when the buffer is full of newly allocated inodes. In this case - * the buffer will not be marked as an inode buffer and so will be - * sent to xlog_recover_do_reg_buffer() below during recovery. + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. */ STATIC int xlog_recover_do_inode_buffer( - xfs_mount_t *mp, + struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; - int item_index; - int bit; - int nbits; - int reg_buf_offset; - int reg_buf_bytes; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; int next_unlinked_offset; int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; - unsigned int *data_map = NULL; - unsigned int map_size = 0; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } - /* - * Set the variables corresponding to the current region to - * 0 so that we'll initialize them on the first pass through - * the loop. 
- */ - reg_buf_offset = 0; - reg_buf_bytes = 0; - bit = 0; - nbits = 0; - item_index = 0; inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + @@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer( * the current di_next_unlinked field. */ bit += nbits; - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); /* * If there are no more logged regions in the * buffer, then we're done. */ - if (bit == -1) { + if (bit == -1) return 0; - } - nbits = xfs_contig_bits(data_map, map_size, - bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); reg_buf_offset = bit << XFS_BLF_SHIFT; reg_buf_bytes = nbits << XFS_BLF_SHIFT; @@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer( * di_next_unlinked field, then move on to the next * di_next_unlinked field. */ - if (next_unlinked_offset < reg_buf_offset) { + if (next_unlinked_offset < reg_buf_offset) continue; - } ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); @@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer( * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. 
*/ -/*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; int bit; int nbits; - unsigned int *data_map = NULL; - unsigned int map_size = 0; int error; trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } bit = 0; i = 1; /* 0 is the buf format structure */ while (1) { - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); if (bit == -1) break; - nbits = xfs_contig_bits(data_map, map_size, bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); @@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer( * for more details on the implementation of the table of cancel records. */ STATIC int -xlog_recover_do_buffer_trans( +xlog_recover_buffer_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; - int cancel; - xfs_daddr_t blkno; - int len; - ushort flags; uint buf_flags; - if (pass == XLOG_RECOVER_PASS1) { - /* - * In this pass we're only looking for buf items - * with the XFS_BLF_CANCEL bit set. - */ - xlog_recover_do_buffer_pass1(log, buf_f); + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. 
+ */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; - } else { - /* - * In this pass we want to recover all the buffers - * which have not been cancelled and are not - * cancellation buffers themselves. The routine - * we call here will tell us whether or not to - * continue with the replay of this buffer. - */ - cancel = xlog_recover_do_buffer_pass2(log, buf_f); - if (cancel) { - trace_xfs_log_recover_buf_cancel(log, buf_f); - return 0; - } } + trace_xfs_log_recover_buf_recover(log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - default: - xfs_fs_cmn_err(CE_ALERT, log->l_mp, - "xfs_log_recover: unknown buffer type 0x%x, logdev %s", - buf_f->blf_type, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); - XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", - XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); - } - mp = log->l_mp; buf_flags = XBF_LOCK; - if (!(flags & XFS_BLF_INODE_BUF)) + if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) buf_flags |= XBF_MAPPED; - bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags); if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, - bp, blkno); + xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, + bp, buf_f->blf_blkno); error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); return error; } error = 0; - if (flags & XFS_BLF_INODE_BUF) { + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - } else if (flags & + } else if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { @@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans( } STATIC int 
-xlog_recover_do_inode_trans( +xlog_recover_inode_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_inode_log_format_t *in_f; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; xfs_dinode_t *dip; - xfs_ino_t ino; int len; xfs_caddr_t src; xfs_caddr_t dest; @@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans( xfs_icdinode_t *dicp; int need_free = 0; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { in_f = item->ri_buf[0].i_addr; } else { @@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans( if (error) goto error; } - ino = in_f->ilf_ino; - mp = log->l_mp; /* * Inode buffers can be freed, look out for it, @@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans( xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", - dip, bp, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", + dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; @@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans( xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", - item, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", + item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; @@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans( if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, 
bp, ino); + item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; goto error; } @@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans( if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; goto error; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", - item, dip, bp, ino, + item, dip, bp, in_f->ilf_ino, dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; goto error; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", - item, dip, bp, ino, dicp->di_forkoff); + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; goto error; } if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, @@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans( break; default: - 
xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); + xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag"); ASSERT(0); xfs_buf_relse(bp); error = EIO; @@ -2556,18 +2422,11 @@ error: * of that type. */ STATIC int -xlog_recover_do_quotaoff_trans( +xlog_recover_quotaoff_pass1( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { - xfs_qoff_logformat_t *qoff_f; - - if (pass == XLOG_RECOVER_PASS2) { - return (0); - } - - qoff_f = item->ri_buf[0].i_addr; + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans( * Recover a dquot record */ STATIC int -xlog_recover_do_dquot_trans( +xlog_recover_dquot_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; int error; xfs_dq_logformat_t *dq_f; uint type; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - mp = log->l_mp; /* * Filesystems are required to send in quota flags at mount time. @@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans( if ((error = xfs_qm_dqcheck(recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans (log copy)"))) { + "xlog_recover_dquot_pass2 (log copy)"))) { return XFS_ERROR(EIO); } ASSERT(dq_f->qlf_len == 1); @@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans( * minimal initialization then. */ if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans")) { + "xlog_recover_dquot_pass2")) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } @@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans( * LSN. 
*/ STATIC int -xlog_recover_do_efi_trans( +xlog_recover_efi_pass2( xlog_t *log, xlog_recover_item_t *item, - xfs_lsn_t lsn, - int pass) + xfs_lsn_t lsn) { int error; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - efi_formatp = item->ri_buf[0].i_addr; - mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), &(efip->efi_format)))) { xfs_efi_item_free(efip); return error; } - efip->efi_next_extent = efi_formatp->efi_nextents; - efip->efi_flags |= XFS_EFI_COMMITTED; + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); spin_lock(&log->l_ailp->xa_lock); /* * xfs_trans_ail_update() drops the AIL lock. */ - xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); return 0; } @@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans( * efd format structure. If we find it, we remove the efi from the * AIL and free it. */ -STATIC void -xlog_recover_do_efd_trans( +STATIC int +xlog_recover_efd_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans( struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; - if (pass == XLOG_RECOVER_PASS1) { - return; - } - efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || @@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans( } xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); -} - -/* - * Perform the transaction - * - * If the transaction modifies a buffer or inode, do it now. Otherwise, - * EFIs and EFDs get queued up by adding entries into the AIL for them. 
- */ -STATIC int -xlog_recover_do_trans( - xlog_t *log, - xlog_recover_t *trans, - int pass) -{ - int error = 0; - xlog_recover_item_t *item; - - error = xlog_recover_reorder_trans(log, trans, pass); - if (error) - return error; - - list_for_each_entry(item, &trans->r_itemq, ri_list) { - trace_xfs_log_recover_item_recover(log, trans, item, pass); - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - error = xlog_recover_do_buffer_trans(log, item, pass); - break; - case XFS_LI_INODE: - error = xlog_recover_do_inode_trans(log, item, pass); - break; - case XFS_LI_EFI: - error = xlog_recover_do_efi_trans(log, item, - trans->r_lsn, pass); - break; - case XFS_LI_EFD: - xlog_recover_do_efd_trans(log, item, pass); - error = 0; - break; - case XFS_LI_DQUOT: - error = xlog_recover_do_dquot_trans(log, item, pass); - break; - case XFS_LI_QUOTAOFF: - error = xlog_recover_do_quotaoff_trans(log, item, - pass); - break; - default: - xlog_warn( - "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item)); - ASSERT(0); - error = XFS_ERROR(EIO); - break; - } - - if (error) - return error; - } return 0; } @@ -2852,7 +2638,7 @@ xlog_recover_do_trans( */ STATIC void xlog_recover_free_trans( - xlog_recover_t *trans) + struct xlog_recover *trans) { xlog_recover_item_t *item, *n; int i; @@ -2871,17 +2657,95 @@ xlog_recover_free_trans( } STATIC int +xlog_recover_commit_pass1( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + /* nothing to do in pass 1 */ + return 0; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_commit_pass1", + ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int 
+xlog_recover_commit_pass2( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, item); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, item); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_commit_pass2", + ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. + */ +STATIC int xlog_recover_commit_trans( - xlog_t *log, - xlog_recover_t *trans, + struct log *log, + struct xlog_recover *trans, int pass) { - int error; + int error = 0; + xlog_recover_item_t *item; hlist_del(&trans->r_list); - if ((error = xlog_recover_do_trans(log, trans, pass))) + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) return error; - xlog_recover_free_trans(trans); /* no error */ + + list_for_each_entry(item, &trans->r_itemq, ri_list) { + if (pass == XLOG_RECOVER_PASS1) + error = xlog_recover_commit_pass1(log, trans, item); + else + error = xlog_recover_commit_pass2(log, trans, item); + if (error) + return error; + } + + xlog_recover_free_trans(trans); return 0; } @@ -3011,7 +2875,7 @@ xlog_recover_process_efi( xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); /* * First check the validity of the extents described by the @@ -3050,7 +2914,7 @@ xlog_recover_process_efi( extp->ext_len); 
} - efip->efi_flags |= XFS_EFI_RECOVERED; + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp, 0); return error; @@ -3107,7 +2971,7 @@ xlog_recover_process_efis( * Skip EFIs that we've already processed. */ efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_flags & XFS_EFI_RECOVERED) { + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } @@ -3724,7 +3588,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error; + int error, i; ASSERT(head_blk != tail_blk); @@ -3732,10 +3596,12 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = - (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(xfs_buf_cancel_t*), + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), KM_SLEEP); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1); if (error != 0) { @@ -3754,7 +3620,7 @@ xlog_do_log_recovery( int i; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(log->l_buf_cancel_table[i] == NULL); + ASSERT(list_empty(&log->l_buf_cancel_table[i])); } #endif /* DEBUG */ @@ -3934,7 +3800,7 @@ xlog_recover_finish( log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", + "Ending clean XFS mount for filesystem: %s\n", log->l_mp->m_fsname); } return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 19e9dfa1c25..d447aef84bc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -472,7 +472,7 @@ xfs_initialize_perag( goto out_unwind; pag->pag_agno = index; pag->pag_mount = mp; - rwlock_init(&pag->pag_ici_lock); + spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, 
GFP_ATOMIC); spin_lock_init(&pag->pag_buf_lock); @@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp) } /* + * precalculate the low space thresholds for dynamic speculative preallocation. + */ +void +xfs_set_low_space_thresholds( + struct xfs_mount *mp) +{ + int i; + + for (i = 0; i < XFS_LOWSP_MAX; i++) { + __uint64_t space = mp->m_sb.sb_dblocks; + + do_div(space, 100); + mp->m_low_space[i] = space * (i + 1); + } +} + + +/* * Set whether we're using inode alignment. */ STATIC void @@ -1196,6 +1214,9 @@ xfs_mountfs( */ xfs_set_rw_sizes(mp); + /* set the low space thresholds for dynamic preallocation */ + xfs_set_low_space_thresholds(mp); + /* * Set the inode cluster size. * This may still be overridden by the file system diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 5861b498074..a62e8971539 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, xfs_mod_incore_sb(mp, field, delta, rsvd) #endif +/* dynamic preallocation free space thresholds, 5% down to 1% */ +enum { + XFS_LOWSP_1_PCNT = 0, + XFS_LOWSP_2_PCNT, + XFS_LOWSP_3_PCNT, + XFS_LOWSP_4_PCNT, + XFS_LOWSP_5_PCNT, + XFS_LOWSP_MAX, +}; + typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -202,6 +212,8 @@ typedef struct xfs_mount { __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ + int64_t m_low_space[XFS_LOWSP_MAX]; + /* low free space thresholds */ } xfs_mount_t; /* @@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); +extern void xfs_set_low_space_thresholds(struct xfs_mount *); + #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 45ce15dc5b2..edfa178bafb 100644 --- 
a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -408,7 +408,7 @@ xfs_mru_cache_flush( spin_lock(&mru->lock); if (mru->queued) { spin_unlock(&mru->lock); - cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); + cancel_delayed_work_sync(&mru->work); spin_lock(&mru->lock); } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f6d956b7711..33dbc4e0ad6 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1137,7 +1137,7 @@ out_undo_fdblocks: if (blkdelta) xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); out: - ASSERT(error = 0); + ASSERT(error == 0); return; } @@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs( * they could be immediately flushed and we'd have to race with the flusher * trying to pull the item from the AIL as we add it. */ -void +static void xfs_trans_item_committed( struct xfs_log_item *lip, xfs_lsn_t commit_lsn, @@ -1425,6 +1425,83 @@ xfs_trans_committed( xfs_trans_free(tp); } +static inline void +xfs_log_item_batch_insert( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t commit_lsn) +{ + int i; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn); + + for (i = 0; i < nr_items; i++) + IOP_UNPIN(log_items[i], 0); +} + +/* + * Bulk operation version of xfs_trans_committed that takes a log vector of + * items to insert into the AIL. This uses bulk AIL insertion techniques to + * minimise lock traffic. 
+ */ +void +xfs_trans_committed_bulk( + struct xfs_ail *ailp, + struct xfs_log_vec *log_vector, + xfs_lsn_t commit_lsn, + int aborted) +{ +#define LOG_ITEM_BATCH_SIZE 32 + struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; + struct xfs_log_vec *lv; + int i = 0; + + /* unpin all the log items */ + for (lv = log_vector; lv; lv = lv->lv_next ) { + struct xfs_log_item *lip = lv->lv_item; + xfs_lsn_t item_lsn; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* item_lsn of -1 means the item was freed */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + continue; + + if (item_lsn != commit_lsn) { + + /* + * Not a bulk update option due to unusual item_lsn. + * Push into AIL immediately, rechecking the lsn once + * we have the ail lock. Then unpin the item. + */ + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) + xfs_trans_ail_update(ailp, lip, item_lsn); + else + spin_unlock(&ailp->xa_lock); + IOP_UNPIN(lip, 0); + continue; + } + + /* Item is a candidate for bulk AIL insert. */ + log_items[i++] = lv->lv_item; + if (i >= LOG_ITEM_BATCH_SIZE) { + xfs_log_item_batch_insert(ailp, log_items, + LOG_ITEM_BATCH_SIZE, commit_lsn); + i = 0; + } + } + + /* make sure we insert the remainder! */ + if (i) + xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn); +} + /* * Called from the trans_commit code when we notice that * the filesystem is in the middle of a forced shutdown. 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 246286b77a8..c2042b736b8 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -294,8 +294,8 @@ struct xfs_log_item_desc { #define XFS_ALLOC_BTREE_REF 2 #define XFS_BMAP_BTREE_REF 2 #define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 -#define XFS_INO_REF 1 #define XFS_DQUOT_REF 1 #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index dc9069568ff..c5bbbc45db9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -28,8 +28,8 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); +STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t); +STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); @@ -449,129 +449,152 @@ xfs_trans_unlocked_item( xfs_log_move_tail(ailp->xa_mount, 1); } /* xfs_trans_unlocked_item */ - /* - * Update the position of the item in the AIL with the new - * lsn. If it is not yet in the AIL, add it. Otherwise, move - * it to its new position by removing it and re-adding it. + * xfs_trans_ail_update_bulk - bulk AIL insertion operation. + * + * @xfs_trans_ail_update_bulk takes an array of log items that all need to be + * positioned at the same LSN in the AIL. If an item is not in the AIL, it will + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. If we move the first item in the AIL, update the log tail to + * match the new minimum LSN in the AIL. * - * Wakeup anyone with an lsn less than the item's lsn. If the item - * we move in the AIL is the minimum one, update the tail lsn in the - * log manager.
+ * The update operations on all the items in the array are executed under a + * single hold of the AIL lock, which the caller must take before calling this + * function. With the AIL lock held, we need to check each log item LSN to + * confirm it needs to be moved forward in the AIL. * - * This function must be called with the AIL lock held. The lock - * is dropped before returning. + * To optimise the insert operation, we delete all the items from the AIL in + * the first pass, moving them into a temporary list, then splice the temporary + * list into the correct position in the AIL. This avoids needing to do an + * insert operation on every item. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. */ void -xfs_trans_ail_update( - struct xfs_ail *ailp, - xfs_log_item_t *lip, - xfs_lsn_t lsn) __releases(ailp->xa_lock) +xfs_trans_ail_update_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock) { - xfs_log_item_t *dlip = NULL; - xfs_log_item_t *mlip; /* ptr to minimum lip */ + xfs_log_item_t *mlip; xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; + LIST_HEAD(tmp); mlip = xfs_ail_min(ailp); - if (lip->li_flags & XFS_LI_IN_AIL) { - dlip = xfs_ail_delete(ailp, lip); - ASSERT(dlip == lip); - xfs_trans_ail_cursor_clear(ailp, dlip); - } else { - lip->li_flags |= XFS_LI_IN_AIL; + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (lip->li_flags & XFS_LI_IN_AIL) { + /* check if we really need to move the item */ + if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) + continue; + + xfs_ail_delete(ailp, lip); + if (mlip == lip) + mlip_changed = 1; + } else { + lip->li_flags |= XFS_LI_IN_AIL; + } + lip->li_lsn = lsn; + list_add(&lip->li_ail, &tmp); } - lip->li_lsn = lsn; - xfs_ail_insert(ailp, lip); + xfs_ail_splice(ailp, &tmp, lsn); - if (mlip == dlip) { - mlip = xfs_ail_min(ailp); - /* - * It is not safe to access mlip
after the AIL lock is - * dropped, so we must get a copy of li_lsn before we do - * so. This is especially important on 32-bit platforms - * where accessing and updating 64-bit values like li_lsn - * is not atomic. - */ - tail_lsn = mlip->li_lsn; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); - } else { + if (!mlip_changed) { spin_unlock(&ailp->xa_lock); + return; } - -} /* xfs_trans_update_ail */ + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip->li_lsn; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} /* - * Delete the given item from the AIL. It must already be in - * the AIL. + * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL * - * Wakeup anyone with an lsn less than item's lsn. If the item - * we delete in the AIL is the minimum one, update the tail lsn in the - * log manager. + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be + * removed from the AIL. The caller is already holding the AIL lock, and has + * done all the checks necessary to ensure the items passed in via @log_items + * are ready for deletion, including checking that the items are in the AIL. * - * Clear the IN_AIL flag from the item, reset its lsn to 0, and - * bump the AIL's generation count to indicate that the tree - * has changed. + * For each log item to be removed, unlink it from the AIL, clear the IN_AIL + * flag from the item and reset the item's lsn to 0. If we remove the first + * item in the AIL, update the log tail to match the new minimum LSN in the + * AIL. * - * This function must be called with the AIL lock held. The lock - * is dropped before returning.
+ * This function will not drop the AIL lock until all items are removed from + * the AIL to minimise the amount of lock traffic on the AIL. This does not + * greatly increase the AIL hold time, but does significantly reduce the amount + * of traffic on the lock, especially during IO completion. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. */ void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip) __releases(ailp->xa_lock) +xfs_trans_ail_delete_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items) __releases(ailp->xa_lock) { - xfs_log_item_t *dlip; xfs_log_item_t *mlip; xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; - if (lip->li_flags & XFS_LI_IN_AIL) { - mlip = xfs_ail_min(ailp); - dlip = xfs_ail_delete(ailp, lip); - ASSERT(dlip == lip); - xfs_trans_ail_cursor_clear(ailp, dlip); - + mlip = xfs_ail_min(ailp); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + struct xfs_mount *mp = ailp->xa_mount; - if (mlip == dlip) { - mlip = xfs_ail_min(ailp); - /* - * It is not safe to access mlip after the AIL lock - * is dropped, so we must get a copy of li_lsn - * before we do so. This is especially important - * on 32-bit platforms where accessing and updating - * 64-bit values like li_lsn is not atomic. - */ - tail_lsn = mlip ? 
mlip->li_lsn : 0; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); - } else { spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + return; } + + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + if (mlip == lip) + mlip_changed = 1; } - else { - /* - * If the file system is not being shutdown, we are in - * serious trouble if we get to this stage. - */ - struct xfs_mount *mp = ailp->xa_mount; + if (!mlip_changed) { spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } + return; } -} - + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. It is possible we've emptied the + * AIL here, so if that is the case, pass an LSN of 0 to the tail move. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip ? mlip->li_lsn : 0; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} /* * The active item list (AIL) is a doubly linked list of log @@ -623,16 +646,13 @@ xfs_trans_ail_destroy( } /* - * Insert the given log item into the AIL. - * We almost always insert at the end of the list, so on inserts - * we search from the end of the list to find where the - * new item belongs. + * splice the log item list into the AIL at the given LSN. 
*/ STATIC void -xfs_ail_insert( +xfs_ail_splice( struct xfs_ail *ailp, - xfs_log_item_t *lip) -/* ARGSUSED */ + struct list_head *list, + xfs_lsn_t lsn) { xfs_log_item_t *next_lip; @@ -640,39 +660,33 @@ xfs_ail_insert( * If the list is empty, just insert the item. */ if (list_empty(&ailp->xa_ail)) { - list_add(&lip->li_ail, &ailp->xa_ail); + list_splice(list, &ailp->xa_ail); return; } list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { - if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) + if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) break; } ASSERT((&next_lip->li_ail == &ailp->xa_ail) || - (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); - - list_add(&lip->li_ail, &next_lip->li_ail); + (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)); - xfs_ail_check(ailp, lip); + list_splice_init(list, &next_lip->li_ail); return; } /* * Delete the given item from the AIL. Return a pointer to the item. */ -/*ARGSUSED*/ -STATIC xfs_log_item_t * +STATIC void xfs_ail_delete( struct xfs_ail *ailp, xfs_log_item_t *lip) -/* ARGSUSED */ { xfs_ail_check(ailp, lip); - list_del(&lip->li_ail); - - return lip; + xfs_trans_ail_cursor_clear(ailp, lip); } /* @@ -682,7 +696,6 @@ xfs_ail_delete( STATIC xfs_log_item_t * xfs_ail_min( struct xfs_ail *ailp) -/* ARGSUSED */ { if (list_empty(&ailp->xa_ail)) return NULL; @@ -699,7 +712,6 @@ STATIC xfs_log_item_t * xfs_ail_next( struct xfs_ail *ailp, xfs_log_item_t *lip) -/* ARGSUSED */ { if (lip->li_ail.next == &ailp->xa_ail) return NULL; diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index f783d5e9fa7..f7590f5bade 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; - next_extent = efip->efi_next_extent; + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. 
+ */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; ASSERT(next_extent < efip->efi_format.efi_nextents); extp = &(efip->efi_format.efi_extents[next_extent]); extp->ext_start = start_block; extp->ext_len = ext_len; - efip->efi_next_extent++; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 62da86c90de..35162c238fa 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -22,15 +22,17 @@ struct xfs_log_item; struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, int flags); -void xfs_trans_item_committed(struct xfs_log_item *lip, - xfs_lsn_t commit_lsn, int aborted); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); /* * AIL traversal cursor. 
* @@ -73,12 +75,29 @@ struct xfs_ail { /* * From xfs_trans_ail.c */ -void xfs_trans_ail_update(struct xfs_ail *ailp, - struct xfs_log_item *lip, xfs_lsn_t lsn) - __releases(ailp->xa_lock); -void xfs_trans_ail_delete(struct xfs_ail *ailp, - struct xfs_log_item *lip) - __releases(ailp->xa_lock); +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn); +} + +void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items) + __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) __releases(ailp->xa_lock) +{ + xfs_trans_ail_delete_bulk(ailp, &lip, 1); +} + void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_trans_unlocked_item(struct xfs_ail *, xfs_log_item_t *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 8e4a63c4151..d8e6f8cd6f0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -964,29 +964,48 @@ xfs_release( xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); } - if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + if (ip->i_d.di_nlink == 0) + return 0; - /* - * If we can't get the iolock just skip truncating - * the blocks past EOF because we could deadlock - * with the mmap_sem otherwise. We'll get another - * chance to drop them once the last reference to - * the inode is dropped, so we'll never leak blocks - * permanently. 
- */ - error = xfs_free_eofblocks(mp, ip, - XFS_FREE_EOF_TRYLOCK); - if (error) - return error; - } - } + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * oustanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } return 0; } |