From 5d8943b04bab62614559f308b4e39533ca7f8e08 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Aug 2013 12:09:34 +0400 Subject: afs: get rid of redundant ->d_name.len checks No dentry can get to directory modification methods without having passed either ->lookup() or ->atomic_open(); if name is rejected by those two (or by ->d_hash()) with an error, it won't be seen by anything else. Signed-off-by: Al Viro --- fs/afs/dir.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 0b74d3176ab..646337dc520 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -751,10 +751,6 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) _enter("{%x:%u},{%s},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -816,10 +812,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) _enter("{%x:%u},{%s}", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -936,10 +928,6 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, _enter("{%x:%u},{%s},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -1005,10 +993,6 @@ static int afs_link(struct dentry *from, struct inode *dir, dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -1053,10 +1037,6 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, content); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - ret = -EINVAL; if (strlen(content) >= AFSPATHMAX) goto error; @@ -1127,10 +1107,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dvnode->fid.vid, new_dvnode->fid.vnode, new_dentry->d_name.name); - ret = -ENAMETOOLONG; - if (new_dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(orig_dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); -- cgit v1.2.3-70-g09d2 From dfc59e2c90f780653e7b0b749c2a547a9bb1b2ce Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 6 Sep 2013 16:55:36 -0400 Subject: exportfs: don't assume that ->iterate() won't feed us too long entries On some filesystems it's impossible even with fs corruption, but we'd better not rely on that, what with memcpy() into on-stack array we are doing there. Signed-off-by: Al Viro --- fs/exportfs/expfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 293bc2e47a7..a235f001688 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -231,7 +231,7 @@ static int filldir_one(void * __buf, const char * name, int len, int result = 0; buf->sequence++; - if (buf->ino == ino) { + if (buf->ino == ino && len <= NAME_MAX) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; -- cgit v1.2.3-70-g09d2 From d040790391f292bbe5bc6b990c66af9787c855a1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jul 2013 21:12:31 +0400 Subject: prune_super(): sb->s_op is never NULL Signed-off-by: Al Viro --- fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 5536a95186e..f6961ea84c5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -71,7 +71,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) if (!grab_super_passive(sb)) return -1; - if (sb->s_op && sb->s_op->nr_cached_objects) + if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); total_objects = sb->s_nr_dentry_unused + -- cgit v1.2.3-70-g09d2 From 35759521eedf60ce7d3127c5d33953cd2d1bd35f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Sep 2013 13:41:33 -0400 Subject: take unlazy_walk() into umount_lookup_last() ... and massage it a bit to reduce nesting Signed-off-by: Al Viro --- fs/namei.c | 60 +++++++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index f415c6683a8..0ab9e6756f3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2256,19 +2256,22 @@ umount_lookup_last(struct nameidata *nd, struct path *path) struct dentry *dentry; struct dentry *dir = nd->path.dentry; - if (unlikely(nd->flags & LOOKUP_RCU)) { - WARN_ON_ONCE(1); - error = -ECHILD; - goto error_check; + /* If we're in rcuwalk, drop out of it to handle last component */ + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL)) { + error = -ECHILD; + goto out; + } } nd->flags &= ~LOOKUP_PARENT; if (unlikely(nd->last_type != LAST_NORM)) { error = handle_dots(nd, nd->last_type); - if (!error) - dentry = dget(nd->path.dentry); - goto error_check; + if (error) + goto out; + dentry = dget(nd->path.dentry); + goto done; } mutex_lock(&dir->d_inode->i_mutex); @@ -2282,28 +2285,28 @@ umount_lookup_last(struct nameidata *nd, struct path *path) dentry = d_alloc(dir, &nd->last); if (!dentry) { error = -ENOMEM; - } else { - dentry = lookup_real(dir->d_inode, dentry, nd->flags); - if (IS_ERR(dentry)) - error = PTR_ERR(dentry); + goto out; } + dentry = lookup_real(dir->d_inode, dentry, nd->flags); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out; } mutex_unlock(&dir->d_inode->i_mutex); -error_check: - if (!error) { - if (!dentry->d_inode) { - error = -ENOENT; - dput(dentry); - } else { - path->dentry = dentry; - path->mnt = mntget(nd->path.mnt); - if (should_follow_link(dentry->d_inode, - nd->flags & LOOKUP_FOLLOW)) - return 1; - follow_mount(path); - } +done: + if (!dentry->d_inode) { + error = -ENOENT; + dput(dentry); + goto out; } + path->dentry = dentry; + path->mnt = mntget(nd->path.mnt); + if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW)) + return 1; + follow_mount(path); + error = 0; +out: terminate_walk(nd); return error; } @@ -2334,15 +2337,6 @@ path_umountat(int dfd, const char *name, struct path *path, unsigned int flags) if (err) goto out; - /* If we're in rcuwalk, drop out of it to handle last component */ - if (nd.flags & LOOKUP_RCU) { - err = unlazy_walk(&nd, NULL); - if (err) { - terminate_walk(&nd); - goto out; - } - } - err = umount_lookup_last(&nd, path); while (err > 0) { void *cookie; -- cgit v1.2.3-70-g09d2 From 197df04c749a07616621b762e699b1fff4102fac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Sep 2013 14:03:27 -0400 Subject: rename user_path_umountat() to user_path_mountpoint_at() ... and move the extern from linux/namei.h to fs/internal.h, along with that of vfs_path_lookup(). Signed-off-by: Al Viro --- fs/internal.h | 3 +++ fs/namei.c | 23 +++++++++++------------ fs/namespace.c | 2 +- include/linux/namei.h | 3 --- 4 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index d2089379552..2be46ea5dd0 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -45,6 +45,9 @@ extern void __init chrdev_init(void); * namei.c */ extern int __inode_permission(struct inode *, int); +extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); +extern int vfs_path_lookup(struct dentry *, struct vfsmount *, + const char *, unsigned int, struct path *); /* * namespace.c diff --git a/fs/namei.c b/fs/namei.c index 0ab9e6756f3..11184df3307 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2223,7 +2223,7 @@ user_path_parent(int dfd, const char __user *path, struct nameidata *nd, } /** - * umount_lookup_last - look up last component for umount + * mountpoint_last - look up last component for umount * @nd: pathwalk nameidata - currently pointing at parent directory of "last" * @path: pointer to container for result * @@ -2250,7 +2250,7 @@ user_path_parent(int dfd, const char __user *path, struct nameidata *nd, * to the link, and nd->path will *not* be put. */ static int -umount_lookup_last(struct nameidata *nd, struct path *path) +mountpoint_last(struct nameidata *nd, struct path *path) { int error = 0; struct dentry *dentry; @@ -2312,17 +2312,16 @@ out: } /** - * path_umountat - look up a path to be umounted + * path_mountpoint - look up a path to be umounted * @dfd: directory file descriptor to start walk from * @name: full pathname to walk * @flags: lookup flags - * @nd: pathwalk nameidata * * Look up the given name, but don't attempt to revalidate the last component. * Returns 0 and "path" will be valid on success; Retuns error otherwise. */ static int -path_umountat(int dfd, const char *name, struct path *path, unsigned int flags) +path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) { struct file *base = NULL; struct nameidata nd; @@ -2337,7 +2336,7 @@ path_umountat(int dfd, const char *name, struct path *path, unsigned int flags) if (err) goto out; - err = umount_lookup_last(&nd, path); + err = mountpoint_last(&nd, path); while (err > 0) { void *cookie; struct path link = *path; @@ -2348,7 +2347,7 @@ path_umountat(int dfd, const char *name, struct path *path, unsigned int flags) err = follow_link(&link, &nd, &cookie); if (err) break; - err = umount_lookup_last(&nd, path); + err = mountpoint_last(&nd, path); put_link(&nd, &link, cookie); } out: @@ -2362,7 +2361,7 @@ out: } /** - * user_path_umountat - lookup a path from userland in order to umount it + * user_path_mountpoint_at - lookup a path from userland in order to umount it * @dfd: directory file descriptor * @name: pathname from userland * @flags: lookup flags @@ -2376,7 +2375,7 @@ out: * Returns 0 and populates "path" on success. */ int -user_path_umountat(int dfd, const char __user *name, unsigned int flags, +user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, struct path *path) { struct filename *s = getname(name); @@ -2385,11 +2384,11 @@ user_path_umountat(int dfd, const char __user *name, unsigned int flags, if (IS_ERR(s)) return PTR_ERR(s); - error = path_umountat(dfd, s->name, path, flags | LOOKUP_RCU); + error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); if (unlikely(error == -ECHILD)) - error = path_umountat(dfd, s->name, path, flags); + error = path_mountpoint(dfd, s->name, path, flags); if (unlikely(error == -ESTALE)) - error = path_umountat(dfd, s->name, path, flags | LOOKUP_REVAL); + error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); if (likely(!error)) audit_inode(s, path->dentry, 0); diff --git a/fs/namespace.c b/fs/namespace.c index fc2b5226278..25845d1b300 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1321,7 +1321,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - retval = user_path_umountat(AT_FDCWD, name, lookup_flags, &path); + retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; mnt = real_mount(path.mnt); diff --git a/include/linux/namei.h b/include/linux/namei.h index cd09751c71a..53c18f00d9f 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -58,7 +58,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; extern int user_path_at(int, const char __user *, unsigned, struct path *); extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); -extern int user_path_umountat(int, const char __user *, unsigned int, struct path *); #define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path) #define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path) @@ -71,8 +70,6 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); -extern int vfs_path_lookup(struct dentry *, struct vfsmount *, - const char *, unsigned int, struct path *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); -- cgit v1.2.3-70-g09d2 From 2d8646510120bb1eb251ae3381e950805a877763 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Sep 2013 20:18:44 -0400 Subject: introduce kern_path_mountpoint() Signed-off-by: Al Viro --- fs/namei.c | 35 ++++++++++++++++++++++++----------- include/linux/namei.h | 1 + 2 files changed, 25 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 11184df3307..e412421210c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2360,6 +2360,20 @@ out: return err; } +static int +filename_mountpoint(int dfd, struct filename *s, struct path *path, + unsigned int flags) +{ + int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); + if (unlikely(error == -ECHILD)) + error = path_mountpoint(dfd, s->name, path, flags); + if (unlikely(error == -ESTALE)) + error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); + if (likely(!error)) + audit_inode(s, path->dentry, 0); + return error; +} + /** * user_path_mountpoint_at - lookup a path from userland in order to umount it * @dfd: directory file descriptor @@ -2380,23 +2394,22 @@ user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, { struct filename *s = getname(name); int error; - if (IS_ERR(s)) return PTR_ERR(s); - - error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); - if (unlikely(error == -ECHILD)) - error = path_mountpoint(dfd, s->name, path, flags); - if (unlikely(error == -ESTALE)) - error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); - - if (likely(!error)) - audit_inode(s, path->dentry, 0); - + error = filename_mountpoint(dfd, s, path, flags); putname(s); return error; } +int +kern_path_mountpoint(int dfd, const char *name, struct path *path, + unsigned int flags) +{ + struct filename s = {.name = name}; + return filename_mountpoint(dfd, &s, path, flags); +} +EXPORT_SYMBOL(kern_path_mountpoint); + /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. diff --git a/include/linux/namei.h b/include/linux/namei.h index 53c18f00d9f..8e47bc7a166 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -70,6 +70,7 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); +extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); -- cgit v1.2.3-70-g09d2 From ac8387199656b019dcef5e3bbe029c2e9b01a195 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Sun, 8 Sep 2013 16:47:23 +0800 Subject: autofs4 - fix device ioctl mount lookup When reconnecting to automounts at startup an autofs ioctl is used to find the device and inode of existing mounts so they can be used to open a file descriptor of possibly covered mounts. At this time the the caller might not yet "own" the mount so it can trigger calling ->d_automount(). This causes automount to hang when trying to reconnect to direct or offset mount types. Consequently kern_path() can't be used but kern_path_mountpoint() can be. Signed-off-by: Ian Kent Cc: Jeff Layton Cc: Al Viro Signed-off-by: Al Viro --- fs/autofs4/dev-ioctl.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 743c7c2c949..0f00da329e7 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -183,13 +183,14 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, return 0; } +/* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, int test(struct path *path, void *data), void *data) { struct path path; - int err = kern_path(pathname, 0, &path); + int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); if (err) return err; err = -ENOENT; @@ -197,10 +198,9 @@ static int find_autofs_mount(const char *pathname, if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); - if (!err) /* already found some */ - path_put(res); *res = path; err = 0; + break; } } if (!follow_up(&path)) @@ -486,12 +486,11 @@ static int autofs_dev_ioctl_askumount(struct file *fp, * mount if there is one or 0 if it isn't a mountpoint. * * If we aren't supplied with a file descriptor then we - * lookup the nameidata of the path and check if it is the - * root of a mount. If a type is given we are looking for - * a particular autofs mount and if we don't find a match - * we return fail. If the located nameidata path is the - * root of a mount we return 1 along with the super magic - * of the mount or 0 otherwise. + * lookup the path and check if it is the root of a mount. + * If a type is given we are looking for a particular autofs + * mount and if we don't find a match we return fail. If the + * located path is the root of a mount we return 1 along with + * the super magic of the mount or 0 otherwise. * * In both cases the the device number (as returned by * new_encode_dev()) is also returned. @@ -519,9 +518,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) - err = kern_path(name, LOOKUP_FOLLOW, &path); + err = kern_path_mountpoint(AT_FDCWD, + name, &path, LOOKUP_FOLLOW); else - err = find_autofs_mount(name, &path, test_by_type, &type); + err = find_autofs_mount(name, &path, + test_by_type, &type); if (err) goto out; devid = new_encode_dev(path.dentry->d_sb->s_dev); -- cgit v1.2.3-70-g09d2 From 232d2d60aa5469bb097f55728f65146bd49c1d25 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Sep 2013 12:18:13 -0400 Subject: dcache: Translating dentry into pathname without taking rename_lock When running the AIM7's short workload, Linus' lockref patch eliminated most of the spinlock contention. However, there were still some left: 8.46% reaim [kernel.kallsyms] [k] _raw_spin_lock |--42.21%-- d_path | proc_pid_readlink | SyS_readlinkat | SyS_readlink | system_call | __GI___readlink | |--40.97%-- sys_getcwd | system_call | __getcwd The big one here is the rename_lock (seqlock) contention in d_path() and the getcwd system call. This patch will eliminate the need to take the rename_lock while translating dentries into the full pathnames. The need to take the rename_lock is to make sure that no rename operation can be ongoing while the translation is in progress. However, only one thread can take the rename_lock thus blocking all the other threads that need it even though the translation process won't make any change to the dentries. This patch will replace the writer's write_seqlock/write_sequnlock sequence of the rename_lock of the callers of the prepend_path() and __dentry_path() functions with the reader's read_seqbegin/read_seqretry sequence within these 2 functions. As a result, the code will have to retry if one or more rename operations had been performed. In addition, RCU read lock will be taken during the translation process to make sure that no dentries will go away. To prevent live-lock from happening, the code will switch back to take the rename_lock if read_seqretry() fails for three times. To further reduce spinlock contention, this patch does not take the dentry's d_lock when copying the filename from the dentries. Instead, it treats the name pointer and length as unreliable and just copy the string byte-by-byte over until it hits a null byte or the end of string as specified by the length. This should avoid stepping into invalid memory address. The error cases are left to be handled by the sequence number check. The following code re-factoring are also made: 1. Move prepend('/') into prepend_name() to remove one conditional check. 2. Move the global root check in prepend_path() back to the top of the while loop. With this patch, the _raw_spin_lock will now account for only 1.2% of the total CPU cycles for the short workload. This patch also has the effect of reducing the effect of running perf on its profile since the perf command itself can be a heavy user of the d_path() function depending on the complexity of the workload. When taking the perf profile of the high-systime workload, the amount of spinlock contention contributed by running perf without this patch was about 16%. With this patch, the spinlock contention caused by the running of perf will go away and we will have a more accurate perf profile. Signed-off-by: Waiman Long Signed-off-by: Al Viro --- fs/dcache.c | 196 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 133 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 761e31bacbc..38b1b0989a1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -88,6 +88,44 @@ EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; +/** + * read_seqbegin_or_lock - begin a sequence number check or locking block + * lock: sequence lock + * seq : sequence number to be checked + * + * First try it once optimistically without taking the lock. If that fails, + * take the lock. The sequence number is also used as a marker for deciding + * whether to be a reader (even) or writer (odd). + * N.B. seq must be initialized to an even number to begin with. + */ +static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) +{ + if (!(*seq & 1)) { /* Even */ + *seq = read_seqbegin(lock); + rcu_read_lock(); + } else /* Odd */ + write_seqlock(lock); +} + +/** + * read_seqretry_or_unlock - end a seqretry or lock block & return retry status + * lock : sequence lock + * seq : sequence number + * Return: 1 to retry operation again, 0 to continue + */ +static inline int read_seqretry_or_unlock(seqlock_t *lock, int *seq) +{ + if (!(*seq & 1)) { /* Even */ + rcu_read_unlock(); + if (read_seqretry(lock, *seq)) { + (*seq)++; /* Take writer lock */ + return 1; + } + } else /* Odd */ + write_sequnlock(lock); + return 0; +} + /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try @@ -2644,9 +2682,39 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) return 0; } +/** + * prepend_name - prepend a pathname in front of current buffer pointer + * buffer: buffer pointer + * buflen: allocated length of the buffer + * name: name string and length qstr structure + * + * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to + * make sure that either the old or the new name pointer and length are + * fetched. However, there may be mismatch between length and pointer. + * The length cannot be trusted, we need to copy it byte-by-byte until + * the length is reached or a null byte is found. It also prepends "/" at + * the beginning of the name. The sequence number check at the caller will + * retry it again when a d_move() does happen. So any garbage in the buffer + * due to mismatched pointer and length will be discarded. + */ static int prepend_name(char **buffer, int *buflen, struct qstr *name) { - return prepend(buffer, buflen, name->name, name->len); + const char *dname = ACCESS_ONCE(name->name); + u32 dlen = ACCESS_ONCE(name->len); + char *p; + + if (*buflen < dlen + 1) + return -ENAMETOOLONG; + *buflen -= dlen + 1; + p = *buffer -= dlen + 1; + *p++ = '/'; + while (dlen--) { + char c = *dname++; + if (!c) + break; + *p++ = c; + } + return 0; } /** @@ -2656,7 +2724,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * - * Caller holds the rename_lock. + * The function tries to write out the pathname without taking any lock other + * than the RCU read lock to make sure that dentries won't go away. It only + * checks the sequence number of the global rename_lock as any change in the + * dentry's d_seq will be preceded by changes in the rename_lock sequence + * number. If the sequence number had been change, it will restart the whole + * pathname back-tracing sequence again. It performs a total of 3 trials of + * lockless back-tracing sequences before falling back to take the + * rename_lock. */ static int prepend_path(const struct path *path, const struct path *root, @@ -2665,54 +2740,60 @@ static int prepend_path(const struct path *path, struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; struct mount *mnt = real_mount(vfsmnt); - bool slash = false; int error = 0; + unsigned seq = 0; + char *bptr; + int blen; +restart: + bptr = *buffer; + blen = *buflen; + read_seqbegin_or_lock(&rename_lock, &seq); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root? */ - if (!mnt_has_parent(mnt)) - goto global_root; - dentry = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - vfsmnt = &mnt->mnt; - continue; + if (mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + vfsmnt = &mnt->mnt; + continue; + } + /* + * Filesystems needing to implement special "root names" + * should do so with ->d_dname() + */ + if (IS_ROOT(dentry) && + (dentry->d_name.len != 1 || + dentry->d_name.name[0] != '/')) { + WARN(1, "Root dentry has weird name <%.*s>\n", + (int) dentry->d_name.len, + dentry->d_name.name); + } + if (!error) + error = is_mounted(vfsmnt) ? 1 : 2; + break; } parent = dentry->d_parent; prefetch(parent); - spin_lock(&dentry->d_lock); - error = prepend_name(buffer, buflen, &dentry->d_name); - spin_unlock(&dentry->d_lock); - if (!error) - error = prepend(buffer, buflen, "/", 1); + error = prepend_name(&bptr, &blen, &dentry->d_name); if (error) break; - slash = true; dentry = parent; } + if (read_seqretry_or_unlock(&rename_lock, &seq)) + goto restart; - if (!error && !slash) - error = prepend(buffer, buflen, "/", 1); - - return error; - -global_root: - /* - * Filesystems needing to implement special "root names" - * should do so with ->d_dname() - */ - if (IS_ROOT(dentry) && - (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) { - WARN(1, "Root dentry has weird name <%.*s>\n", - (int) dentry->d_name.len, dentry->d_name.name); - } - if (!slash) - error = prepend(buffer, buflen, "/", 1); - if (!error) - error = is_mounted(vfsmnt) ? 1 : 2; + if (error >= 0 && bptr == *buffer) { + if (--blen < 0) + error = -ENAMETOOLONG; + else + *--bptr = '/'; + } + *buffer = bptr; + *buflen = blen; return error; } @@ -2741,9 +2822,7 @@ char *__d_path(const struct path *path, prepend(&res, &buflen, "\0", 1); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error < 0) @@ -2762,9 +2841,7 @@ char *d_absolute_path(const struct path *path, prepend(&res, &buflen, "\0", 1); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error > 1) @@ -2830,9 +2907,7 @@ char *d_path(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error < 0) res = ERR_PTR(error); @@ -2867,10 +2942,10 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) char *end = buffer + buflen; /* these dentries are never renamed, so d_lock is not needed */ if (prepend(&end, &buflen, " (deleted)", 11) || - prepend_name(&end, &buflen, &dentry->d_name) || + prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) || prepend(&end, &buflen, "/", 1)) end = ERR_PTR(-ENAMETOOLONG); - return end; + return end; } /* @@ -2878,30 +2953,36 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) */ static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) { - char *end = buf + buflen; - char *retval; + char *end, *retval; + int len, seq = 0; + int error = 0; - prepend(&end, &buflen, "\0", 1); +restart: + end = buf + buflen; + len = buflen; + prepend(&end, &len, "\0", 1); if (buflen < 1) goto Elong; /* Get '/' right */ retval = end-1; *retval = '/'; - + read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { struct dentry *parent = dentry->d_parent; int error; prefetch(parent); - spin_lock(&dentry->d_lock); - error = prepend_name(&end, &buflen, &dentry->d_name); - spin_unlock(&dentry->d_lock); - if (error != 0 || prepend(&end, &buflen, "/", 1) != 0) - goto Elong; + error = prepend_name(&end, &len, &dentry->d_name); + if (error) + break; retval = end; dentry = parent; } + if (read_seqretry_or_unlock(&rename_lock, &seq)) + goto restart; + if (error) + goto Elong; return retval; Elong: return ERR_PTR(-ENAMETOOLONG); @@ -2909,13 +2990,7 @@ Elong: char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) { - char *retval; - - write_seqlock(&rename_lock); - retval = __dentry_path(dentry, buf, buflen); - write_sequnlock(&rename_lock); - - return retval; + return __dentry_path(dentry, buf, buflen); } EXPORT_SYMBOL(dentry_path_raw); @@ -2924,7 +2999,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) char *p = NULL; char *retval; - write_seqlock(&rename_lock); if (d_unlinked(dentry)) { p = buf + buflen; if (prepend(&p, &buflen, "//deleted", 10) != 0) @@ -2932,7 +3006,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) buflen++; } retval = __dentry_path(dentry, buf, buflen); - write_sequnlock(&rename_lock); if (!IS_ERR(retval) && p) *p = '/'; /* restore '/' overriden with '\0' */ return retval; @@ -2971,7 +3044,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) error = -ENOENT; br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; char *cwd = page + PAGE_SIZE; @@ -2979,7 +3051,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error < 0) @@ -3000,7 +3071,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) error = -EFAULT; } } else { - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); } -- cgit v1.2.3-70-g09d2 From 48f5ec21d9c67e881ff35343988e290ef5cf933f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Sep 2013 15:22:25 -0400 Subject: split read_seqretry_or_unlock(), convert d_walk() to resulting primitives Separate "check if we need to retry" from "unlock if we are done and had seq_writelock"; that allows to use these guys in d_walk(), where we need to recheck every time we ascend back to parent, but do *not* want to unlock until the very end. Lift rcu_read_lock/rcu_read_unlock out into callers. Signed-off-by: Al Viro --- fs/dcache.c | 64 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 38b1b0989a1..b9caf47d538 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -100,30 +100,21 @@ static struct kmem_cache *dentry_cache __read_mostly; */ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) { - if (!(*seq & 1)) { /* Even */ + if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); - rcu_read_lock(); - } else /* Odd */ + else /* Odd */ write_seqlock(lock); } -/** - * read_seqretry_or_unlock - end a seqretry or lock block & return retry status - * lock : sequence lock - * seq : sequence number - * Return: 1 to retry operation again, 0 to continue - */ -static inline int read_seqretry_or_unlock(seqlock_t *lock, int *seq) +static inline int need_seqretry(seqlock_t *lock, int seq) { - if (!(*seq & 1)) { /* Even */ - rcu_read_unlock(); - if (read_seqretry(lock, *seq)) { - (*seq)++; /* Take writer lock */ - return 1; - } - } else /* Odd */ + return !(seq & 1) && read_seqretry(lock, seq); +} + +static inline void done_seqretry(seqlock_t *lock, int seq) +{ + if (seq & 1) write_sequnlock(lock); - return 0; } /* @@ -1047,7 +1038,7 @@ void shrink_dcache_for_umount(struct super_block *sb) * the parenthood after dropping the lock and check * that the sequence number still matches. */ -static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +static struct dentry *try_to_ascend(struct dentry *old, unsigned seq) { struct dentry *new = old->d_parent; @@ -1061,7 +1052,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq */ if (new != old->d_parent || (old->d_flags & DCACHE_DENTRY_KILLED) || - (!locked && read_seqretry(&rename_lock, seq))) { + need_seqretry(&rename_lock, seq)) { spin_unlock(&new->d_lock); new = NULL; } @@ -1098,13 +1089,12 @@ static void d_walk(struct dentry *parent, void *data, { struct dentry *this_parent; struct list_head *next; - unsigned seq; - int locked = 0; + unsigned seq = 0; enum d_walk_ret ret; bool retry = true; - seq = read_seqbegin(&rename_lock); again: + read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); @@ -1158,13 +1148,13 @@ resume: */ if (this_parent != parent) { struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); + this_parent = try_to_ascend(this_parent, seq); if (!this_parent) goto rename_retry; next = child->d_u.d_child.next; goto resume; } - if (!locked && read_seqretry(&rename_lock, seq)) { + if (need_seqretry(&rename_lock, seq)) { spin_unlock(&this_parent->d_lock); goto rename_retry; } @@ -1173,17 +1163,13 @@ resume: out_unlock: spin_unlock(&this_parent->d_lock); - if (locked) - write_sequnlock(&rename_lock); + done_seqretry(&rename_lock, seq); return; rename_retry: if (!retry) return; - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); + seq = 1; goto again; } @@ -2745,6 +2731,7 @@ static int prepend_path(const struct path *path, char *bptr; int blen; + rcu_read_lock(); restart: bptr = *buffer; blen = *buflen; @@ -2783,8 +2770,13 @@ restart: dentry = parent; } - if (read_seqretry_or_unlock(&rename_lock, &seq)) + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; goto restart; + } + done_seqretry(&rename_lock, seq); if (error >= 0 && bptr == *buffer) { if (--blen < 0) @@ -2957,6 +2949,7 @@ static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) int len, seq = 0; int error = 0; + rcu_read_lock(); restart: end = buf + buflen; len = buflen; @@ -2979,8 +2972,13 @@ restart: retval = end; dentry = parent; } - if (read_seqretry_or_unlock(&rename_lock, &seq)) + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; goto restart; + } + done_seqretry(&rename_lock, seq); if (error) goto Elong; return retval; -- cgit v1.2.3-70-g09d2