From 118b23022512eb2f41ce42db70dc0568d00be4ba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 24 Aug 2013 12:08:17 -0400 Subject: cope with potentially long ->d_dname() output for shmem/hugetlb dynamic_dname() is both too much and too little for those - the output may be well in excess of 64 bytes dynamic_dname() assumes to be enough (thanks to ashmem feeding really long names to shmem_file_setup()) and vsnprintf() is an overkill for those guys. Signed-off-by: Al Viro --- fs/dcache.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 87bdb5329c3..83cfb834db0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2724,6 +2724,17 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, return memcpy(buffer, temp, sz); } +char *simple_dname(struct dentry *dentry, char *buffer, int buflen) +{ + char *end = buffer + buflen; + /* these dentries are never renamed, so d_lock is not needed */ + if (prepend(&end, &buflen, " (deleted)", 11) || + prepend_name(&end, &buflen, &dentry->d_name) || + prepend(&end, &buflen, "/", 1)) + end = ERR_PTR(-ENAMETOOLONG); + return end; +} + /* * Write full pathname from the root of the filesystem into the buffer. */ -- cgit v1.2.3-70-g09d2 From 98474236f72e5a8b89c14cd7c74f0bb77a4b1a99 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 28 Aug 2013 18:24:59 -0700 Subject: vfs: make the dentry cache use the lockref infrastructure This just replaces the dentry count/lock combination with the lockref structure that contains both a count and a spinlock, and does the mechanical conversion to use the lockref infrastructure. There are no semantic changes here, it's purely syntactic. The reference lockref implementation uses the spinlock exactly the same way that the old dcache code did, and the bulk of this patch is just expanding the internal "d_count" use in the dcache code to use "d_lockref.count" instead. This is purely preparation for the real change to make the reference count updates be lockless during the 3.12 merge window. [ As with the previous commit, this is a rewritten version of a concept originally from Waiman, so credit goes to him, blame for any errors goes to me. Waiman's patch had some semantic differences for taking advantage of the lockless update in dget_parent(), while this patch is intentionally a pure search-and-replace change with no semantic changes. - Linus ] Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- fs/dcache.c | 57 ++++++++++++++++++++------------------------------ fs/namei.c | 6 +++--- include/linux/dcache.h | 19 ++++++++--------- 3 files changed, 35 insertions(+), 47 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 83cfb834db0..b949af850cd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -229,7 +229,7 @@ static void __d_free(struct rcu_head *head) */ static void d_free(struct dentry *dentry) { - BUG_ON(dentry->d_count); + BUG_ON(dentry->d_lockref.count); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -467,7 +467,7 @@ relock: } if (ref) - dentry->d_count--; + dentry->d_lockref.count--; /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. @@ -513,15 +513,10 @@ void dput(struct dentry *dentry) return; repeat: - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) might_sleep(); - spin_lock(&dentry->d_lock); - BUG_ON(!dentry->d_count); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } if (dentry->d_flags & DCACHE_OP_DELETE) { if (dentry->d_op->d_delete(dentry)) @@ -535,7 +530,7 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); - dentry->d_count--; + dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return; @@ -590,7 +585,7 @@ int d_invalidate(struct dentry * dentry) * We also need to leave mountpoints alone, * directory or not. */ - if (dentry->d_count > 1 && dentry->d_inode) { + if (dentry->d_lockref.count > 1 && dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; @@ -606,14 +601,12 @@ EXPORT_SYMBOL(d_invalidate); /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) { - dentry->d_count++; + dentry->d_lockref.count++; } static inline void __dget(struct dentry *dentry) { - spin_lock(&dentry->d_lock); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); + lockref_get(&dentry->d_lockref); } struct dentry *dget_parent(struct dentry *dentry) @@ -634,8 +627,8 @@ repeat: goto repeat; } rcu_read_unlock(); - BUG_ON(!ret->d_count); - ret->d_count++; + BUG_ON(!ret->d_lockref.count); + ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } @@ -718,7 +711,7 @@ restart: spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); - if (!dentry->d_count) { + if (!dentry->d_lockref.count) { __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -763,12 +756,8 @@ static void try_prune_one_dentry(struct dentry *dentry) /* Prune ancestors. */ dentry = parent; while (dentry) { - spin_lock(&dentry->d_lock); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } dentry = dentry_kill(dentry, 1); } } @@ -793,7 +782,7 @@ static void shrink_dentry_list(struct list_head *list) * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. */ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; @@ -913,7 +902,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry_lru_del(dentry); __d_shrink(dentry); - if (dentry->d_count != 0) { + if (dentry->d_lockref.count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" " still in use (%d)" @@ -922,7 +911,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry->d_name.name, - dentry->d_count, + dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); BUG(); @@ -933,7 +922,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) list_del(&dentry->d_u.d_child); } else { parent = dentry->d_parent; - parent->d_count--; + parent->d_lockref.count--; list_del(&dentry->d_u.d_child); } @@ -981,7 +970,7 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - dentry->d_count--; + dentry->d_lockref.count--; shrink_dcache_for_umount_subtree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { @@ -1147,7 +1136,7 @@ resume: * loop in shrink_dcache_parent() might not make any progress * and loop forever. */ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { dentry_lru_move_list(dentry, dispose); @@ -1269,7 +1258,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) smp_wmb(); dentry->d_name.name = dname; - dentry->d_count = 1; + dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_init(&dentry->d_seq); @@ -1970,7 +1959,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) goto next; } - dentry->d_count++; + dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; @@ -2069,7 +2058,7 @@ again: spin_lock(&dentry->d_lock); inode = dentry->d_inode; isdir = S_ISDIR(inode->i_mode); - if (dentry->d_count == 1) { + if (dentry->d_lockref.count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); cpu_relax(); @@ -2948,7 +2937,7 @@ resume: } if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_count--; + dentry->d_lockref.count--; } spin_unlock(&dentry->d_lock); } @@ -2956,7 +2945,7 @@ resume: struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; - this_parent->d_count--; + this_parent->d_lockref.count--; } this_parent = try_to_ascend(this_parent, locked, seq); if (!this_parent) diff --git a/fs/namei.c b/fs/namei.c index 8b61d103a8a..7720fbd5277 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -536,8 +536,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) * a reference at this point. */ BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; + BUG_ON(!parent->d_lockref.count); + parent->d_lockref.count++; spin_unlock(&dentry->d_lock); } spin_unlock(&parent->d_lock); @@ -3327,7 +3327,7 @@ void dentry_unhash(struct dentry *dentry) { shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) __d_drop(dentry); spin_unlock(&dentry->d_lock); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4a12532da8c..efdc94434c3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -9,6 +9,7 @@ #include #include #include +#include struct nameidata; struct path; @@ -100,6 +101,8 @@ extern unsigned int full_name_hash(const unsigned char *, unsigned int); # endif #endif +#define d_lock d_lockref.lock + struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ @@ -112,8 +115,7 @@ struct dentry { unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ /* Ref lookup also touches following */ - unsigned int d_count; /* protected by d_lock */ - spinlock_t d_lock; /* per dentry lock */ + struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ @@ -318,7 +320,7 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) assert_spin_locked(&dentry->d_lock); if (!read_seqcount_retry(&dentry->d_seq, seq)) { ret = 1; - dentry->d_count++; + dentry->d_lockref.count++; } return ret; @@ -326,7 +328,7 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) static inline unsigned d_count(const struct dentry *dentry) { - return dentry->d_count; + return dentry->d_lockref.count; } /* validate "insecure" dentry pointer */ @@ -357,17 +359,14 @@ extern char *dentry_path(struct dentry *, char *, int); static inline struct dentry *dget_dlock(struct dentry *dentry) { if (dentry) - dentry->d_count++; + dentry->d_lockref.count++; return dentry; } static inline struct dentry *dget(struct dentry *dentry) { - if (dentry) { - spin_lock(&dentry->d_lock); - dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - } + if (dentry) + lockref_get(&dentry->d_lockref); return dentry; } -- cgit v1.2.3-70-g09d2 From df3d0bbcdb2cafa23a70223d806655bd37e64a9b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 2 Sep 2013 11:29:22 -0700 Subject: vfs: use lockref_get_not_zero() for optimistic lockless dget_parent() A valid parent pointer is always going to have a non-zero reference count, but if we look up the parent optimistically without locking, we have to protect against the (very unlikely) race against renaming changing the parent from under us. We do that by using lockref_get_not_zero(), and then re-checking the parent pointer after getting a valid reference. [ This is a re-implementation of a chunk from the original patch by Waiman Long: "dcache: Enable lockless update of dentry's refcount". I've completely rewritten the patch-series and split it up, but I'm attributing this part to Waiman as it's close enough to his earlier patch - Linus ] Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- fs/dcache.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index b949af850cd..2d244227999 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -611,8 +611,23 @@ static inline void __dget(struct dentry *dentry) struct dentry *dget_parent(struct dentry *dentry) { + int gotref; struct dentry *ret; + /* + * Do optimistic parent lookup without any + * locking. + */ + rcu_read_lock(); + ret = ACCESS_ONCE(dentry->d_parent); + gotref = lockref_get_not_zero(&ret->d_lockref); + rcu_read_unlock(); + if (likely(gotref)) { + if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + return ret; + dput(ret); + } + repeat: /* * Don't need rcu_dereference because we re-check it was correct under -- cgit v1.2.3-70-g09d2 From 15570086b590a69d59183b08a7770e316cca20a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 2 Sep 2013 11:38:06 -0700 Subject: vfs: reimplement d_rcu_to_refcount() using lockref_get_or_lock() This moves __d_rcu_to_refcount() from into fs/namei.c and re-implements it using the lockref infrastructure instead. It also adds a lot of comments about what is actually going on, because turning a dentry that was looked up using RCU into a long-lived reference counted entry is one of the more subtle parts of the rcu walk. We also used to be _particularly_ subtle in unlazy_walk() where we re-validate both the dentry and its parent using the same sequence count. We used to do it by nesting the locks and then verifying the sequence count just once. That was silly, because nested locking is expensive, but the sequence count check is not. So this just re-validates the dentry and the parent separately, avoiding the nested locking, and making the lockref lookup possible. Acked-by: Waiman Long Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- fs/namei.c | 90 +++++++++++++++++++++++++++++++++++--------------- include/linux/dcache.h | 22 ------------ 3 files changed, 65 insertions(+), 49 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 2d244227999..96655f4f457 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1786,7 +1786,7 @@ static noinline enum slow_d_compare slow_dentry_cmp( * without taking d_lock and checking d_seq sequence count against @seq * returned here. * - * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * A refcount may be taken on the found dentry with the d_rcu_to_refcount * function. * * Alternatively, __d_lookup_rcu may be called again to look up the child of diff --git a/fs/namei.c b/fs/namei.c index 7720fbd5277..2c30c84d4ea 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -494,6 +494,50 @@ static inline void unlock_rcu_walk(void) br_read_unlock(&vfsmount_lock); } +/* + * When we move over from the RCU domain to properly refcounted + * long-lived dentries, we need to check the sequence numbers + * we got before lookup very carefully. + * + * We cannot blindly increment a dentry refcount - even if it + * is not locked - if it is zero, because it may have gone + * through the final d_kill() logic already. + * + * So for a zero refcount, we need to get the spinlock (which is + * safe even for a dead dentry because the de-allocation is + * RCU-delayed), and check the sequence count under the lock. + * + * Once we have checked the sequence count, we know it is live, + * and since we hold the spinlock it cannot die from under us. + * + * In contrast, if the reference count wasn't zero, we can just + * increment the lockref without having to take the spinlock. + * Even if the sequence number ends up being stale, we haven't + * gone through the final dput() and killed the dentry yet. + */ +static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq) +{ + int gotref; + + gotref = lockref_get_or_lock(&dentry->d_lockref); + + /* Does the sequence number still match? */ + if (read_seqcount_retry(validate, seq)) { + if (gotref) + dput(dentry); + else + spin_unlock(&dentry->d_lock); + return -ECHILD; + } + + /* Get the ref now, if we couldn't get it originally */ + if (!gotref) { + dentry->d_lockref.count++; + spin_unlock(&dentry->d_lock); + } + return 0; +} + /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data @@ -518,29 +562,28 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) nd->root.dentry != fs->root.dentry) goto err_root; } - spin_lock(&parent->d_lock); + + /* + * For a negative lookup, the lookup sequence point is the parents + * sequence point, and it only needs to revalidate the parent dentry. + * + * For a positive lookup, we need to move both the parent and the + * dentry from the RCU domain to be properly refcounted. And the + * sequence number in the dentry validates *both* dentry counters, + * since we checked the sequence number of the parent after we got + * the child sequence number. So we know the parent must still + * be valid if the child sequence number is still valid. + */ if (!dentry) { - if (!__d_rcu_to_refcount(parent, nd->seq)) - goto err_parent; + if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0) + goto err_root; BUG_ON(nd->inode != parent->d_inode); } else { - if (dentry->d_parent != parent) + if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) + goto err_root; + if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0) goto err_parent; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_child; - /* - * If the sequence check on the child dentry passed, then - * the child has not been removed from its parent. This - * means the parent dentry must be valid and able to take - * a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_lockref.count); - parent->d_lockref.count++; - spin_unlock(&dentry->d_lock); } - spin_unlock(&parent->d_lock); if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); @@ -551,10 +594,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) nd->flags &= ~LOOKUP_RCU; return 0; -err_child: - spin_unlock(&dentry->d_lock); err_parent: - spin_unlock(&parent->d_lock); + dput(dentry); err_root: if (want_root) spin_unlock(&fs->lock); @@ -585,14 +626,11 @@ static int complete_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { - spin_unlock(&dentry->d_lock); + + if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) { unlock_rcu_walk(); return -ECHILD; } - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); mntget(nd->path.mnt); unlock_rcu_walk(); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index efdc94434c3..9169b91ea2d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -304,28 +304,6 @@ extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); -/** - * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok - * @dentry: dentry to take a ref on - * @seq: seqcount to verify against - * Returns: 0 on failure, else 1. - * - * __d_rcu_to_refcount operates on a dentry,seq pair that was returned - * by __d_lookup_rcu, to get a reference on an rcu-walk dentry. - */ -static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) -{ - int ret = 0; - - assert_spin_locked(&dentry->d_lock); - if (!read_seqcount_retry(&dentry->d_seq, seq)) { - ret = 1; - dentry->d_lockref.count++; - } - - return ret; -} - static inline unsigned d_count(const struct dentry *dentry) { return dentry->d_lockref.count; -- cgit v1.2.3-70-g09d2 From 590fb51f1cf99c4a48a3b1bd65885192e877b561 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 13 Aug 2013 15:42:02 +0800 Subject: vfs: call d_op->d_prune() before unhashing dentry The d_prune dentry operation is used to notify filesystem when VFS about to prune a hashed dentry from the dcache. There are three code paths that prune dentries: shrink_dcache_for_umount_subtree(), prune_dcache_sb() and d_prune_aliases(). For the d_prune_aliases() case, VFS unhashes the dentry first, then call the d_prune dentry operation. This confuses ceph_d_prune() (ceph uses the d_prune dentry operation to maintain a flag indicating whether the complete contents of a directory are in the dcache, pruning unhashed dentry does not affect dir's completeness) This patch fixes the issue by calling the d_prune dentry operation in d_prune_aliases(), before unhashing the dentry. Also make VFS only call the d_prune dentry operation for hashed dentry, to avoid calling the d_prune dentry operation twice when dentry is pruned by d_prune_aliases(). Signed-off-by: Yan, Zheng Signed-off-by: Al Viro --- fs/dcache.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 83cfb834db0..2e5f9ca8328 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -472,7 +472,7 @@ relock: * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ - if (dentry->d_flags & DCACHE_OP_PRUNE) + if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); dentry_lru_del(dentry); @@ -719,6 +719,14 @@ restart: hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_count) { + /* + * inform the fs via d_prune that this dentry + * is about to be unhashed and destroyed. + */ + if ((dentry->d_flags & DCACHE_OP_PRUNE) && + !d_unhashed(dentry)) + dentry->d_op->d_prune(dentry); + __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -907,7 +915,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) * inform the fs that this dentry is about to be * unhashed and destroyed. */ - if (dentry->d_flags & DCACHE_OP_PRUNE) + if ((dentry->d_flags & DCACHE_OP_PRUNE) && + !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); dentry_lru_del(dentry); -- cgit v1.2.3-70-g09d2 From 01ddc4ede5f09cdaed015d49ab66eec179085a2e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 5 Sep 2013 11:44:34 +0200 Subject: vfs: restructure d_genocide() It shouldn't matter when we decrement the refcount during the walk as long as we do it exactly once. Restructure d_genocide() to do the killing on entering the dentry instead of when leaving it. This helps creating a common helper for tree walking. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/dcache.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 5aa53bc056b..f792e9f20ec 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2952,6 +2952,10 @@ resume: spin_unlock(&dentry->d_lock); continue; } + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_lockref.count--; + } if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); @@ -2959,18 +2963,10 @@ resume: spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } - if (!(dentry->d_flags & DCACHE_GENOCIDE)) { - dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_lockref.count--; - } spin_unlock(&dentry->d_lock); } if (this_parent != root) { struct dentry *child = this_parent; - if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { - this_parent->d_flags |= DCACHE_GENOCIDE; - this_parent->d_lockref.count--; - } this_parent = try_to_ascend(this_parent, locked, seq); if (!this_parent) goto rename_retry; -- cgit v1.2.3-70-g09d2 From db14fc3abcd5dcc9b32ad5b9dd5b8f9e16295a39 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 5 Sep 2013 11:44:35 +0200 Subject: vfs: add d_walk() This one replaces three instances open coded tree walking (have_submounts, select_parent, d_genocide) with a common helper. In addition to slightly reducing the kernel size, this simplifies the callers and makes them less bug prone. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/dcache.c | 309 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 148 insertions(+), 161 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index f792e9f20ec..043c5b478a7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1031,34 +1031,56 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq return new; } +/** + * enum d_walk_ret - action to talke during tree walk + * @D_WALK_CONTINUE: contrinue walk + * @D_WALK_QUIT: quit walk + * @D_WALK_NORETRY: quit when retry is needed + * @D_WALK_SKIP: skip this dentry and its children + */ +enum d_walk_ret { + D_WALK_CONTINUE, + D_WALK_QUIT, + D_WALK_NORETRY, + D_WALK_SKIP, +}; -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ - /** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. + * d_walk - walk the dentry tree + * @parent: start of walk + * @data: data passed to @enter() and @finish() + * @enter: callback when first entering the dentry + * @finish: callback when successfully finished the walk * - * Return true if the parent or its subdirectories contain - * a mount point + * The @enter() and @finish() callbacks are called with d_lock held. */ -int have_submounts(struct dentry *parent) +static void d_walk(struct dentry *parent, void *data, + enum d_walk_ret (*enter)(void *, struct dentry *), + void (*finish)(void *)) { struct dentry *this_parent; struct list_head *next; unsigned seq; int locked = 0; + enum d_walk_ret ret; + bool retry = true; seq = read_seqbegin(&rename_lock); again: this_parent = parent; - - if (d_mountpoint(parent)) - goto positive; spin_lock(&this_parent->d_lock); + + ret = enter(data, this_parent); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: + case D_WALK_SKIP: + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + } repeat: next = this_parent->d_subdirs.next; resume: @@ -1068,12 +1090,22 @@ resume: next = tmp->next; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - /* Have we found a mount point ? */ - if (d_mountpoint(dentry)) { + + ret = enter(data, dentry); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: spin_unlock(&dentry->d_lock); - spin_unlock(&this_parent->d_lock); - goto positive; + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + case D_WALK_SKIP: + spin_unlock(&dentry->d_lock); + continue; } + if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); @@ -1094,26 +1126,61 @@ resume: next = child->d_u.d_child.next; goto resume; } - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return 0; /* No mount points found in tree */ -positive: - if (!locked && read_seqretry(&rename_lock, seq)) + if (!locked && read_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); goto rename_retry; + } + if (finish) + finish(data); + +out_unlock: + spin_unlock(&this_parent->d_lock); if (locked) write_sequnlock(&rename_lock); - return 1; + return; rename_retry: + if (!retry) + return; if (locked) goto again; locked = 1; write_seqlock(&rename_lock); goto again; } + +/* + * Search for at least 1 mount point in the dentry's subdirs. + * We descend to the next level whenever the d_subdirs + * list is non-empty and continue searching. + */ + +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. + * + * Return true if the parent or its subdirectories contain + * a mount point + */ + +static enum d_walk_ret check_mount(void *data, struct dentry *dentry) +{ + int *ret = data; + if (d_mountpoint(dentry)) { + *ret = 1; + return D_WALK_QUIT; + } + return D_WALK_CONTINUE; +} + +int have_submounts(struct dentry *parent) +{ + int ret = 0; + + d_walk(parent, &ret, check_mount, NULL); + + return ret; +} EXPORT_SYMBOL(have_submounts); /* @@ -1130,93 +1197,46 @@ EXPORT_SYMBOL(have_submounts); * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry *parent, struct list_head *dispose) -{ - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int found = 0; - int locked = 0; - seq = read_seqbegin(&rename_lock); -again: - this_parent = parent; - spin_lock(&this_parent->d_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; - - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); +struct select_data { + struct dentry *start; + struct list_head dispose; + int found; +}; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - dentry_lru_move_list(dentry, dispose); - dentry->d_flags |= DCACHE_SHRINK_LIST; - found++; - } - /* - * We can return to the caller if we have found some (this - * ensures forward progress). We'll be coming back to find - * the rest. - */ - if (found && need_resched()) { - spin_unlock(&dentry->d_lock); - goto out; - } +static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + enum d_walk_ret ret = D_WALK_CONTINUE; - /* - * Descend a level if the d_subdirs list is non-empty. - */ - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } + if (data->start == dentry) + goto out; - spin_unlock(&dentry->d_lock); - } /* - * All done at this level ... ascend and resume the search. + * move only zero ref count dentries to the dispose list. + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. */ - if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) - goto rename_retry; - next = child->d_u.d_child.next; - goto resume; + if (dentry->d_lockref.count) { + dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { + dentry_lru_move_list(dentry, &data->dispose); + dentry->d_flags |= DCACHE_SHRINK_LIST; + data->found++; + ret = D_WALK_NORETRY; } + /* + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. + */ + if (data->found && need_resched()) + ret = D_WALK_QUIT; out: - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return found; - -rename_retry: - if (found) - return found; - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); - goto again; + return ret; } /** @@ -1225,13 +1245,20 @@ rename_retry: * * Prune the dcache to remove unused children of the parent dentry. */ -void shrink_dcache_parent(struct dentry * parent) +void shrink_dcache_parent(struct dentry *parent) { - LIST_HEAD(dispose); - int found; + for (;;) { + struct select_data data; - while ((found = select_parent(parent, &dispose)) != 0) { - shrink_dentry_list(&dispose); + INIT_LIST_HEAD(&data.dispose); + data.start = parent; + data.found = 0; + + d_walk(parent, &data, select_collect, NULL); + if (!data.found) + break; + + shrink_dentry_list(&data.dispose); cond_resched(); } } @@ -2928,64 +2955,24 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) return result; } -void d_genocide(struct dentry *root) +static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int locked = 0; + struct dentry *root = data; + if (dentry != root) { + if (d_unhashed(dentry) || !dentry->d_inode) + return D_WALK_SKIP; - seq = read_seqbegin(&rename_lock); -again: - this_parent = root; - spin_lock(&this_parent->d_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; - - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (d_unhashed(dentry) || !dentry->d_inode) { - spin_unlock(&dentry->d_lock); - continue; - } if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; dentry->d_lockref.count--; } - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } - spin_unlock(&dentry->d_lock); } - if (this_parent != root) { - struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) - goto rename_retry; - next = child->d_u.d_child.next; - goto resume; - } - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return; + return D_WALK_CONTINUE; +} -rename_retry: - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); - goto again; +void d_genocide(struct dentry *parent) +{ + d_walk(parent, parent, d_genocide_kill, NULL); } void d_tmpfile(struct dentry *dentry, struct inode *inode) -- cgit v1.2.3-70-g09d2 From 848ac114e847af3f1f9141c90a39ebe79bdb13b3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 5 Sep 2013 11:44:36 +0200 Subject: vfs: check submounts and drop atomically We check submounts before doing d_drop() on a non-empty directory dentry in NFS (have_submounts()), but we do not exclude a racing mount. Process A: have_submounts() -> returns false Process B: mount() -> success Process A: d_drop() This patch prepares the ground for the fix by doing the following operations all under the same rename lock: have_submounts() shrink_dcache_parent() d_drop() This is actually an optimization since have_submounts() and shrink_dcache_parent() both traverse the same dentry tree separately. Signed-off-by: Miklos Szeredi CC: David Howells CC: Steven Whitehouse CC: Trond Myklebust CC: Greg Kroah-Hartman Signed-off-by: Al Viro --- fs/dcache.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/dcache.h | 1 + 2 files changed, 66 insertions(+) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 043c5b478a7..ce5a7e6d84c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1264,6 +1264,71 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); +static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + + if (d_mountpoint(dentry)) { + data->found = -EBUSY; + return D_WALK_QUIT; + } + + return select_collect(_data, dentry); +} + +static void check_and_drop(void *_data) +{ + struct select_data *data = _data; + + if (d_mountpoint(data->start)) + data->found = -EBUSY; + if (!data->found) + __d_drop(data->start); +} + +/** + * check_submounts_and_drop - prune dcache, check for submounts and drop + * + * All done as a single atomic operation relative to has_unlinked_ancestor(). + * Returns 0 if successfully unhashed @parent. If there were submounts then + * return -EBUSY. + * + * @dentry: dentry to prune and drop + */ +int check_submounts_and_drop(struct dentry *dentry) +{ + int ret = 0; + + /* Negative dentries can be dropped without further checks */ + if (!dentry->d_inode) { + d_drop(dentry); + goto out; + } + + for (;;) { + struct select_data data; + + INIT_LIST_HEAD(&data.dispose); + data.start = dentry; + data.found = 0; + + d_walk(dentry, &data, check_and_collect, check_and_drop); + ret = data.found; + + if (!list_empty(&data.dispose)) + shrink_dentry_list(&data.dispose); + + if (ret <= 0) + break; + + cond_resched(); + } + +out: + return ret; +} +EXPORT_SYMBOL(check_submounts_and_drop); + /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9169b91ea2d..c6d3fe50ff9 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -253,6 +253,7 @@ extern void d_prune_aliases(struct inode *); /* test whether we have any submounts in a subdir tree */ extern int have_submounts(struct dentry *); +extern int check_submounts_and_drop(struct dentry *); /* * This adds the entry to the hash queues. -- cgit v1.2.3-70-g09d2 From eed810076685c77dc9a8c5c3593e641c93caed1c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 5 Sep 2013 14:39:11 +0200 Subject: vfs: check unlinked ancestors before mount We check submounts before doing d_drop() on a non-empty directory dentry in NFS (have_submounts()), but we do not exclude a racing mount. Nor do we prevent mounts to be added to the disconnected subtree using relative paths after the d_drop(). This patch fixes these issues by checking for unlinked (unhashed, non-root) ancestors before proceeding with the mount. This is done with rename seqlock taken for write and with ->d_lock grabbed on each ancestor in turn, including our dentry itself. This ensures that the only one of check_submounts_and_drop() or has_unlinked_ancestor() can succeed. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/dcache.c | 33 +++++++++++++++++++++++++++++++++ fs/internal.h | 1 + fs/namespace.c | 11 +++++------ 3 files changed, 39 insertions(+), 6 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index ce5a7e6d84c..761e31bacbc 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1183,6 +1183,39 @@ int have_submounts(struct dentry *parent) } EXPORT_SYMBOL(have_submounts); +/* + * Called by mount code to set a mountpoint and check if the mountpoint is + * reachable (e.g. NFS can unhash a directory dentry and then the complete + * subtree can become unreachable). + * + * Only one of check_submounts_and_drop() and d_set_mounted() must succeed. For + * this reason take rename_lock and d_lock on dentry and ancestors. + */ +int d_set_mounted(struct dentry *dentry) +{ + struct dentry *p; + int ret = -ENOENT; + write_seqlock(&rename_lock); + for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { + /* Need exclusion wrt. check_submounts_and_drop() */ + spin_lock(&p->d_lock); + if (unlikely(d_unhashed(p))) { + spin_unlock(&p->d_lock); + goto out; + } + spin_unlock(&p->d_lock); + } + spin_lock(&dentry->d_lock); + if (!d_unlinked(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } + spin_unlock(&dentry->d_lock); +out: + write_sequnlock(&rename_lock); + return ret; +} + /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused diff --git a/fs/internal.h b/fs/internal.h index 7c5f01cf619..d2089379552 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,6 +126,7 @@ extern int invalidate_inodes(struct super_block *, bool); * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); +extern int d_set_mounted(struct dentry *dentry); /* * read_write.c diff --git a/fs/namespace.c b/fs/namespace.c index ad8ea9bc251..5997887cc64 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -611,6 +611,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) { struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry); struct mountpoint *mp; + int ret; list_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { @@ -626,14 +627,12 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) if (!mp) return ERR_PTR(-ENOMEM); - spin_lock(&dentry->d_lock); - if (d_unlinked(dentry)) { - spin_unlock(&dentry->d_lock); + ret = d_set_mounted(dentry); + if (ret) { kfree(mp); - return ERR_PTR(-ENOENT); + return ERR_PTR(ret); } - dentry->d_flags |= DCACHE_MOUNTED; - spin_unlock(&dentry->d_lock); + mp->m_dentry = dentry; mp->m_count = 1; list_add(&mp->m_hash, chain); -- cgit v1.2.3-70-g09d2