Diffstat (limited to 'fs/dcache.c')
-rw-r--r--	fs/dcache.c	1064
1 files changed, 710 insertions, 354 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index b949af850cd..41000305d71 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -37,6 +37,7 @@
 #include <linux/rculist_bl.h>
 #include <linux/prefetch.h>
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
 #include "internal.h"
 #include "mount.h"
 
@@ -48,7 +49,7 @@
  *   - the dcache hash table
  * s_anon bl list spinlock protects:
  *   - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
  *   - the dcache lru lists and counters
  * d_lock protects:
  *   - d_flags
@@ -63,7 +64,7 @@
  * Ordering:
  * dentry->d_inode->i_lock
  *   dentry->d_lock
- *     dcache_lru_lock
+ *     dentry->d_sb->s_dentry_lru_lock
  *     dcache_hash_bucket lock
  *     s_anon lock
  *
@@ -81,13 +82,41 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+	if (!(*seq & 1))	/* Even */
+		*seq = read_seqbegin(lock);
+	else			/* Odd */
+		read_seqlock_excl(lock);
+}
+
+static inline int need_seqretry(seqlock_t *lock, int seq)
+{
+	return !(seq & 1) && read_seqretry(lock, seq);
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+	if (seq & 1)
+		read_sequnlock_excl(lock);
+}
+
 /*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
@@ -117,23 +146,47 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
+static long get_nr_dentry(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_dentry, i);
 	return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_unused(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	dentry_stat.nr_unused = get_nr_dentry_unused();
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -229,7 +282,7 @@ static void __d_free(struct rcu_head *head)
  */
 static void d_free(struct dentry *dentry)
 {
-	BUG_ON(dentry->d_lockref.count);
+	BUG_ON((int)dentry->d_lockref.count > 0);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
@@ -304,50 +357,96 @@ static void dentry_unlink_inode(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
+ * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+ * is in use - which includes both the "real" per-superblock
+ * LRU list _and_ the DCACHE_SHRINK_LIST use.
+ *
+ * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+ * on the shrink list (ie not on the superblock LRU list).
+ *
+ * The per-cpu "nr_dentry_unused" counters are updated with
+ * the DCACHE_LRU_LIST bit.
+ *
+ * These helper functions make sure we always follow the
+ * rules. d_lock must be held by the caller.
  */
-static void dentry_lru_add(struct dentry *dentry)
+#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+static void d_lru_add(struct dentry *dentry)
 {
-	if (list_empty(&dentry->d_lru)) {
-		spin_lock(&dcache_lru_lock);
-		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, 0);
+	dentry->d_flags |= DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
-static void __dentry_lru_del(struct dentry *dentry)
+static void d_lru_del(struct dentry *dentry)
 {
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+}
+
+static void d_shrink_del(struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
 	list_del_init(&dentry->d_lru);
-	dentry->d_flags &= ~DCACHE_SHRINK_LIST;
-	dentry->d_sb->s_nr_dentry_unused--;
-	dentry_stat.nr_unused--;
+	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
+	this_cpu_dec(nr_dentry_unused);
+}
+
+static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+{
+	D_FLAG_VERIFY(dentry, 0);
+	list_add(&dentry->d_lru, list);
+	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
 }
 
 /*
- * Remove a dentry with references from the LRU.
+ * These can only be called under the global LRU lock, ie during the
+ * callback for freeing the LRU list. "isolate" removes it from the
+ * LRU lists entirely, while shrink_move moves it to the indicated
+ * private list.
  */
-static void dentry_lru_del(struct dentry *dentry)
+static void d_lru_isolate(struct dentry *dentry)
 {
-	if (!list_empty(&dentry->d_lru)) {
-		spin_lock(&dcache_lru_lock);
-		__dentry_lru_del(dentry);
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	list_del_init(&dentry->d_lru);
 }
 
-static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
 {
-	spin_lock(&dcache_lru_lock);
-	if (list_empty(&dentry->d_lru)) {
-		list_add_tail(&dentry->d_lru, list);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-	} else {
-		list_move_tail(&dentry->d_lru, list);
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags |= DCACHE_SHRINK_LIST;
+	list_move_tail(&dentry->d_lru, list);
+}
+
+/*
+ * dentry_lru_(add|del)_list) must be called with d_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+		d_lru_add(dentry);
+}
+
+/*
+ * Remove a dentry with references from the LRU.
+ *
+ * If we are on the shrink list, then we can get to try_prune_one_dentry() and
+ * lose our last reference through the parent walk. In this case, we need to
+ * remove ourselves from the shrink list, not the LRU.
+ */
+static void dentry_lru_del(struct dentry *dentry)
+{
+	if (dentry->d_flags & DCACHE_LRU_LIST) {
+		if (dentry->d_flags & DCACHE_SHRINK_LIST)
+			return d_shrink_del(dentry);
+		d_lru_del(dentry);
 	}
-	spin_unlock(&dcache_lru_lock);
 }
 
 /**
@@ -443,7 +542,8 @@ EXPORT_SYMBOL(d_drop);
  * If ref is non-zero, then decrement the refcount too.
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
-static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+static inline struct dentry *
+dentry_kill(struct dentry *dentry, int unlock_on_failure)
 	__releases(dentry->d_lock)
 {
 	struct inode *inode;
@@ -452,8 +552,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
 	inode = dentry->d_inode;
 	if (inode && !spin_trylock(&inode->i_lock)) {
 relock:
-		spin_unlock(&dentry->d_lock);
-		cpu_relax();
+		if (unlock_on_failure) {
+			spin_unlock(&dentry->d_lock);
+			cpu_relax();
+		}
 		return dentry; /* try again with same dentry */
 	}
 	if (IS_ROOT(dentry))
@@ -466,13 +568,16 @@ relock:
 		goto relock;
 	}
 
-	if (ref)
-		dentry->d_lockref.count--;
+	/*
+	 * The dentry is now unrecoverably dead to the world.
+	 */
+	lockref_mark_dead(&dentry->d_lockref);
+
 	/*
 	 * inform the fs via d_prune that this dentry is about to be
 	 * unhashed and destroyed.
 	 */
-	if (dentry->d_flags & DCACHE_OP_PRUNE)
+	if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
 		dentry->d_op->d_prune(dentry);
 
 	dentry_lru_del(dentry);
@@ -509,24 +614,22 @@ relock:
  */
 void dput(struct dentry *dentry)
 {
-	if (!dentry)
+	if (unlikely(!dentry))
 		return;
 
 repeat:
-	if (dentry->d_lockref.count == 1)
-		might_sleep();
 	if (lockref_put_or_lock(&dentry->d_lockref))
 		return;
 
-	if (dentry->d_flags & DCACHE_OP_DELETE) {
+	/* Unreachable? Get rid of it */
+	if (unlikely(d_unhashed(dentry)))
+		goto kill_it;
+
+	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
 		if (dentry->d_op->d_delete(dentry))
 			goto kill_it;
 	}
 
-	/* Unreachable? Get rid of it */
-	if (d_unhashed(dentry))
-		goto kill_it;
-
 	dentry->d_flags |= DCACHE_REFERENCED;
 	dentry_lru_add(dentry);
 
@@ -611,8 +714,23 @@ static inline void __dget(struct dentry *dentry)
 
 struct dentry *dget_parent(struct dentry *dentry)
 {
+	int gotref;
 	struct dentry *ret;
 
+	/*
+	 * Do optimistic parent lookup without any
+	 * locking.
+	 */
+	rcu_read_lock();
+	ret = ACCESS_ONCE(dentry->d_parent);
+	gotref = lockref_get_not_zero(&ret->d_lockref);
+	rcu_read_unlock();
+	if (likely(gotref)) {
+		if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
+			return ret;
+		dput(ret);
+	}
+
 repeat:
 	/*
 	 * Don't need rcu_dereference because we re-check it was correct under
@@ -712,6 +830,14 @@ restart:
 	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (!dentry->d_lockref.count) {
+			/*
+			 * inform the fs via d_prune that this dentry
+			 * is about to be unhashed and destroyed.
+			 */
+			if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+			    !d_unhashed(dentry))
+				dentry->d_op->d_prune(dentry);
+
 			__dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
@@ -732,7 +858,7 @@ EXPORT_SYMBOL(d_prune_aliases);
  *
  * This may fail if locks cannot be acquired no problem, just try again.
  */
-static void try_prune_one_dentry(struct dentry *dentry)
+static struct dentry * try_prune_one_dentry(struct dentry *dentry)
 	__releases(dentry->d_lock)
 {
 	struct dentry *parent;
@@ -749,17 +875,18 @@ static void try_prune_one_dentry(struct dentry *dentry)
 	 * fragmentation.
 	 */
 	if (!parent)
-		return;
+		return NULL;
 	if (parent == dentry)
-		return;
+		return dentry;
 
 	/* Prune ancestors. */
 	dentry = parent;
 	while (dentry) {
 		if (lockref_put_or_lock(&dentry->d_lockref))
-			return;
+			return NULL;
 		dentry = dentry_kill(dentry, 1);
 	}
+	return NULL;
 }
 
 static void shrink_dentry_list(struct list_head *list)
@@ -771,6 +898,12 @@ static void shrink_dentry_list(struct list_head *list)
 		dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
 		if (&dentry->d_lru == list)
 			break; /* empty */
+
+		/*
+		 * Get the dentry lock, and re-verify that the dentry is
+		 * this on the shrinking list. If it is, we know that
+		 * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
+		 */
 		spin_lock(&dentry->d_lock);
 		if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
 			spin_unlock(&dentry->d_lock);
@@ -778,76 +911,146 @@ static void shrink_dentry_list(struct list_head *list)
 		}
 
 		/*
+		 * The dispose list is isolated and dentries are not accounted
+		 * to the LRU here, so we can simply remove it from the list
+		 * here regardless of whether it is referenced or not.
+		 */
+		d_shrink_del(dentry);
+
+		/*
 		 * We found an inuse dentry which was not removed from
-		 * the LRU because of laziness during lookup. Do not free
-		 * it - just keep it off the LRU list.
+		 * the LRU because of laziness during lookup. Do not free it.
 		 */
 		if (dentry->d_lockref.count) {
-			dentry_lru_del(dentry);
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-
 		rcu_read_unlock();
 
-		try_prune_one_dentry(dentry);
+		/*
+		 * If 'try_to_prune()' returns a dentry, it will
+		 * be the same one we passed in, and d_lock will
+		 * have been held the whole time, so it will not
+		 * have been added to any other lists. We failed
+		 * to get the inode lock.
+		 *
+		 * We just add it back to the shrink list.
+		 */
+		dentry = try_prune_one_dentry(dentry);
 
 		rcu_read_lock();
+		if (dentry) {
+			d_shrink_add(dentry, list);
+			spin_unlock(&dentry->d_lock);
+		}
 	}
 	rcu_read_unlock();
 }
 
+static enum lru_status
+dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	/*
+	 * Referenced dentries are still in use. If they have active
+	 * counts, just remove them from the LRU. Otherwise give them
+	 * another pass through the LRU.
+	 */
+	if (dentry->d_lockref.count) {
+		d_lru_isolate(dentry);
+		spin_unlock(&dentry->d_lock);
+		return LRU_REMOVED;
+	}
+
+	if (dentry->d_flags & DCACHE_REFERENCED) {
+		dentry->d_flags &= ~DCACHE_REFERENCED;
+		spin_unlock(&dentry->d_lock);
+
+		/*
+		 * The list move itself will be made by the common LRU code. At
+		 * this point, we've dropped the dentry->d_lock but keep the
+		 * lru lock. This is safe to do, since every list movement is
+		 * protected by the lru lock even if both locks are held.
+		 *
+		 * This is guaranteed by the fact that all LRU management
+		 * functions are intermediated by the LRU API calls like
+		 * list_lru_add and list_lru_del. List movement in this file
+		 * only ever occur through this functions or through callbacks
+		 * like this one, that are called from the LRU API.
+		 *
+		 * The only exceptions to this are functions like
+		 * shrink_dentry_list, and code that first checks for the
+		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
+		 * operating only with stack provided lists after they are
+		 * properly isolated from the main list.  It is thus, always a
+		 * local access.
+		 */
+		return LRU_ROTATE;
+	}
+
+	d_lru_shrink_move(dentry, freeable);
+	spin_unlock(&dentry->d_lock);
+
+	return LRU_REMOVED;
+}
+
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @count: number of entries to try to free
+ * @nr_to_scan : number of entries to try to free
+ * @nid: which node to scan for freeable entities
  *
- * Attempt to shrink the superblock dcache LRU by @count entries. This is
+ * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
  * done when we need more memory an called from the superblock shrinker
  * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-void prune_dcache_sb(struct super_block *sb, int count)
+long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
+		     int nid)
 {
-	struct dentry *dentry;
-	LIST_HEAD(referenced);
-	LIST_HEAD(tmp);
+	LIST_HEAD(dispose);
+	long freed;
 
-relock:
-	spin_lock(&dcache_lru_lock);
-	while (!list_empty(&sb->s_dentry_lru)) {
-		dentry = list_entry(sb->s_dentry_lru.prev,
-				struct dentry, d_lru);
-		BUG_ON(dentry->d_sb != sb);
-
-		if (!spin_trylock(&dentry->d_lock)) {
-			spin_unlock(&dcache_lru_lock);
-			cpu_relax();
-			goto relock;
-		}
+	freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
+				   &dispose, &nr_to_scan);
+	shrink_dentry_list(&dispose);
+	return freed;
+}
 
-		if (dentry->d_flags & DCACHE_REFERENCED) {
-			dentry->d_flags &= ~DCACHE_REFERENCED;
-			list_move(&dentry->d_lru, &referenced);
-			spin_unlock(&dentry->d_lock);
-		} else {
-			list_move_tail(&dentry->d_lru, &tmp);
-			dentry->d_flags |= DCACHE_SHRINK_LIST;
-			spin_unlock(&dentry->d_lock);
-			if (!--count)
-				break;
-		}
-		cond_resched_lock(&dcache_lru_lock);
-	}
-	if (!list_empty(&referenced))
-		list_splice(&referenced, &sb->s_dentry_lru);
-	spin_unlock(&dcache_lru_lock);
+static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
+						spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	d_lru_shrink_move(dentry, freeable);
+	spin_unlock(&dentry->d_lock);
 
-	shrink_dentry_list(&tmp);
+	return LRU_REMOVED;
 }
 
+
 /**
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
@@ -857,16 +1060,17 @@ relock:
  */
 void shrink_dcache_sb(struct super_block *sb)
 {
-	LIST_HEAD(tmp);
+	long freed;
 
-	spin_lock(&dcache_lru_lock);
-	while (!list_empty(&sb->s_dentry_lru)) {
-		list_splice_init(&sb->s_dentry_lru, &tmp);
-		spin_unlock(&dcache_lru_lock);
-		shrink_dentry_list(&tmp);
-		spin_lock(&dcache_lru_lock);
-	}
-	spin_unlock(&dcache_lru_lock);
+	do {
+		LIST_HEAD(dispose);
+
+		freed = list_lru_walk(&sb->s_dentry_lru,
+			dentry_lru_isolate_shrink, &dispose, UINT_MAX);
+
+		this_cpu_sub(nr_dentry_unused, freed);
+		shrink_dentry_list(&dispose);
+	} while (freed > 0);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
@@ -896,7 +1100,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 		 * inform the fs that this dentry is about to be
 		 * unhashed and destroyed.
 		 */
-		if (dentry->d_flags & DCACHE_OP_PRUNE)
+		if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+		    !d_unhashed(dentry))
 			dentry->d_op->d_prune(dentry);
 
 		dentry_lru_del(dentry);
@@ -985,7 +1190,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
  * the parenthood after dropping the lock and check
  * that the sequence number still matches.
  */
-static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
+static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
 {
 	struct dentry *new = old->d_parent;
 
@@ -999,7 +1204,7 @@ static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
 	 */
 	if (new != old->d_parent ||
 		 (old->d_flags & DCACHE_DENTRY_KILLED) ||
-		 (!locked && read_seqretry(&rename_lock, seq))) {
+		 need_seqretry(&rename_lock, seq)) {
 		spin_unlock(&new->d_lock);
 		new = NULL;
 	}
@@ -1007,34 +1212,55 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 	return new;
 }
 
+/**
+ * enum d_walk_ret - action to talke during tree walk
+ * @D_WALK_CONTINUE:	contrinue walk
+ * @D_WALK_QUIT:	quit walk
+ * @D_WALK_NORETRY:	quit when retry is needed
+ * @D_WALK_SKIP:	skip this dentry and its children
+ */
+enum d_walk_ret {
+	D_WALK_CONTINUE,
+	D_WALK_QUIT,
+	D_WALK_NORETRY,
+	D_WALK_SKIP,
+};
 
-/*
- * Search for at least 1 mount point in the dentry's subdirs.
- * We descend to the next level whenever the d_subdirs
- * list is non-empty and continue searching.
- */
-
 /**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
+ * d_walk - walk the dentry tree
+ * @parent:	start of walk
+ * @data:	data passed to @enter() and @finish()
+ * @enter:	callback when first entering the dentry
+ * @finish:	callback when successfully finished the walk
  *
- * Return true if the parent or its subdirectories contain
- * a mount point
+ * The @enter() and @finish() callbacks are called with d_lock held.
  */
-int have_submounts(struct dentry *parent)
+static void d_walk(struct dentry *parent, void *data,
+		   enum d_walk_ret (*enter)(void *, struct dentry *),
+		   void (*finish)(void *))
 {
 	struct dentry *this_parent;
 	struct list_head *next;
-	unsigned seq;
-	int locked = 0;
+	unsigned seq = 0;
+	enum d_walk_ret ret;
+	bool retry = true;
 
-	seq = read_seqbegin(&rename_lock);
 again:
+	read_seqbegin_or_lock(&rename_lock, &seq);
 	this_parent = parent;
-
-	if (d_mountpoint(parent))
-		goto positive;
 	spin_lock(&this_parent->d_lock);
+
+	ret = enter(data, this_parent);
+	switch (ret) {
+	case D_WALK_CONTINUE:
+		break;
+	case D_WALK_QUIT:
+	case D_WALK_SKIP:
+		goto out_unlock;
+	case D_WALK_NORETRY:
+		retry = false;
+		break;
+	}
 repeat:
 	next = this_parent->d_subdirs.next;
 resume:
@@ -1044,12 +1270,22 @@ resume:
 		next = tmp->next;
 
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-		/* Have we found a mount point ? */
-		if (d_mountpoint(dentry)) {
+
+		ret = enter(data, dentry);
+		switch (ret) {
+		case D_WALK_CONTINUE:
+			break;
+		case D_WALK_QUIT:
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&this_parent->d_lock);
-			goto positive;
+			goto out_unlock;
+		case D_WALK_NORETRY:
+			retry = false;
+			break;
+		case D_WALK_SKIP:
+			spin_unlock(&dentry->d_lock);
+			continue;
 		}
+
 		if (!list_empty(&dentry->d_subdirs)) {
 			spin_unlock(&this_parent->d_lock);
 			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
@@ -1064,35 +1300,99 @@ resume:
 	 */
 	if (this_parent != parent) {
 		struct dentry *child = this_parent;
-		this_parent = try_to_ascend(this_parent, locked, seq);
+		this_parent = try_to_ascend(this_parent, seq);
 		if (!this_parent)
 			goto rename_retry;
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
-	spin_unlock(&this_parent->d_lock);
-	if (!locked && read_seqretry(&rename_lock, seq))
+	if (need_seqretry(&rename_lock, seq)) {
+		spin_unlock(&this_parent->d_lock);
 		goto rename_retry;
-	if (locked)
-		write_sequnlock(&rename_lock);
-	return 0; /* No mount points found in tree */
-positive:
-	if (!locked && read_seqretry(&rename_lock, seq))
-		goto rename_retry;
-	if (locked)
-		write_sequnlock(&rename_lock);
-	return 1;
+	}
+	if (finish)
+		finish(data);
+
+out_unlock:
+	spin_unlock(&this_parent->d_lock);
+	done_seqretry(&rename_lock, seq);
+	return;
 
 rename_retry:
-	if (locked)
-		goto again;
-	locked = 1;
-	write_seqlock(&rename_lock);
+	if (!retry)
+		return;
+	seq = 1;
 	goto again;
 }
+
+/*
+ * Search for at least 1 mount point in the dentry's subdirs.
+ * We descend to the next level whenever the d_subdirs
+ * list is non-empty and continue searching.
+ */
+
+/**
+ * have_submounts - check for mounts over a dentry
+ * @parent: dentry to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point
+ */
+
+static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
+{
+	int *ret = data;
+	if (d_mountpoint(dentry)) {
+		*ret = 1;
+		return D_WALK_QUIT;
+	}
+	return D_WALK_CONTINUE;
+}
+
+int have_submounts(struct dentry *parent)
+{
+	int ret = 0;
+
+	d_walk(parent, &ret, check_mount, NULL);
+
+	return ret;
+}
 EXPORT_SYMBOL(have_submounts);
 
 /*
+ * Called by mount code to set a mountpoint and check if the mountpoint is
+ * reachable (e.g. NFS can unhash a directory dentry and then the complete
+ * subtree can become unreachable).
+ *
+ * Only one of check_submounts_and_drop() and d_set_mounted() must succeed.  For
+ * this reason take rename_lock and d_lock on dentry and ancestors.
+ */
+int d_set_mounted(struct dentry *dentry)
+{
+	struct dentry *p;
+	int ret = -ENOENT;
+	write_seqlock(&rename_lock);
+	for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
+		/* Need exclusion wrt. check_submounts_and_drop() */
+		spin_lock(&p->d_lock);
+		if (unlikely(d_unhashed(p))) {
+			spin_unlock(&p->d_lock);
+			goto out;
+		}
+		spin_unlock(&p->d_lock);
+	}
+	spin_lock(&dentry->d_lock);
+	if (!d_unlinked(dentry)) {
+		dentry->d_flags |= DCACHE_MOUNTED;
+		ret = 0;
+	}
+	spin_unlock(&dentry->d_lock);
+out:
+	write_sequnlock(&rename_lock);
+	return ret;
+}
+
+/*
  * Search the dentry child list of the specified parent,
  * and move any unused dentries to the end of the unused
  * list for prune_dcache(). We descend to the next level
@@ -1106,93 +1406,51 @@ EXPORT_SYMBOL(have_submounts);
  * drop the lock and return early due to latency
  * constraints.
  */
-static int select_parent(struct dentry *parent, struct list_head *dispose)
-{
-	struct dentry *this_parent;
-	struct list_head *next;
-	unsigned seq;
-	int found = 0;
-	int locked = 0;
-
-	seq = read_seqbegin(&rename_lock);
-again:
-	this_parent = parent;
-	spin_lock(&this_parent->d_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
-		next = tmp->next;
+struct select_data {
+	struct dentry *start;
+	struct list_head dispose;
+	int found;
+};
 
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
+{
+	struct select_data *data = _data;
+	enum d_walk_ret ret = D_WALK_CONTINUE;
 
-		/*
-		 * move only zero ref count dentries to the dispose list.
-		 *
-		 * Those which are presently on the shrink list, being processed
-		 * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
-		 * loop in shrink_dcache_parent() might not make any progress
-		 * and loop forever.
-		 */
-		if (dentry->d_lockref.count) {
-			dentry_lru_del(dentry);
-		} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
-			dentry_lru_move_list(dentry, dispose);
-			dentry->d_flags |= DCACHE_SHRINK_LIST;
-			found++;
-		}
-		/*
-		 * We can return to the caller if we have found some (this
-		 * ensures forward progress). We'll be coming back to find
-		 * the rest.
-		 */
-		if (found && need_resched()) {
-			spin_unlock(&dentry->d_lock);
-			goto out;
-		}
+	if (data->start == dentry)
+		goto out;
 
+	/*
+	 * move only zero ref count dentries to the dispose list.
+	 *
+	 * Those which are presently on the shrink list, being processed
+	 * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
+	 * loop in shrink_dcache_parent() might not make any progress
+	 * and loop forever.
+	 */
+	if (dentry->d_lockref.count) {
+		dentry_lru_del(dentry);
+	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		/*
-		 * Descend a level if the d_subdirs list is non-empty.
+		 * We can't use d_lru_shrink_move() because we
+		 * need to get the global LRU lock and do the
+		 * LRU accounting.
 		 */
-		if (!list_empty(&dentry->d_subdirs)) {
-			spin_unlock(&this_parent->d_lock);
-			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
-			this_parent = dentry;
-			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
-			goto repeat;
-		}
-
-		spin_unlock(&dentry->d_lock);
+		d_lru_del(dentry);
+		d_shrink_add(dentry, &data->dispose);
+		data->found++;
+		ret = D_WALK_NORETRY;
 	}
 	/*
-	 * All done at this level ... ascend and resume the search.
+	 * We can return to the caller if we have found some (this
+	 * ensures forward progress). We'll be coming back to find
+	 * the rest.
 	 */
-	if (this_parent != parent) {
-		struct dentry *child = this_parent;
-		this_parent = try_to_ascend(this_parent, locked, seq);
-		if (!this_parent)
-			goto rename_retry;
-		next = child->d_u.d_child.next;
-		goto resume;
-	}
+	if (data->found && need_resched())
+		ret = D_WALK_QUIT;
 out:
-	spin_unlock(&this_parent->d_lock);
-	if (!locked && read_seqretry(&rename_lock, seq))
-		goto rename_retry;
-	if (locked)
-		write_sequnlock(&rename_lock);
-	return found;
-
-rename_retry:
-	if (found)
-		return found;
-	if (locked)
-		goto again;
-	locked = 1;
-	write_seqlock(&rename_lock);
-	goto again;
+	return ret;
 }
 
 /**
@@ -1201,18 +1459,90 @@ rename_retry:
  *
  * Prune the dcache to remove unused children of the parent dentry.
  */
-void shrink_dcache_parent(struct dentry * parent)
+void shrink_dcache_parent(struct dentry *parent)
 {
-	LIST_HEAD(dispose);
-	int found;
+	for (;;) {
+		struct select_data data;
 
-	while ((found = select_parent(parent, &dispose)) != 0) {
-		shrink_dentry_list(&dispose);
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = parent;
+		data.found = 0;
+
+		d_walk(parent, &data, select_collect, NULL);
+		if (!data.found)
+			break;
+
+		shrink_dentry_list(&data.dispose);
 		cond_resched();
 	}
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
+static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
+{
+	struct select_data *data = _data;
+
+	if (d_mountpoint(dentry)) {
+		data->found = -EBUSY;
+		return D_WALK_QUIT;
+	}
+
+	return select_collect(_data, dentry);
+}
+
+static void check_and_drop(void *_data)
+{
+	struct select_data *data = _data;
+
+	if (d_mountpoint(data->start))
+		data->found = -EBUSY;
+	if (!data->found)
+		__d_drop(data->start);
+}
+
+/**
+ * check_submounts_and_drop - prune dcache, check for submounts and drop
+ *
+ * All done as a single atomic operation relative to has_unlinked_ancestor().
+ * Returns 0 if successfully unhashed @parent.  If there were submounts then
+ * return -EBUSY.
+ *
+ * @dentry: dentry to prune and drop
+ */
+int check_submounts_and_drop(struct dentry *dentry)
+{
+	int ret = 0;
+
+	/* Negative dentries can be dropped without further checks */
+	if (!dentry->d_inode) {
+		d_drop(dentry);
+		goto out;
+	}
+
+	for (;;) {
+		struct select_data data;
+
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = dentry;
+		data.found = 0;
+
+		d_walk(dentry, &data, check_and_collect, check_and_drop);
+		ret = data.found;
+
+		if (!list_empty(&data.dispose))
+			shrink_dentry_list(&data.dispose);
+
+		if (ret <= 0)
+			break;
+
+		cond_resched();
+	}
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(check_submounts_and_drop);
+
 /**
  * __d_alloc	-	allocate a dcache entry
  * @sb: filesystem it will belong to
@@ -1771,7 +2101,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  * without taking d_lock and checking d_seq sequence count against @seq
  * returned here.
  *
- * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * A refcount may be taken on the found dentry with the d_rcu_to_refcount
  * function.
  *
  * Alternatively, __d_lookup_rcu may be called again to look up the child of
@@ -2495,9 +2825,39 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
 	return 0;
 }
 
+/**
+ * prepend_name - prepend a pathname in front of current buffer pointer
+ * @buffer: buffer pointer
+ * @buflen: allocated length of the buffer
+ * @name:   name string and length qstr structure
+ *
+ * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
+ * make sure that either the old or the new name pointer and length are
+ * fetched. However, there may be mismatch between length and pointer.
+ * The length cannot be trusted, we need to copy it byte-by-byte until
+ * the length is reached or a null byte is found. It also prepends "/" at
+ * the beginning of the name. The sequence number check at the caller will
+ * retry it again when a d_move() does happen. So any garbage in the buffer
+ * due to mismatched pointer and length will be discarded.
+ */
 static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 {
-	return prepend(buffer, buflen, name->name, name->len);
+	const char *dname = ACCESS_ONCE(name->name);
+	u32 dlen = ACCESS_ONCE(name->len);
+	char *p;
+
+	if (*buflen < dlen + 1)
+		return -ENAMETOOLONG;
+	*buflen -= dlen + 1;
+	p = *buffer -= dlen + 1;
+	*p++ = '/';
+	while (dlen--) {
+		char c = *dname++;
+		if (!c)
+			break;
+		*p++ = c;
+	}
+	return 0;
 }
 
 /**
@@ -2507,7 +2867,15 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
  *
- * Caller holds the rename_lock.
+ * The function will first try to write out the pathname without taking any
+ * lock other than the RCU read lock to make sure that dentries won't go away.
+ * It only checks the sequence number of the global rename_lock as any change
+ * in the dentry's d_seq will be preceded by changes in the rename_lock
+ * sequence number. If the sequence number had been changed, it will restart
+ * the whole pathname back-tracing sequence again by taking the rename_lock.
+ * In this case, there is no need to take the RCU read lock as the recursive
+ * parent pointer references will keep the dentry chain alive as long as no
+ * rename operation is performed.
  */
 static int prepend_path(const struct path *path,
 			const struct path *root,
@@ -2516,54 +2884,66 @@ static int prepend_path(const struct path *path,
 	struct dentry *dentry = path->dentry;
 	struct vfsmount *vfsmnt = path->mnt;
 	struct mount *mnt = real_mount(vfsmnt);
-	bool slash = false;
 	int error = 0;
+	unsigned seq = 0;
+	char *bptr;
+	int blen;
 
+	rcu_read_lock();
+restart:
+	bptr = *buffer;
+	blen = *buflen;
+	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
 		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
 			/* Global root? */
-			if (!mnt_has_parent(mnt))
-				goto global_root;
-			dentry = mnt->mnt_mountpoint;
-			mnt = mnt->mnt_parent;
-			vfsmnt = &mnt->mnt;
-			continue;
+			if (mnt_has_parent(mnt)) {
+				dentry = mnt->mnt_mountpoint;
+				mnt = mnt->mnt_parent;
+				vfsmnt = &mnt->mnt;
+				continue;
+			}
+			/*
+			 * Filesystems needing to implement special "root names"
+			 * should do so with ->d_dname()
+			 */
+			if (IS_ROOT(dentry) &&
+			    (dentry->d_name.len != 1 ||
+			     dentry->d_name.name[0] != '/')) {
+				WARN(1, "Root dentry has weird name <%.*s>\n",
+				     (int) dentry->d_name.len,
+				     dentry->d_name.name);
+			}
+			if (!error)
+				error = is_mounted(vfsmnt) ? 1 : 2;
+			break;
 		}
 		parent = dentry->d_parent;
 		prefetch(parent);
-		spin_lock(&dentry->d_lock);
-		error = prepend_name(buffer, buflen, &dentry->d_name);
-		spin_unlock(&dentry->d_lock);
-		if (!error)
-			error = prepend(buffer, buflen, "/", 1);
+		error = prepend_name(&bptr, &blen, &dentry->d_name);
 		if (error)
 			break;
 
-		slash = true;
 		dentry = parent;
 	}
+	if (!(seq & 1))
+		rcu_read_unlock();
+	if (need_seqretry(&rename_lock, seq)) {
+		seq = 1;
+		goto restart;
+	}
+	done_seqretry(&rename_lock, seq);
 
-	if (!error && !slash)
-		error = prepend(buffer, buflen, "/", 1);
-
-	return error;
-
-global_root:
-	/*
-	 * Filesystems needing to implement special "root names"
-	 * should do so with ->d_dname()
-	 */
-	if (IS_ROOT(dentry) &&
-	    (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
-		WARN(1, "Root dentry has weird name <%.*s>\n",
-		     (int) dentry->d_name.len, dentry->d_name.name);
-	}
-	if (!slash)
-		error = prepend(buffer, buflen, "/", 1);
-	if (!error)
-		error = is_mounted(vfsmnt) ? 1 : 2;
+	if (error >= 0 && bptr == *buffer) {
+		if (--blen < 0)
+			error = -ENAMETOOLONG;
+		else
+			*--bptr = '/';
+	}
+	*buffer = bptr;
+	*buflen = blen;
 	return error;
 }
 
@@ -2592,9 +2972,7 @@ char *__d_path(const struct path *path,
 
 	prepend(&res, &buflen, "\0", 1);
 	br_read_lock(&vfsmount_lock);
-	write_seqlock(&rename_lock);
 	error = prepend_path(path, root, &res, &buflen);
-	write_sequnlock(&rename_lock);
 	br_read_unlock(&vfsmount_lock);
 
 	if (error < 0)
@@ -2613,9 +2991,7 @@ char *d_absolute_path(const struct path *path,
 
 	prepend(&res, &buflen, "\0", 1);
 	br_read_lock(&vfsmount_lock);
-	write_seqlock(&rename_lock);
 	error = prepend_path(path, &root, &res, &buflen);
-	write_sequnlock(&rename_lock);
 	br_read_unlock(&vfsmount_lock);
 
 	if (error > 1)
@@ -2647,6 +3023,16 @@ static int prepend_unreachable(char **buffer, int *buflen)
 	return prepend(buffer, buflen, "(unreachable)", 13);
 }
 
+static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
+{
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&fs->seq);
+		*root = fs->root;
+	} while (read_seqcount_retry(&fs->seq, seq));
+}
+
 /**
  * d_path - return the path of a dentry
  * @path: path to report
@@ -2679,15 +3065,15 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	if (path->dentry->d_op && path->dentry->d_op->d_dname)
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
-	get_fs_root(current->fs, &root);
+	rcu_read_lock();
+	get_fs_root_rcu(current->fs, &root);
 	br_read_lock(&vfsmount_lock);
-	write_seqlock(&rename_lock);
 	error = path_with_deleted(path, &root, &res, &buflen);
-	write_sequnlock(&rename_lock);
 	br_read_unlock(&vfsmount_lock);
+	rcu_read_unlock();
+
 	if (error < 0)
 		res = ERR_PTR(error);
-	path_put(&root);
 	return res;
 }
 EXPORT_SYMBOL(d_path);
@@ -2718,10 +3104,10 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
 	char *end = buffer + buflen;
 	/* these dentries are never renamed, so d_lock is not needed */
 	if (prepend(&end, &buflen, " (deleted)", 11) ||
-	    prepend_name(&end, &buflen, &dentry->d_name) ||
+	    prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
 	    prepend(&end, &buflen, "/", 1))
 		end = ERR_PTR(-ENAMETOOLONG);
-	return end;
+	return end;
 }
 
 /*
@@ -2729,30 +3115,42 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
  */
 static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
-	char *end = buf + buflen;
-	char *retval;
+	char *end, *retval;
+	int len, seq = 0;
+	int error = 0;
 
-	prepend(&end, &buflen, "\0", 1);
+	rcu_read_lock();
+restart:
+	end = buf + buflen;
+	len = buflen;
+	prepend(&end, &len, "\0", 1);
 	if (buflen < 1)
 		goto Elong;
 	/* Get '/' right */
 	retval = end-1;
 	*retval = '/';
-
+	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (!IS_ROOT(dentry)) {
 		struct dentry *parent = dentry->d_parent;
 		int error;
 
 		prefetch(parent);
-		spin_lock(&dentry->d_lock);
-		error = prepend_name(&end, &buflen, &dentry->d_name);
-		spin_unlock(&dentry->d_lock);
-		if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
-			goto Elong;
+		error = prepend_name(&end, &len, &dentry->d_name);
+		if (error)
+			break;
 
 		retval = end;
 		dentry = parent;
 	}
+	if (!(seq & 1))
+		rcu_read_unlock();
+	if (need_seqretry(&rename_lock, seq)) {
+		seq = 1;
+		goto restart;
+	}
+	done_seqretry(&rename_lock, seq);
+	if (error)
+		goto Elong;
 	return retval;
 Elong:
 	return ERR_PTR(-ENAMETOOLONG);
@@ -2760,13 +3158,7 @@ Elong:
 
 char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
 {
-	char *retval;
-
-	write_seqlock(&rename_lock);
-	retval = __dentry_path(dentry, buf, buflen);
-	write_sequnlock(&rename_lock);
-
-	return retval;
+	return __dentry_path(dentry, buf, buflen);
 }
 EXPORT_SYMBOL(dentry_path_raw);
 
@@ -2775,7 +3167,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 	char *p = NULL;
 	char *retval;
 
-	write_seqlock(&rename_lock);
 	if (d_unlinked(dentry)) {
 		p = buf + buflen;
 		if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2783,7 +3174,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 		buflen++;
 	}
 	retval = __dentry_path(dentry, buf, buflen);
-	write_sequnlock(&rename_lock);
 	if (!IS_ERR(retval) && p)
 		*p = '/';	/* restore '/' overriden with '\0' */
 	return retval;
@@ -2791,6 +3181,18 @@ Elong:
 	return ERR_PTR(-ENAMETOOLONG);
 }
 
+static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
+				    struct path *pwd)
+{
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&fs->seq);
+		*root = fs->root;
+		*pwd = fs->pwd;
+	} while (read_seqcount_retry(&fs->seq, seq));
+}
+
 /*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
@@ -2813,25 +3215,25 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 {
 	int error;
 	struct path pwd, root;
-	char *page = (char *) __get_free_page(GFP_USER);
+	char *page = __getname();
 
 	if (!page)
 		return -ENOMEM;
 
-	get_fs_root_and_pwd(current->fs, &root, &pwd);
+	rcu_read_lock();
+	get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
 
 	error = -ENOENT;
 	br_read_lock(&vfsmount_lock);
-	write_seqlock(&rename_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
-		char *cwd = page + PAGE_SIZE;
-		int buflen = PAGE_SIZE;
+		char *cwd = page + PATH_MAX;
+		int buflen = PATH_MAX;
 
 		prepend(&cwd, &buflen, "\0", 1);
 		error = prepend_path(&pwd, &root, &cwd, &buflen);
-		write_sequnlock(&rename_lock);
 		br_read_unlock(&vfsmount_lock);
+		rcu_read_unlock();
 
 		if (error < 0)
 			goto out;
@@ -2844,21 +3246,19 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 		}
 
 		error = -ERANGE;
-		len = PAGE_SIZE + page - cwd;
+		len = PATH_MAX + page - cwd;
 		if (len <= size) {
 			error = len;
 			if (copy_to_user(buf, cwd, len))
 				error = -EFAULT;
 		}
 	} else {
-		write_sequnlock(&rename_lock);
 		br_read_unlock(&vfsmount_lock);
+		rcu_read_unlock();
 	}
 
 out:
-	path_put(&pwd);
-	path_put(&root);
-	free_page((unsigned long) page);
+	__putname(page);
 	return error;
 }
 
@@ -2904,68 +3304,24 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 	return result;
 }
 
-void d_genocide(struct dentry *root)
+static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
 {
-	struct dentry *this_parent;
-	struct list_head *next;
-	unsigned seq;
-	int locked = 0;
+	struct dentry *root = data;
+	if (dentry != root) {
+		if (d_unhashed(dentry) || !dentry->d_inode)
+			return D_WALK_SKIP;
 
-	seq = read_seqbegin(&rename_lock);
-again:
-	this_parent = root;
-	spin_lock(&this_parent->d_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
-		next = tmp->next;
-
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-		if (d_unhashed(dentry) || !dentry->d_inode) {
-			spin_unlock(&dentry->d_lock);
-			continue;
-		}
-		if (!list_empty(&dentry->d_subdirs)) {
-			spin_unlock(&this_parent->d_lock);
-			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
-			this_parent = dentry;
-			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
-			goto repeat;
-		}
 		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
 			dentry->d_flags |= DCACHE_GENOCIDE;
 			dentry->d_lockref.count--;
 		}
-		spin_unlock(&dentry->d_lock);
 	}
-	if (this_parent != root) {
-		struct dentry *child = this_parent;
-		if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
-			this_parent->d_flags |= DCACHE_GENOCIDE;
-			this_parent->d_lockref.count--;
-		}
-		this_parent = try_to_ascend(this_parent, locked, seq);
-		if (!this_parent)
-			goto rename_retry;
-		next = child->d_u.d_child.next;
-		goto resume;
-	}
-	spin_unlock(&this_parent->d_lock);
-	if (!locked && read_seqretry(&rename_lock, seq))
-		goto rename_retry;
-	if (locked)
-		write_sequnlock(&rename_lock);
-	return;
+	return D_WALK_CONTINUE;
+}
 
-rename_retry:
-	if (locked)
-		goto again;
-	locked = 1;
-	write_seqlock(&rename_lock);
-	goto again;
+void d_genocide(struct dentry *parent)
+{
+	d_walk(parent, parent, d_genocide_kill, NULL);
 }
 
 void d_tmpfile(struct dentry *dentry, struct inode *inode)
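
The even/odd seqlock retry pattern introduced at the top of this diff (read_seqbegin_or_lock(), need_seqretry(), done_seqretry()) is what lets d_walk(), prepend_path() and __dentry_path() run lockless on the first pass and fall back to an exclusive rename_lock pass only on retry. The following is a minimal userspace sketch of the same idiom, not kernel code: the my_* names are illustrative stand-ins, the toy seqlock is built from a mutex and a C11 atomic, and the memory-barrier details of a real seqlock are deliberately elided.

/*
 * Userspace sketch of the even/odd "read_seqbegin_or_lock" idiom.
 * Assumptions: my_* helpers are hypothetical analogues of the kernel
 * primitives; memory ordering is simplified for illustration and is
 * NOT sufficient for a production seqlock.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct my_seqlock {
	atomic_uint sequence;		/* even = quiescent, odd = writer active */
	pthread_mutex_t lock;
};

static unsigned my_read_seqbegin(struct my_seqlock *sl)
{
	unsigned seq;
	while ((seq = atomic_load(&sl->sequence)) & 1)
		;			/* wait out an in-flight writer */
	return seq;
}

static int my_read_seqretry(struct my_seqlock *sl, unsigned seq)
{
	return atomic_load(&sl->sequence) != seq;	/* changed => retry */
}

/* seq even: optimistic lockless pass; seq odd: take the lock instead */
static void my_read_seqbegin_or_lock(struct my_seqlock *sl, int *seq)
{
	if (!(*seq & 1))
		*seq = my_read_seqbegin(sl);
	else
		pthread_mutex_lock(&sl->lock);
}

static int my_need_seqretry(struct my_seqlock *sl, int seq)
{
	return !(seq & 1) && my_read_seqretry(sl, seq);
}

static void my_done_seqretry(struct my_seqlock *sl, int seq)
{
	if (seq & 1)
		pthread_mutex_unlock(&sl->lock);
}

static struct my_seqlock sl = { 0, PTHREAD_MUTEX_INITIALIZER };
static int a, b;		/* the data being snapshotted */

static void reader(void)
{
	int seq = 0;		/* must start even: first pass is lockless */
	int x, y;

	do {
		my_read_seqbegin_or_lock(&sl, &seq);
		x = a;
		y = b;
		if (!my_need_seqretry(&sl, seq))
			break;
		seq = 1;	/* second pass goes odd and takes the lock */
	} while (1);
	my_done_seqretry(&sl, seq);

	printf("consistent snapshot: a=%d b=%d\n", x, y);
}

int main(void)
{
	a = 1;
	b = 2;
	reader();
	return 0;
}

The invariants mirror the kernel helpers exactly: seq must start even so the first pass is optimistic, a failed validation sets seq to 1 so the retry pass holds the lock and cannot fail, and done_seqretry() only unlocks when the locked (odd) path was taken.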
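The new prepend_name() above builds the path right to left and copies byte-by-byte so that a name/length mismatch during a concurrent d_move() can never overrun the buffer; any garbage left behind is discarded when the caller's rename_lock sequence check forces a retry. Below is a standalone sketch of just the buffer arithmetic, with a hypothetical my_prepend_name() and plain C strings standing in for the kernel's qstr and ACCESS_ONCE() handling.

/*
 * Userspace sketch of prepend_name()'s right-to-left buffer scheme.
 * The racing/RCU aspects of the kernel version are omitted; only the
 * pointer arithmetic is shown.
 */
#include <stdio.h>
#include <string.h>

static int my_prepend_name(char **buffer, int *buflen,
			   const char *name, int dlen)
{
	char *p;

	if (*buflen < dlen + 1)
		return -1;		/* kernel returns -ENAMETOOLONG */
	*buflen -= dlen + 1;
	p = *buffer -= dlen + 1;	/* step back over "/name" */
	*p++ = '/';
	while (dlen--) {
		char c = *name++;
		if (!c)			/* stop early if the name is short */
			break;
		*p++ = c;
	}
	return 0;
}

int main(void)
{
	char buf[64];
	int buflen = sizeof(buf) - 1;
	char *end = buf + buflen;
	/* walk leaf -> root, exactly as the dentry loop does */
	const char *components[] = { "file.txt", "docs", "home" };

	*end = '\0';
	for (int i = 0; i < 3; i++)
		if (my_prepend_name(&end, &buflen, components[i],
				    strlen(components[i])))
			return 1;

	printf("%s\n", end);	/* prints /home/docs/file.txt */
	return 0;
}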