10 files changed, 674 insertions, 246 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 33fa3e5d38f..2e994efe12c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -9,23 +9,30 @@ be able to use diff(1).
 
 --------------------------- dentry_operations --------------------------
 prototypes:
-	int (*d_revalidate)(struct dentry *, int);
-	int (*d_hash) (struct dentry *, struct qstr *);
-	int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
+	int (*d_revalidate)(struct dentry *, struct nameidata *);
+	int (*d_hash)(const struct dentry *, const struct inode *,
+			struct qstr *);
+	int (*d_compare)(const struct dentry *, const struct inode *,
+			const struct dentry *, const struct inode *,
+			unsigned int, const char *, const struct qstr *);
 	int (*d_delete)(struct dentry *);
 	void (*d_release)(struct dentry *);
 	void (*d_iput)(struct dentry *, struct inode *);
 	char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
+	struct vfsmount *(*d_automount)(struct path *path);
+	int (*d_manage)(struct dentry *, bool);
 
 locking rules:
-		dcache_lock	rename_lock	->d_lock	may block
-d_revalidate:	no		no		no		yes
-d_hash		no		no		no		yes
-d_compare:	no		yes		no		no 
-d_delete:	yes		no		yes		no
-d_release:	no		no		no		yes
-d_iput:		no		no		no		yes
+		rename_lock	->d_lock	may block	rcu-walk
+d_revalidate:	no		no		yes (ref-walk)	maybe
+d_hash		no		no		no		maybe
+d_compare:	yes		no		no		maybe
+d_delete:	no		yes		no		no
+d_release:	no		no		yes		no
+d_iput:		no		no		yes		no
 d_dname:	no		no		no		no
+d_automount:	no		no		yes		no
+d_manage:	no		no		yes (ref-walk)	maybe
 
 --------------------------- inode_operations --------------------------- 
 prototypes:
@@ -44,8 +51,8 @@ ata *);
 	void * (*follow_link) (struct dentry *, struct nameidata *);
 	void (*put_link) (struct dentry *, struct nameidata *, void *);
 	void (*truncate) (struct inode *);
-	int (*permission) (struct inode *, int, struct nameidata *);
-	int (*check_acl)(struct inode *, int);
+	int (*permission) (struct inode *, int, unsigned int);
+	int (*check_acl)(struct inode *, int, unsigned int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
 	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
@@ -53,7 +60,6 @@ ata *);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
 	void (*truncate_range)(struct inode *, loff_t, loff_t);
-	long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
 
 locking rules:
@@ -73,7 +79,7 @@ follow_link:	no
 put_link:	no
 truncate:	yes		(see below)
 setattr:	yes
-permission:	no
+permission:	no (may not block if called in rcu-walk mode)
 check_acl:	no
 getattr:	no
 setxattr:	yes
@@ -81,7 +87,6 @@ getxattr:	no
 listxattr:	no
 removexattr:	yes
 truncate_range:	yes
-fallocate:	no
 fiemap:		no
 	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
@@ -161,13 +166,11 @@ prototypes:
 	void (*kill_sb) (struct super_block *);
 locking rules:
 		may block
-get_sb		yes
 mount		yes
 kill_sb		yes
 
-->get_sb() returns error or 0 with locked superblock attached to the vfsmount
-(exclusive on ->s_umount).
-->mount() returns ERR_PTR or the root dentry.
+->mount() returns ERR_PTR or the root dentry; its superblock should be locked
+on return.
 ->kill_sb() takes a write-locked superblock, does all shutdown work on it,
 unlocks and drops the reference.
 
@@ -340,7 +343,6 @@ prototypes:
 	int (*fl_grant)(struct file_lock *, struct file_lock *, int);
 	void (*fl_release_private)(struct file_lock *);
 	void (*fl_break)(struct file_lock *); /* break_lease callback */
-	int (*fl_mylease)(struct file_lock *, struct file_lock *);
 	int (*fl_change)(struct file_lock **, int);
 
 locking rules:
@@ -350,7 +352,6 @@ fl_notify:		yes		no
 fl_grant:		no		no
 fl_release_private:	maybe		no
 fl_break:		yes		no
-fl_mylease:		yes		no
 fl_change		yes		no
 
 --------------------------- buffer_head -----------------------------------
@@ -432,6 +433,7 @@ prototypes:
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
 			size_t, unsigned int);
 	int (*setlease)(struct file *, long, struct file_lock **);
+	long (*fallocate)(struct file *, int, loff_t, loff_t);
 };
 
 locking rules:
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt
deleted file mode 100644
index 79334ed5daa..00000000000
--- a/Documentation/filesystems/dentry-locking.txt
+++ /dev/null
@@ -1,174 +0,0 @@
-RCU-based dcache locking model
-==============================
-
-On many workloads, the most common operation on dcache is to look up a
-dentry, given a parent dentry and the name of the child. Typically,
-for every open(), stat() etc., the dentry corresponding to the
-pathname will be looked up by walking the tree starting with the first
-component of the pathname and using that dentry along with the next
-component to look up the next level and so on. Since it is a frequent
-operation for workloads like multiuser environments and web servers,
-it is important to optimize this path.
-
-Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in
-every component during path look-up. Since 2.5.10 onwards, fast-walk
-algorithm changed this by holding the dcache_lock at the beginning and
-walking as many cached path component dentries as possible. This
-significantly decreases the number of acquisition of
-dcache_lock. However it also increases the lock hold time
-significantly and affects performance in large SMP machines. Since
-2.5.62 kernel, dcache has been using a new locking model that uses RCU
-to make dcache look-up lock-free.
-
-The current dcache locking model is not very different from the
-existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock
-protected the hash chain, d_child, d_alias, d_lru lists as well as
-d_inode and several other things like mount look-up. RCU-based changes
-affect only the way the hash chain is protected. For everything else
-the dcache_lock must be taken for both traversing as well as
-updating. The hash chain updates too take the dcache_lock.  The
-significant change is the way d_lookup traverses the hash chain, it
-doesn't acquire the dcache_lock for this and rely on RCU to ensure
-that the dentry has not been *freed*.
-
-
-Dcache locking details
-======================
-
-For many multi-user workloads, open() and stat() on files are very
-frequently occurring operations. Both involve walking of path names to
-find the dentry corresponding to the concerned file. In 2.4 kernel,
-dcache_lock was held during look-up of each path component. Contention
-and cache-line bouncing of this global lock caused significant
-scalability problems. With the introduction of RCU in Linux kernel,
-this was worked around by making the look-up of path components during
-path walking lock-free.
-
-
-Safe lock-free look-up of dcache hash table
-===========================================
-
-Dcache is a complex data structure with the hash table entries also
-linked together in other lists. In 2.4 kernel, dcache_lock protected
-all the lists. We applied RCU only on hash chain walking. The rest of
-the lists are still protected by dcache_lock.  Some of the important
-changes are :
-
-1. The deletion from hash chain is done using hlist_del_rcu() macro
-   which doesn't initialize next pointer of the deleted dentry and
-   this allows us to walk safely lock-free while a deletion is
-   happening.
-
-2. Insertion of a dentry into the hash table is done using
-   hlist_add_head_rcu() which take care of ordering the writes - the
-   writes to the dentry must be visible before the dentry is
-   inserted. This works in conjunction with hlist_for_each_rcu(),
-   which has since been replaced by hlist_for_each_entry_rcu(), while
-   walking the hash chain. The only requirement is that all
-   initialization to the dentry must be done before
-   hlist_add_head_rcu() since we don't have dcache_lock protection
-   while traversing the hash chain. This isn't different from the
-   existing code.
-
-3. The dentry looked up without holding dcache_lock by cannot be
-   returned for walking if it is unhashed. It then may have a NULL
-   d_inode or other bogosity since RCU doesn't protect the other
-   fields in the dentry. We therefore use a flag DCACHE_UNHASHED to
-   indicate unhashed dentries and use this in conjunction with a
-   per-dentry lock (d_lock). Once looked up without the dcache_lock,
-   we acquire the per-dentry lock (d_lock) and check if the dentry is
-   unhashed. If so, the look-up is failed. If not, the reference count
-   of the dentry is increased and the dentry is returned.
-
-4. Once a dentry is looked up, it must be ensured during the path walk
-   for that component it doesn't go away. In pre-2.5.10 code, this was
-   done holding a reference to the dentry. dcache_rcu does the same.
-   In some sense, dcache_rcu path walking looks like the pre-2.5.10
-   version.
-
-5. All dentry hash chain updates must take the dcache_lock as well as
-   the per-dentry lock in that order. dput() does this to ensure that
-   a dentry that has just been looked up in another CPU doesn't get
-   deleted before dget() can be done on it.
-
-6. There are several ways to do reference counting of RCU protected
-   objects. One such example is in ipv4 route cache where deferred
-   freeing (using call_rcu()) is done as soon as the reference count
-   goes to zero. This cannot be done in the case of dentries because
-   tearing down of dentries require blocking (dentry_iput()) which
-   isn't supported from RCU callbacks. Instead, tearing down of
-   dentries happen synchronously in dput(), but actual freeing happens
-   later when RCU grace period is over. This allows safe lock-free
-   walking of the hash chains, but a matched dentry may have been
-   partially torn down. The checking of DCACHE_UNHASHED flag with
-   d_lock held detects such dentries and prevents them from being
-   returned from look-up.
-
-
-Maintaining POSIX rename semantics
-==================================
-
-Since look-up of dentries is lock-free, it can race against a
-concurrent rename operation. For example, during rename of file A to
-B, look-up of either A or B must succeed.  So, if look-up of B happens
-after A has been removed from the hash chain but not added to the new
-hash chain, it may fail.  Also, a comparison while the name is being
-written concurrently by a rename may result in false positive matches
-violating rename semantics.  Issues related to race with rename are
-handled as described below :
-
-1. Look-up can be done in two ways - d_lookup() which is safe from
-   simultaneous renames and __d_lookup() which is not.  If
-   __d_lookup() fails, it must be followed up by a d_lookup() to
-   correctly determine whether a dentry is in the hash table or
-   not. d_lookup() protects look-ups using a sequence lock
-   (rename_lock).
-
-2. The name associated with a dentry (d_name) may be changed if a
-   rename is allowed to happen simultaneously. To avoid memcmp() in
-   __d_lookup() go out of bounds due to a rename and false positive
-   comparison, the name comparison is done while holding the
-   per-dentry lock. This prevents concurrent renames during this
-   operation.
-
-3. Hash table walking during look-up may move to a different bucket as
-   the current dentry is moved to a different bucket due to rename.
-   But we use hlists in dcache hash table and they are
-   null-terminated.  So, even if a dentry moves to a different bucket,
-   hash chain walk will terminate. [with a list_head list, it may not
-   since termination is when the list_head in the original bucket is
-   reached].  Since we redo the d_parent check and compare name while
-   holding d_lock, lock-free look-up will not race against d_move().
-
-4. There can be a theoretical race when a dentry keeps coming back to
-   original bucket due to double moves. Due to this look-up may
-   consider that it has never moved and can end up in a infinite loop.
-   But this is not any worse that theoretical livelocks we already
-   have in the kernel.
-
-
-Important guidelines for filesystem developers related to dcache_rcu
-====================================================================
-
-1. Existing dcache interfaces (pre-2.5.62) exported to filesystem
-   don't change. Only dcache internal implementation changes. However
-   filesystems *must not* delete from the dentry hash chains directly
-   using the list macros like allowed earlier. They must use dcache
-   APIs like d_drop() or __d_drop() depending on the situation.
-
-2. d_flags is now protected by a per-dentry lock (d_lock). All access
-   to d_flags must be protected by it.
-
-3. For a hashed dentry, checking of d_count needs to be protected by
-   d_lock.
-
-
-Papers and other documentation on dcache locking
-================================================
-
-1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124).
-
-2. http://lse.sourceforge.net/locking/dcache/dcache.html
-
-
-
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
index bc0b9cfe095..983e14abe7e 100644
--- a/Documentation/filesystems/nfs/pnfs.txt
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -46,3 +46,10 @@ data server cache
 file driver devices refer to data servers, which are kept in a module
 level cache.  Its reference is held over the lifetime of the deviceid
 pointing to it.
+
+lseg
+----
+lseg maintains an extra reference corresponding to the NFS_LSEG_VALID
+bit which holds it in the pnfs_layout_hdr's list.  When the final lseg
+is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED
+bit is set, preventing any new lsegs from being added.
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index ac2a261c5f7..933bc66ccff 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -457,6 +457,11 @@ ChangeLog
 
 Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
 
+2.1.30:
+	- Fix writev() (it kept writing the first segment over and over again
+	  instead of moving onto subsequent segments).
+	- Fix crash in ntfs_mft_record_alloc() when mapping the new extent mft
+	  record failed.
 2.1.29:
 	- Fix a deadlock when mounting read-write.
 2.1.28:
diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt
new file mode 100644
index 00000000000..eb59c8b44be
--- /dev/null
+++ b/Documentation/filesystems/path-lookup.txt
@@ -0,0 +1,382 @@
+Path walking and name lookup locking
+====================================
+
+Path resolution is the finding a dentry corresponding to a path name string, by
+performing a path walk. Typically, for every open(), stat() etc., the path name
+will be resolved. Paths are resolved by walking the namespace tree, starting
+with the first component of the pathname (eg. root or cwd) with a known dentry,
+then finding the child of that dentry, which is named the next component in the
+path string. Then repeating the lookup from the child dentry and finding its
+child with the next element, and so on.
+
+Since it is a frequent operation for workloads like multiuser environments and
+web servers, it is important to optimize this code.
+
+Path walking synchronisation history:
+Prior to 2.5.10, dcache_lock was acquired in d_lookup (dcache hash lookup) and
+thus in every component during path look-up. Since 2.5.10 onwards, fast-walk
+algorithm changed this by holding the dcache_lock at the beginning and walking
+as many cached path component dentries as possible. This significantly
+decreases the number of acquisition of dcache_lock. However it also increases
+the lock hold time significantly and affects performance in large SMP machines.
+Since 2.5.62 kernel, dcache has been using a new locking model that uses RCU to
+make dcache look-up lock-free.
+
+All the above algorithms required taking a lock and reference count on the
+dentry that was looked up, so that may be used as the basis for walking the
+next path element. This is inefficient and unscalable. It is inefficient
+because of the locks and atomic operations required for every dentry element
+slows things down. It is not scalable because many parallel applications that
+are path-walk intensive tend to do path lookups starting from a common dentry
+(usually, the root "/" or current working directory). So contention on these
+common path elements causes lock and cacheline queueing.
+
+Since 2.6.38, RCU is used to make a significant part of the entire path walk
+(including dcache look-up) completely "store-free" (so, no locks, atomics, or
+even stores into cachelines of common dentries). This is known as "rcu-walk"
+path walking.
+
+Path walking overview
+=====================
+
+A name string specifies a start (root directory, cwd, fd-relative) and a
+sequence of elements (directory entry names), which together refer to a path in
+the namespace. A path is represented as a (dentry, vfsmount) tuple. The name
+elements are sub-strings, seperated by '/'.
+
+Name lookups will want to find a particular path that a name string refers to
+(usually the final element, or parent of final element). This is done by taking
+the path given by the name's starting point (which we know in advance -- eg.
+current->fs->cwd or current->fs->root) as the first parent of the lookup. Then
+iteratively for each subsequent name element, look up the child of the current
+parent with the given name and if it is not the desired entry, make it the
+parent for the next lookup.
+
+A parent, of course, must be a directory, and we must have appropriate
+permissions on the parent inode to be able to walk into it.
+
+Turning the child into a parent for the next lookup requires more checks and
+procedures. Symlinks essentially substitute the symlink name for the target
+name in the name string, and require some recursive path walking.  Mount points
+must be followed into (thus changing the vfsmount that subsequent path elements
+refer to), switching from the mount point path to the root of the particular
+mounted vfsmount. These behaviours are variously modified depending on the
+exact path walking flags.
+
+Path walking then must, broadly, do several particular things:
+- find the start point of the walk;
+- perform permissions and validity checks on inodes;
+- perform dcache hash name lookups on (parent, name element) tuples;
+- traverse mount points;
+- traverse symlinks;
+- lookup and create missing parts of the path on demand.
+
+Safe store-free look-up of dcache hash table
+============================================
+
+Dcache name lookup
+------------------
+In order to lookup a dcache (parent, name) tuple, we take a hash on the tuple
+and use that to select a bucket in the dcache-hash table. The list of entries
+in that bucket is then walked, and we do a full comparison of each entry
+against our (parent, name) tuple.
+
+The hash lists are RCU protected, so list walking is not serialised with
+concurrent updates (insertion, deletion from the hash). This is a standard RCU
+list application with the exception of renames, which will be covered below.
+
+Parent and name members of a dentry, as well as its membership in the dcache
+hash, and its inode are protected by the per-dentry d_lock spinlock. A
+reference is taken on the dentry (while the fields are verified under d_lock),
+and this stabilises its d_inode pointer and actual inode. This gives a stable
+point to perform the next step of our path walk against.
+
+These members are also protected by d_seq seqlock, although this offers
+read-only protection and no durability of results, so care must be taken when
+using d_seq for synchronisation (see seqcount based lookups, below).
+
+Renames
+-------
+Back to the rename case. In usual RCU protected lists, the only operations that
+will happen to an object is insertion, and then eventually removal from the
+list. The object will not be reused until an RCU grace period is complete.
+This ensures the RCU list traversal primitives can run over the object without
+problems (see RCU documentation for how this works).
+
+However when a dentry is renamed, its hash value can change, requiring it to be
+moved to a new hash list. Allocating and inserting a new alias would be
+expensive and also problematic for directory dentries. Latency would be far to
+high to wait for a grace period after removing the dentry and before inserting
+it in the new hash bucket. So what is done is to insert the dentry into the
+new list immediately.
+
+However, when the dentry's list pointers are updated to point to objects in the
+new list before waiting for a grace period, this can result in a concurrent RCU
+lookup of the old list veering off into the new (incorrect) list and missing
+the remaining dentries on the list.
+
+There is no fundamental problem with walking down the wrong list, because the
+dentry comparisons will never match. However it is fatal to miss a matching
+dentry. So a seqlock is used to detect when a rename has occurred, and so the
+lookup can be retried.
+
+         1      2      3
+        +---+  +---+  +---+
+hlist-->| N-+->| N-+->| N-+->
+head <--+-P |<-+-P |<-+-P |
+        +---+  +---+  +---+
+
+Rename of dentry 2 may require it deleted from the above list, and inserted
+into a new list. Deleting 2 gives the following list.
+
+         1             3
+        +---+         +---+     (don't worry, the longer pointers do not
+hlist-->| N-+-------->| N-+->    impose a measurable performance overhead
+head <--+-P |<--------+-P |      on modern CPUs)
+        +---+         +---+
+          ^      2      ^
+          |    +---+    |
+          |    | N-+----+
+          +----+-P |
+               +---+
+
+This is a standard RCU-list deletion, which leaves the deleted object's
+pointers intact, so a concurrent list walker that is currently looking at
+object 2 will correctly continue to object 3 when it is time to traverse the
+next object.
+
+However, when inserting object 2 onto a new list, we end up with this:
+
+         1             3
+        +---+         +---+
+hlist-->| N-+-------->| N-+->
+head <--+-P |<--------+-P |
+        +---+         +---+
+                 2
+               +---+
+               | N-+---->
+          <----+-P |
+               +---+
+
+Because we didn't wait for a grace period, there may be a concurrent lookup
+still at 2. Now when it follows 2's 'next' pointer, it will walk off into
+another list without ever having checked object 3.
+
+A related, but distinctly different, issue is that of rename atomicity versus
+lookup operations. If a file is renamed from 'A' to 'B', a lookup must only
+find either 'A' or 'B'. So if a lookup of 'A' returns NULL, a subsequent lookup
+of 'B' must succeed (note the reverse is not true).
+
+Between deleting the dentry from the old hash list, and inserting it on the new
+hash list, a lookup may find neither 'A' nor 'B' matching the dentry. The same
+rename seqlock is also used to cover this race in much the same way, by
+retrying a negative lookup result if a rename was in progress.
+
+Seqcount based lookups
+----------------------
+In refcount based dcache lookups, d_lock is used to serialise access to
+the dentry, stabilising it while comparing its name and parent and then
+taking a reference count (the reference count then gives a stable place to
+start the next part of the path walk from).
+
+As explained above, we would like to do path walking without taking locks or
+reference counts on intermediate dentries along the path. To do this, a per
+dentry seqlock (d_seq) is used to take a "coherent snapshot" of what the dentry
+looks like (its name, parent, and inode). That snapshot is then used to start
+the next part of the path walk. When loading the coherent snapshot under d_seq,
+care must be taken to load the members up-front, and use those pointers rather
+than reloading from the dentry later on (otherwise we'd have interesting things
+like d_inode going NULL underneath us, if the name was unlinked).
+
+Also important is to avoid performing any destructive operations (pretty much:
+no non-atomic stores to shared data), and to recheck the seqcount when we are
+"done" with the operation. Retry or abort if the seqcount does not match.
+Avoiding destructive or changing operations means we can easily unwind from
+failure.
+
+What this means is that a caller, provided they are holding RCU lock to
+protect the dentry object from disappearing, can perform a seqcount based
+lookup which does not increment the refcount on the dentry or write to
+it in any way. This returned dentry can be used for subsequent operations,
+provided that d_seq is rechecked after that operation is complete.
+
+Inodes are also rcu freed, so the seqcount lookup dentry's inode may also be
+queried for permissions.
+
+With this two parts of the puzzle, we can do path lookups without taking
+locks or refcounts on dentry elements.
+
+RCU-walk path walking design
+============================
+
+Path walking code now has two distinct modes, ref-walk and rcu-walk. ref-walk
+is the traditional[*] way of performing dcache lookups using d_lock to
+serialise concurrent modifications to the dentry and take a reference count on
+it. ref-walk is simple and obvious, and may sleep, take locks, etc while path
+walking is operating on each dentry. rcu-walk uses seqcount based dentry
+lookups, and can perform lookup of intermediate elements without any stores to
+shared data in the dentry or inode. rcu-walk can not be applied to all cases,
+eg. if the filesystem must sleep or perform non trivial operations, rcu-walk
+must be switched to ref-walk mode.
+
+[*] RCU is still used for the dentry hash lookup in ref-walk, but not the full
+    path walk.
+
+Where ref-walk uses a stable, refcounted ``parent'' to walk the remaining
+path string, rcu-walk uses a d_seq protected snapshot. When looking up a
+child of this parent snapshot, we open d_seq critical section on the child
+before closing d_seq critical section on the parent. This gives an interlocking
+ladder of snapshots to walk down.
+
+
+     proc 101
+      /----------------\
+     / comm:    "vi"    \
+    /  fs.root: dentry0  \
+    \  fs.cwd:  dentry2  /
+     \                  /
+      \----------------/
+
+So when vi wants to open("/home/npiggin/test.c", O_RDWR), then it will
+start from current->fs->root, which is a pinned dentry. Alternatively,
+"./test.c" would start from cwd; both names refer to the same path in
+the context of proc101.
+
+     dentry 0
+    +---------------------+   rcu-walk begins here, we note d_seq, check the
+    | name:    "/"        |   inode's permission, and then look up the next
+    | inode:   10         |   path element which is "home"...
+    | children:"home", ...|
+    +---------------------+
+              |
+     dentry 1 V
+    +---------------------+   ... which brings us here. We find dentry1 via
+    | name:    "home"     |   hash lookup, then note d_seq and compare name
+    | inode:   678        |   string and parent pointer. When we have a match,
+    | children:"npiggin"  |   we now recheck the d_seq of dentry0. Then we
+    +---------------------+   check inode and look up the next element.
+              |
+     dentry2  V
+    +---------------------+   Note: if dentry0 is now modified, lookup is
+    | name:    "npiggin"  |   not necessarily invalid, so we need only keep a
+    | inode:   543        |   parent for d_seq verification, and grandparents
+    | children:"a.c", ... |   can be forgotten.
+    +---------------------+
+              |
+     dentry3  V
+    +---------------------+   At this point we have our destination dentry.
+    | name:    "a.c"      |   We now take its d_lock, verify d_seq of this
+    | inode:   14221      |   dentry. If that checks out, we can increment
+    | children:NULL       |   its refcount because we're holding d_lock.
+    +---------------------+
+
+Taking a refcount on a dentry from rcu-walk mode, by taking its d_lock,
+re-checking its d_seq, and then incrementing its refcount is called
+"dropping rcu" or dropping from rcu-walk into ref-walk mode.
+
+It is, in some sense, a bit of a house of cards. If the seqcount check of the
+parent snapshot fails, the house comes down, because we had closed the d_seq
+section on the grandparent, so we have nothing left to stand on. In that case,
+the path walk must be fully restarted (which we do in ref-walk mode, to avoid
+live locks). It is costly to have a full restart, but fortunately they are
+quite rare.
+
+When we reach a point where sleeping is required, or a filesystem callout
+requires ref-walk, then instead of restarting the walk, we attempt to drop rcu
+at the last known good dentry we have. Avoiding a full restart in ref-walk in
+these cases is fundamental for performance and scalability because blocking
+operations such as creates and unlinks are not uncommon.
+
+The detailed design for rcu-walk is like this:
+* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
+* Take the RCU lock for the entire path walk, starting with the acquiring
+  of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
+  not required for dentry persistence.
+* synchronize_rcu is called when unregistering a filesystem, so we can
+  access d_ops and i_ops during rcu-walk.
+* Similarly take the vfsmount lock for the entire path walk. So now mnt
+  refcounts are not required for persistence. Also we are free to perform mount
+  lookups, and to assume dentry mount points and mount roots are stable up and
+  down the path.
+* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
+  so we can load this tuple atomically, and also check whether any of its
+  members have changed.
+* Dentry lookups (based on parent, candidate string tuple) recheck the parent
+  sequence after the child is found in case anything changed in the parent
+  during the path walk.
+* inode is also RCU protected so we can load d_inode and use the inode for
+  limited things.
+* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
+* i_op can be loaded.
+* When the destination dentry is reached, drop rcu there (ie. take d_lock,
+  verify d_seq, increment refcount).
+* If seqlock verification fails anywhere along the path, do a full restart
+  of the path lookup in ref-walk mode. -ECHILD tends to be used (for want of
+  a better errno) to signal an rcu-walk failure.
+
+The cases where rcu-walk cannot continue are:
+* NULL dentry (ie. any uncached path element)
+* Following links
+
+It may be possible eventually to make following links rcu-walk aware.
+
+Uncached path elements will always require dropping to ref-walk mode, at the
+very least because i_mutex needs to be grabbed, and objects allocated.
+
+Final note:
+"store-free" path walking is not strictly store free. We take vfsmount lock
+and refcounts (both of which can be made per-cpu), and we also store to the
+stack (which is essentially CPU-local), and we also have to take locks and
+refcount on final dentry.
+
+The point is that shared data, where practically possible, is not locked
+or stored into. The result is massive improvements in performance and
+scalability of path resolution.
+
+
+Interesting statistics
+======================
+
+The following table gives rcu lookup statistics for a few simple workloads
+(2s12c24t Westmere, debian non-graphical system). Ungraceful are attempts to
+drop rcu that fail due to d_seq failure and requiring the entire path lookup
+again. Other cases are successful rcu-drops that are required before the final
+element, nodentry for missing dentry, revalidate for filesystem revalidate
+routine requiring rcu drop, permission for permission check requiring drop,
+and link for symlink traversal requiring drop.
+
+     rcu-lookups     restart  nodentry          link  revalidate  permission
+bootup     47121           0      4624          1010       10283        7852
+dbench  25386793           0   6778659(26.7%)     55         549        1156
+kbuild   2696672          10     64442(2.3%)  108764(4.0%)     1        1590
+git diff   39605           0        28             2           0         106
+vfstest 24185492        4945    708725(2.9%) 1076136(4.4%)     0        2651
+
+What this shows is that failed rcu-walk lookups, ie. ones that are restarted
+entirely with ref-walk, are quite rare. Even the "vfstest" case which
+specifically has concurrent renames/mkdir/rmdir/ creat/unlink/etc to excercise
+such races is not showing a huge amount of restarts.
+
+Dropping from rcu-walk to ref-walk mean that we have encountered a dentry where
+the reference count needs to be taken for some reason. This is either because
+we have reached the target of the path walk, or because we have encountered a
+condition that can't be resolved in rcu-walk mode.  Ideally, we drop rcu-walk
+only when we have reached the target dentry, so the other statistics show where
+this does not happen.
+
+Note that a graceful drop from rcu-walk mode due to something such as the
+dentry not existing (which can be common) is not necessarily a failure of
+rcu-walk scheme, because some elements of the path may have been walked in
+rcu-walk mode. The further we get from common path elements (such as cwd or
+root), the less contended the dentry is likely to be. The closer we are to
+common path elements, the more likely they will exist in dentry cache.
+
+
+Papers and other documentation on dcache locking
+================================================
+
+1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124).
+
+2. http://lse.sourceforge.net/locking/dcache/dcache.html
+
+
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index b12c8953868..0c986c9e851 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -216,7 +216,6 @@ had ->revalidate()) add calls in ->follow_link()/->readlink().
 ->d_parent changes are not protected by BKL anymore.  Read access is safe
 if at least one of the following is true:
 	* filesystem has no cross-directory rename()
-	* dcache_lock is held
 	* we know that parent had been locked (e.g. we are looking at
 ->d_parent of ->lookup() argument).
 	* we are called from ->rename().
@@ -318,3 +317,87 @@ if it's zero is not *and* *never* *had* *been* enough.  Final unlink() and iput(
 may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly
 free the on-disk inode, you may end up doing that while ->write_inode() is writing
 to it.
+
+---
+[mandatory]
+
+	.d_delete() now only advises the dcache as to whether or not to cache
+unreferenced dentries, and is now only called when the dentry refcount goes to
+0. Even on 0 refcount transition, it must be able to tolerate being called 0,
+1, or more times (eg. constant, idempotent).
+
+---
+[mandatory]
+
+	.d_compare() calling convention and locking rules are significantly
+changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
+look at examples of other filesystems) for guidance.
+
+---
+[mandatory]
+
+	.d_hash() calling convention and locking rules are significantly
+changed. Read updated documentation in Documentation/filesystems/vfs.txt (and
+look at examples of other filesystems) for guidance.
+
+---
+[mandatory]
+	dcache_lock is gone, replaced by fine grained locks. See fs/dcache.c
+for details of what locks to replace dcache_lock with in order to protect
+particular things. Most of the time, a filesystem only needs ->d_lock, which
+protects *all* the dcache state of a given dentry.
+
+--
+[mandatory]
+
+	Filesystems must RCU-free their inodes, if they can have been accessed
+via rcu-walk path walk (basically, if the file can have had a path name in the
+vfs namespace).
+
+	i_dentry and i_rcu share storage in a union, and the vfs expects
+i_dentry to be reinitialized before it is freed, so an:
+
+  INIT_LIST_HEAD(&inode->i_dentry);
+
+must be done in the RCU callback.
+
+--
+[recommended]
+	vfs now tries to do path walking in "rcu-walk mode", which avoids
+atomic operations and scalability hazards on dentries and inodes (see
+Documentation/filesystems/path-lookup.txt). d_hash and d_compare changes
+(above) are examples of the changes required to support this. For more complex
+filesystem callbacks, the vfs drops out of rcu-walk mode before the fs call, so
+no changes are required to the filesystem. However, this is costly and loses
+the benefits of rcu-walk mode. We will begin to add filesystem callbacks that
+are rcu-walk aware, shown below. Filesystems should take advantage of this
+where possible.
+
+--
+[mandatory]
+	d_revalidate is a callback that is made on every path element (if
+the filesystem provides it), which requires dropping out of rcu-walk mode. This
+may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be
+returned if the filesystem cannot handle rcu-walk. See
+Documentation/filesystems/vfs.txt for more details.
+
+	permission and check_acl are inode permission checks that are called
+on many or all directory inodes on the way down a path walk (to check for
+exec permission). These must now be rcu-walk aware (flags & IPERM_FLAG_RCU).
+See Documentation/filesystems/vfs.txt for more details.
+ 
+--
+[mandatory]
+	In ->fallocate() you must check the mode option passed in.  If your
+filesystem does not support hole punching (deallocating space in the middle of a
+file) you must return -EOPNOTSUPP if FALLOC_FL_PUNCH_HOLE is set in mode.
+Currently you can only have FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE set,
+so the i_size should not change when hole punching, even when puching the end of
+a file off.
+
+--
+[mandatory]
+	->get_sb() is gone.  Switch to use of ->mount().  Typically it's just
+a matter of switching from calling get_sb_... to mount_... and changing the
+function type.  If you were doing it manually, just switch from setting ->mnt_root
+to some pointer to returning that pointer.  On errors return ERR_PTR(...).
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index e73df2722ff..23cae6548d3 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -375,6 +375,7 @@ Anonymous:             0 kB
 Swap:                  0 kB
 KernelPageSize:        4 kB
 MMUPageSize:           4 kB
+Locked:              374 kB
 
 The first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
@@ -670,6 +671,8 @@ varies by architecture and compile options.  The following is from a
 
 > cat /proc/meminfo
 
+The "Locked" indicates whether the mapping is locked in memory or not.
+
 
 MemTotal:     16344972 kB
 MemFree:      13634064 kB
@@ -1181,6 +1184,30 @@ Table 1-12: Files in /proc/fs/ext4/<devname>
  mb_groups       details of multiblock allocator buddy cache of free blocks
 ..............................................................................
 
+2.0 /proc/consoles
+------------------
+Shows registered system console lines.
+
+To see which character device lines are currently used for the system console
+/dev/console, you may simply look into the file /proc/consoles:
+
+  > cat /proc/consoles
+  tty0                 -WU (ECp)       4:7
+  ttyS0                -W- (Ep)        4:64
+
+The columns are:
+
+  device               name of the device
+  operations           R = can do read operations
+                       W = can do write operations
+                       U = can do unblank
+  flags                E = it is enabled
+                       C = it is prefered console
+                       B = it is primary boot console
+                       p = it is used for printk buffer
+                       b = it is not a TTY but a Braille device
+                       a = it is safe to use when cpu is offline
+  major:minor          major and minor number of the device separated by a colon
 
 ------------------------------------------------------------------------------
 Summary
@@ -1296,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj.
 Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
 other with its scaled value.
 
+The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
+value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
+requires CAP_SYS_RESOURCE.
+
 NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
 Documentation/feature-removal-schedule.txt.
 
diff --git a/Documentation/filesystems/romfs.txt b/Documentation/filesystems/romfs.txt
index 2d2a7b2a16b..e2b07cc9120 100644
--- a/Documentation/filesystems/romfs.txt
+++ b/Documentation/filesystems/romfs.txt
@@ -17,8 +17,7 @@ comparison, an actual rescue disk used up 3202 blocks with ext2, while
 with romfs, it needed 3079 blocks.
 
 To create such a file system, you'll need a user program named
-genromfs.  It is available via anonymous ftp on sunsite.unc.edu and
-its mirrors, in the /pub/Linux/system/recovery/ directory.
+genromfs. It is available on http://romfs.sourceforge.net/
 
 As the name suggests, romfs could be also used (space-efficiently) on
 various read-only media, like (E)EPROM disks if someone will have the
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 5d1335faec2..f806e50aaa6 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -39,10 +39,12 @@ userspace. Top-level directories in sysfs represent the common
 ancestors of object hierarchies; i.e. the subsystems the objects
 belong to. 
 
-Sysfs internally stores the kobject that owns the directory in the
-->d_fsdata pointer of the directory's dentry. This allows sysfs to do
-reference counting directly on the kobject when the file is opened and
-closed. 
+Sysfs internally stores a pointer to the kobject that implements a
+directory in the sysfs_dirent object associated with the directory. In
+the past this kobject pointer has been used by sysfs to do reference
+counting directly on the kobject whenever the file is opened or closed.
+With the current sysfs implementation the kobject reference count is
+only modified directly by the function sysfs_schedule_callback().
 
 
 Attributes
@@ -208,9 +210,9 @@ Other notes:
   is 4096. 
 
 - show() methods should return the number of bytes printed into the
-  buffer. This is the return value of snprintf().
+  buffer. This is the return value of scnprintf().
 
-- show() should always use snprintf(). 
+- show() should always use scnprintf().
 
 - store() should return the number of bytes used from the buffer. If the
   entire buffer has been used, just return the count argument.
@@ -229,7 +231,7 @@ A very simple (and naive) implementation of a device attribute is:
 static ssize_t show_name(struct device *dev, struct device_attribute *attr,
                          char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%s\n", dev->name);
+	return scnprintf(buf, PAGE_SIZE, "%s\n", dev->name);
 }
 
 static ssize_t store_name(struct device *dev, struct device_attribute *attr,
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 20899e095e7..ef0714aa8e4 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -95,10 +95,11 @@ functions:
    extern int unregister_filesystem(struct file_system_type *);
 
 The passed struct file_system_type describes your filesystem. When a
-request is made to mount a device onto a directory in your filespace,
-the VFS will call the appropriate get_sb() method for the specific
-filesystem. The dentry for the mount point will then be updated to
-point to the root inode for the new filesystem.
+request is made to mount a filesystem onto a directory in your namespace,
+the VFS will call the appropriate mount() method for the specific
+filesystem.  New vfsmount refering to the tree returned by ->mount()
+will be attached to the mountpoint, so that when pathname resolution
+reaches the mountpoint it will jump into the root of that vfsmount.
 
 You can see all filesystems that are registered to the kernel in the
 file /proc/filesystems.
@@ -107,14 +108,14 @@ file /proc/filesystems.
 struct file_system_type
 -----------------------
 
-This describes the filesystem. As of kernel 2.6.22, the following
+This describes the filesystem. As of kernel 2.6.39, the following
 members are defined:
 
 struct file_system_type {
 	const char *name;
 	int fs_flags;
-        int (*get_sb) (struct file_system_type *, int,
-                       const char *, void *, struct vfsmount *);
+        struct dentry (*mount) (struct file_system_type *, int,
+                       const char *, void *);
         void (*kill_sb) (struct super_block *);
         struct module *owner;
         struct file_system_type * next;
@@ -128,11 +129,11 @@ struct file_system_type {
 
   fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.)
 
-  get_sb: the method to call when a new instance of this
+  mount: the method to call when a new instance of this
 	filesystem should be mounted
 
   kill_sb: the method to call when an instance of this filesystem
-	should be unmounted
+	should be shut down
 
   owner: for internal VFS use: you should initialize this to THIS_MODULE in
   	most cases.
@@ -141,7 +142,7 @@ struct file_system_type {
 
   s_lock_key, s_umount_key: lockdep-specific
 
-The get_sb() method has the following arguments:
+The mount() method has the following arguments:
 
   struct file_system_type *fs_type: describes the filesystem, partly initialized
   	by the specific filesystem code
@@ -153,32 +154,39 @@ The get_sb() method has the following arguments:
   void *data: arbitrary mount options, usually comes as an ASCII
 	string (see "Mount Options" section)
 
-  struct vfsmount *mnt: a vfs-internal representation of a mount point
+The mount() method must return the root dentry of the tree requested by
+caller.  An active reference to its superblock must be grabbed and the
+superblock must be locked.  On failure it should return ERR_PTR(error).
 
-The get_sb() method must determine if the block device specified
-in the dev_name and fs_type contains a filesystem of the type the method
-supports. If it succeeds in opening the named block device, it initializes a
-struct super_block descriptor for the filesystem contained by the block device.
-On failure it returns an error.
+The arguments match those of mount(2) and their interpretation
+depends on filesystem type.  E.g. for block filesystems, dev_name is
+interpreted as block device name, that device is opened and if it
+contains a suitable filesystem image the method creates and initializes
+struct super_block accordingly, returning its root dentry to caller.
+
+->mount() may choose to return a subtree of existing filesystem - it
+doesn't have to create a new one.  The main result from the caller's
+point of view is a reference to dentry at the root of (sub)tree to
+be attached; creation of new superblock is a common side effect.
 
 The most interesting member of the superblock structure that the
-get_sb() method fills in is the "s_op" field. This is a pointer to
+mount() method fills in is the "s_op" field. This is a pointer to
 a "struct super_operations" which describes the next level of the
 filesystem implementation.
 
-Usually, a filesystem uses one of the generic get_sb() implementations
-and provides a fill_super() method instead. The generic methods are:
+Usually, a filesystem uses one of the generic mount() implementations
+and provides a fill_super() callback instead. The generic variants are:
 
-  get_sb_bdev: mount a filesystem residing on a block device
+  mount_bdev: mount a filesystem residing on a block device
 
-  get_sb_nodev: mount a filesystem that is not backed by a device
+  mount_nodev: mount a filesystem that is not backed by a device
 
-  get_sb_single: mount a filesystem which shares the instance between
+  mount_single: mount a filesystem which shares the instance between
   	all mounts
 
-A fill_super() method implementation has the following arguments:
+A fill_super() callback implementation has the following arguments:
 
-  struct super_block *sb: the superblock structure. The method fill_super()
+  struct super_block *sb: the superblock structure. The callback
   	must initialize this properly.
 
   void *data: arbitrary mount options, usually comes as an ASCII
@@ -325,7 +333,8 @@ struct inode_operations {
         void * (*follow_link) (struct dentry *, struct nameidata *);
         void (*put_link) (struct dentry *, struct nameidata *, void *);
 	void (*truncate) (struct inode *);
-	int (*permission) (struct inode *, int, struct nameidata *);
+	int (*permission) (struct inode *, int, unsigned int);
+	int (*check_acl)(struct inode *, int, unsigned int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
 	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
@@ -414,6 +423,13 @@ otherwise noted.
   permission: called by the VFS to check for access rights on a POSIX-like
   	filesystem.
 
+	May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk
+        mode, the filesystem must check the permission without blocking or
+	storing to the inode.
+
+	If a situation is encountered that rcu-walk cannot handle, return
+	-ECHILD and it will be called again in ref-walk mode.
+
   setattr: called by the VFS to set attributes for a file. This method
   	is called by chmod(2) and related system calls.
 
@@ -847,12 +863,17 @@ defined:
 
 struct dentry_operations {
 	int (*d_revalidate)(struct dentry *, struct nameidata *);
-	int (*d_hash) (struct dentry *, struct qstr *);
-	int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
-	int (*d_delete)(struct dentry *);
+	int (*d_hash)(const struct dentry *, const struct inode *,
+			struct qstr *);
+	int (*d_compare)(const struct dentry *, const struct inode *,
+			const struct dentry *, const struct inode *,
+			unsigned int, const char *, const struct qstr *);
+	int (*d_delete)(const struct dentry *);
 	void (*d_release)(struct dentry *);
 	void (*d_iput)(struct dentry *, struct inode *);
 	char *(*d_dname)(struct dentry *, char *, int);
+	struct vfsmount *(*d_automount)(struct path *);
+	int (*d_manage)(struct dentry *, bool, bool);
 };
 
   d_revalidate: called when the VFS needs to revalidate a dentry. This
@@ -860,13 +881,45 @@ struct dentry_operations {
 	dcache. Most filesystems leave this as NULL, because all their
 	dentries in the dcache are valid
 
-  d_hash: called when the VFS adds a dentry to the hash table
+	d_revalidate may be called in rcu-walk mode (nd->flags & LOOKUP_RCU).
+	If in rcu-walk mode, the filesystem must revalidate the dentry without
+	blocking or storing to the dentry, d_parent and d_inode should not be
+	used without care (because they can go NULL), instead nd->inode should
+	be used.
+
+	If a situation is encountered that rcu-walk cannot handle, return
+	-ECHILD and it will be called again in ref-walk mode.
+
+  d_hash: called when the VFS adds a dentry to the hash table. The first
+	dentry passed to d_hash is the parent directory that the name is
+	to be hashed into. The inode is the dentry's inode.
 
-  d_compare: called when a dentry should be compared with another
+	Same locking and synchronisation rules as d_compare regarding
+	what is safe to dereference etc.
 
-  d_delete: called when the last reference to a dentry is
-	deleted. This means no-one is using the dentry, however it is
-	still valid and in the dcache
+  d_compare: called to compare a dentry name with a given name. The first
+	dentry is the parent of the dentry to be compared, the second is
+	the parent's inode, then the dentry and inode (may be NULL) of the
+	child dentry. len and name string are properties of the dentry to be
+	compared. qstr is the name to compare it with.
+
+	Must be constant and idempotent, and should not take locks if
+	possible, and should not or store into the dentry or inodes.
+	Should not dereference pointers outside the dentry or inodes without
+	lots of care (eg.  d_parent, d_inode, d_name should not be used).
+
+	However, our vfsmount is pinned, and RCU held, so the dentries and
+	inodes won't disappear, neither will our sb or filesystem module.
+	->i_sb and ->d_sb may be used.
+
+	It is a tricky calling convention because it needs to be called under
+	"rcu-walk", ie. without any locks or references on things.
+
+  d_delete: called when the last reference to a dentry is dropped and the
+	dcache is deciding whether or not to cache it. Return 1 to delete
+	immediately, or 0 to cache the dentry. Default is NULL which means to
+	always cache a reachable dentry. d_delete must be constant and
+	idempotent.
 
   d_release: called when a dentry is really deallocated
 
@@ -887,6 +940,47 @@ struct dentry_operations {
 	at the end of the buffer, and returns a pointer to the first char.
 	dynamic_dname() helper function is provided to take care of this.
 
+  d_automount: called when an automount dentry is to be traversed (optional).
+	This should create a new VFS mount record and return the record to the
+	caller.  The caller is supplied with a path parameter giving the
+	automount directory to describe the automount target and the parent
+	VFS mount record to provide inheritable mount parameters.  NULL should
+	be returned if someone else managed to make the automount first.  If
+	the vfsmount creation failed, then an error code should be returned.
+	If -EISDIR is returned, then the directory will be treated as an
+	ordinary directory and returned to pathwalk to continue walking.
+
+	If a vfsmount is returned, the caller will attempt to mount it on the
+	mountpoint and will remove the vfsmount from its expiration list in
+	the case of failure.  The vfsmount should be returned with 2 refs on
+	it to prevent automatic expiration - the caller will clean up the
+	additional ref.
+
+	This function is only used if DCACHE_NEED_AUTOMOUNT is set on the
+	dentry.  This is set by __d_instantiate() if S_AUTOMOUNT is set on the
+	inode being added.
+
+  d_manage: called to allow the filesystem to manage the transition from a
+	dentry (optional).  This allows autofs, for example, to hold up clients
+	waiting to explore behind a 'mountpoint' whilst letting the daemon go
+	past and construct the subtree there.  0 should be returned to let the
+	calling process continue.  -EISDIR can be returned to tell pathwalk to
+	use this directory as an ordinary directory and to ignore anything
+	mounted on it and not to check the automount flag.  Any other error
+	code will abort pathwalk completely.
+
+	If the 'mounting_here' parameter is true, then namespace_sem is being
+	held by the caller and the function should not initiate any mounts or
+	unmounts that it will then wait for.
+
+	If the 'rcu_walk' parameter is true, then the caller is doing a
+	pathwalk in RCU-walk mode.  Sleeping is not permitted in this mode,
+	and the caller can be asked to leave it and call again by returing
+	-ECHILD.
+
+	This function is only used if DCACHE_MANAGE_TRANSIT is set on the
+	dentry being transited from.
+
 Example :
 
 static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
@@ -910,14 +1004,11 @@ manipulate dentries:
 	the usage count)
 
   dput: close a handle for a dentry (decrements the usage count). If
-	the usage count drops to 0, the "d_delete" method is called
-	and the dentry is placed on the unused list if the dentry is
-	still in its parents hash list. Putting the dentry on the
-	unused list just means that if the system needs some RAM, it
-	goes through the unused list of dentries and deallocates them.
-	If the dentry has already been unhashed and the usage count
-	drops to 0, in this case the dentry is deallocated after the
-	"d_delete" method is called
+	the usage count drops to 0, and the dentry is still in its
+	parent's hash, the "d_delete" method is called to check whether
+	it should be cached. If it should not be cached, or if the dentry
+	is not hashed, it is deleted. Otherwise cached dentries are put
+	into an LRU list to be reclaimed on memory shortage.
 
   d_drop: this unhashes a dentry from its parents hash list. A
 	subsequent call to dput() will deallocate the dentry if its