summaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/.gitignore1
-rw-r--r--Documentation/filesystems/Locking19
-rw-r--r--Documentation/filesystems/Makefile3
-rw-r--r--Documentation/filesystems/autofs4.txt520
-rw-r--r--Documentation/filesystems/cifs/AUTHORS1
-rw-r--r--Documentation/filesystems/cifs/TODO97
-rw-r--r--Documentation/filesystems/f2fs.txt13
-rw-r--r--Documentation/filesystems/nfs/Exporting38
-rw-r--r--Documentation/filesystems/nfs/nfs-rdma.txt16
-rw-r--r--Documentation/filesystems/ntfs.txt268
-rw-r--r--Documentation/filesystems/overlayfs.txt198
-rw-r--r--Documentation/filesystems/seq_file.txt33
-rw-r--r--Documentation/filesystems/vfs.txt19
13 files changed, 859 insertions, 367 deletions
diff --git a/Documentation/filesystems/.gitignore b/Documentation/filesystems/.gitignore
new file mode 100644
index 00000000000..31d6e426b6d
--- /dev/null
+++ b/Documentation/filesystems/.gitignore
@@ -0,0 +1 @@
+dnotify_test
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index b18dd177902..b30753cbf43 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -67,6 +67,7 @@ prototypes:
struct file *, unsigned open_flag,
umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+ int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
locking rules:
all may block
@@ -96,6 +97,7 @@ fiemap: no
update_time: no
atomic_open: yes
tmpfile: no
+dentry_open: no
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
@@ -349,7 +351,11 @@ prototypes:
locking rules:
inode->i_lock may block
fl_copy_lock: yes no
-fl_release_private: maybe no
+fl_release_private: maybe maybe[1]
+
+[1]: ->fl_release_private for flock or POSIX locks is currently allowed
+to block. Leases however can still be freed while the i_lock is held and
+so fl_release_private called on a lease should not block.
----------------------- lock_manager_operations ---------------------------
prototypes:
@@ -460,15 +466,12 @@ prototypes:
size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
size_t, unsigned int);
- int (*setlease)(struct file *, long, struct file_lock **);
+ int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *, int, loff_t, loff_t);
};
locking rules:
- All may block except for ->setlease.
- No VFS locks held on entry except for ->setlease.
-
-->setlease has the file_list_lock held and must not sleep.
+ All may block.
->llseek() locking has moved from llseek to the individual llseek
implementations. If your fs is not using generic_file_llseek, you
@@ -492,6 +495,10 @@ components. And there are other reasons why the current interface is a mess...
->read on directories probably must go away - we should just enforce -EISDIR
in sys_read() and friends.
+->setlease operations should call generic_setlease() before or after setting
+the lease within the individual filesystem to record the result of the
+operation
+
--------------------------- dquot_operations -------------------------------
prototypes:
int (*write_dquot) (struct dquot *);
diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile
index a5dd114da14..13483d192eb 100644
--- a/Documentation/filesystems/Makefile
+++ b/Documentation/filesystems/Makefile
@@ -1,5 +1,4 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
+subdir-y := configfs
# List of programs to build
hostprogs-y := dnotify_test
diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt
new file mode 100644
index 00000000000..39d02e19fb6
--- /dev/null
+++ b/Documentation/filesystems/autofs4.txt
@@ -0,0 +1,520 @@
+<head>
+<style> p { max-width:50em} ol, ul {max-width: 40em}</style>
+</head>
+
+autofs - how it works
+=====================
+
+Purpose
+-------
+
+The goal of autofs is to provide on-demand mounting and race free
+automatic unmounting of various other filesystems. This provides two
+key advantages:
+
+1. There is no need to delay boot until all filesystems that
+ might be needed are mounted. Processes that try to access those
+ slow filesystems might be delayed but other processes can
+ continue freely. This is particularly important for
+ network filesystems (e.g. NFS) or filesystems stored on
+ media with a media-changing robot.
+
+2. The names and locations of filesystems can be stored in
+ a remote database and can change at any time. The content
+ in that data base at the time of access will be used to provide
+ a target for the access. The interpretation of names in the
+ filesystem can even be programmatic rather than database-backed,
+ allowing wildcards for example, and can vary based on the user who
+ first accessed a name.
+
+Context
+-------
+
+The "autofs4" filesystem module is only one part of an autofs system.
+There also needs to be a user-space program which looks up names
+and mounts filesystems. This will often be the "automount" program,
+though other tools including "systemd" can make use of "autofs4".
+This document describes only the kernel module and the interactions
+required with any user-space program. Subsequent text refers to this
+as the "automount daemon" or simply "the daemon".
+
+"autofs4" is a Linux kernel module with provides the "autofs"
+filesystem type. Several "autofs" filesystems can be mounted and they
+can each be managed separately, or all managed by the same daemon.
+
+Content
+-------
+
+An autofs filesystem can contain 3 sorts of objects: directories,
+symbolic links and mount traps. Mount traps are directories with
+extra properties as described in the next section.
+
+Objects can only be created by the automount daemon: symlinks are
+created with a regular `symlink` system call, while directories and
+mount traps are created with `mkdir`. The determination of whether a
+directory should be a mount trap or not is quite _ad hoc_, largely for
+historical reasons, and is determined in part by the
+*direct*/*indirect*/*offset* mount options, and the *maxproto* mount option.
+
+If neither the *direct* or *offset* mount options are given (so the
+mount is considered to be *indirect*), then the root directory is
+always a regular directory, otherwise it is a mount trap when it is
+empty and a regular directory when not empty. Note that *direct* and
+*offset* are treated identically so a concise summary is that the root
+directory is a mount trap only if the filesystem is mounted *direct*
+and the root is empty.
+
+Directories created in the root directory are mount traps only if the
+filesystem is mounted *indirect* and they are empty.
+
+Directories further down the tree depend on the *maxproto* mount
+option and particularly whether it is less than five or not.
+When *maxproto* is five, no directories further down the
+tree are ever mount traps, they are always regular directories. When
+the *maxproto* is four (or three), these directories are mount traps
+precisely when they are empty.
+
+So: non-empty (i.e. non-leaf) directories are never mount traps. Empty
+directories are sometimes mount traps, and sometimes not depending on
+where in the tree they are (root, top level, or lower), the *maxproto*,
+and whether the mount was *indirect* or not.
+
+Mount Traps
+---------------
+
+A core element of the implementation of autofs is the Mount Traps
+which are provided by the Linux VFS. Any directory provided by a
+filesystem can be designated as a trap. This involves two separate
+features that work together to allow autofs to do its job.
+
+**DCACHE_NEED_AUTOMOUNT**
+
+If a dentry has the DCACHE_NEED_AUTOMOUNT flag set (which gets set if
+the inode has S_AUTOMOUNT set, or can be set directly) then it is
+(potentially) a mount trap. Any access to this directory beyond a
+"`stat`" will (normally) cause the `d_op->d_automount()` dentry operation
+to be called. The task of this method is to find the filesystem that
+should be mounted on the directory and to return it. The VFS is
+responsible for actually mounting the root of this filesystem on the
+directory.
+
+autofs doesn't find the filesystem itself but sends a message to the
+automount daemon asking it to find and mount the filesystem. The
+autofs `d_automount` method then waits for the daemon to report that
+everything is ready. It will then return "`NULL`" indicating that the
+mount has already happened. The VFS doesn't try to mount anything but
+follows down the mount that is already there.
+
+This functionality is sufficient for some users of mount traps such
+as NFS which creates traps so that mountpoints on the server can be
+reflected on the client. However it is not sufficient for autofs. As
+mounting onto a directory is considered to be "beyond a `stat`", the
+automount daemon would not be able to mount a filesystem on the 'trap'
+directory without some way to avoid getting caught in the trap. For
+that purpose there is another flag.
+
+**DCACHE_MANAGE_TRANSIT**
+
+If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but
+related behaviors are invoked, both using the `d_op->d_manage()`
+dentry operation.
+
+Firstly, before checking to see if any filesystem is mounted on the
+directory, d_manage() will be called with the `rcu_walk` parameter set
+to `false`. It may return one of three things:
+
+- A return value of zero indicates that there is nothing special
+ about this dentry and normal checks for mounts and automounts
+ should proceed.
+
+ autofs normally returns zero, but first waits for any
+ expiry (automatic unmounting of the mounted filesystem) to
+ complete. This avoids races.
+
+- A return value of `-EISDIR` tells the VFS to ignore any mounts
+ on the directory and to not consider calling `->d_automount()`.
+ This effectively disables the **DCACHE_NEED_AUTOMOUNT** flag
+ causing the directory not be a mount trap after all.
+
+ autofs returns this if it detects that the process performing the
+ lookup is the automount daemon and that the mount has been
+ requested but has not yet completed. How it determines this is
+ discussed later. This allows the automount daemon not to get
+ caught in the mount trap.
+
+ There is a subtlety here. It is possible that a second autofs
+ filesystem can be mounted below the first and for both of them to
+ be managed by the same daemon. For the daemon to be able to mount
+ something on the second it must be able to "walk" down past the
+ first. This means that d_manage cannot *always* return -EISDIR for
+ the automount daemon. It must only return it when a mount has
+ been requested, but has not yet completed.
+
+ `d_manage` also returns `-EISDIR` if the dentry shouldn't be a
+ mount trap, either because it is a symbolic link or because it is
+ not empty.
+
+- Any other negative value is treated as an error and returned
+ to the caller.
+
+ autofs can return
+
+ - -ENOENT if the automount daemon failed to mount anything,
+ - -ENOMEM if it ran out of memory,
+ - -EINTR if a signal arrived while waiting for expiry to
+ complete
+ - or any other error sent down by the automount daemon.
+
+
+The second use case only occurs during an "RCU-walk" and so `rcu_walk`
+will be set.
+
+An RCU-walk is a fast and lightweight process for walking down a
+filename path (i.e. it is like running on tip-toes). RCU-walk cannot
+cope with all situations so when it finds a difficulty it falls back
+to "REF-walk", which is slower but more robust.
+
+RCU-walk will never call `->d_automount`; the filesystems must already
+be mounted or RCU-walk cannot handle the path.
+To determine if a mount-trap is safe for RCU-walk mode it calls
+`->d_manage()` with `rcu_walk` set to `true`.
+
+In this case `d_manage()` must avoid blocking and should avoid taking
+spinlocks if at all possible. Its sole purpose is to determine if it
+would be safe to follow down into any mounted directory and the only
+reason that it might not be is if an expiry of the mount is
+underway.
+
+In the `rcu_walk` case, `d_manage()` cannot return -EISDIR to tell the
+VFS that this is a directory that doesn't require d_automount. If
+`rcu_walk` sees a dentry with DCACHE_NEED_AUTOMOUNT set but nothing
+mounted, it *will* fall back to REF-walk. `d_manage()` cannot make the
+VFS remain in RCU-walk mode, but can only tell it to get out of
+RCU-walk mode by returning `-ECHILD`.
+
+So `d_manage()`, when called with `rcu_walk` set, should either return
+-ECHILD if there is any reason to believe it is unsafe to end the
+mounted filesystem, and otherwise should return 0.
+
+autofs will return `-ECHILD` if an expiry of the filesystem has been
+initiated or is being considered, otherwise it returns 0.
+
+
+Mountpoint expiry
+-----------------
+
+The VFS has a mechansim for automatically expiring unused mounts,
+much as it can expire any unused dentry information from the dcache.
+This is guided by the MNT_SHRINKABLE flag. This only applies to
+mounts that were created by `d_automount()` returning a filesystem to be
+mounted. As autofs doesn't return such a filesystem but leaves the
+mounting to the automount daemon, it must involve the automount daemon
+in unmounting as well. This also means that autofs has more control
+of expiry.
+
+The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to
+the `umount` system call. Unmounting with MNT_EXPIRE will fail unless
+a previous attempt had been made, and the filesystem has been inactive
+and untouched since that previous attempt. autofs4 does not depend on
+this but has its own internal tracking of whether filesystems were
+recently used. This allows individual names in the autofs directory
+to expire separately.
+
+With version 4 of the protocol, the automount daemon can try to
+unmount any filesystems mounted on the autofs filesystem or remove any
+symbolic links or empty directories any time it likes. If the unmount
+or removal is successful the filesystem will be returned to the state
+it was before the mount or creation, so that any access of the name
+will trigger normal auto-mount processing. In particlar, `rmdir` and
+`unlink` do not leave negative entries in the dcache as a normal
+filesystem would, so an attempt to access a recently-removed object is
+passed to autofs for handling.
+
+With version 5, this is not safe except for unmounting from top-level
+directories. As lower-level directories are never mount traps, other
+processes will see an empty directory as soon as the filesystem is
+unmounted. So it is generally safest to use the autofs expiry
+protocol described below.
+
+Normally the daemon only wants to remove entries which haven't been
+used for a while. For this purpose autofs maintains a "`last_used`"
+time stamp on each directory or symlink. For symlinks it genuinely
+does record the last time the symlink was "used" or followed to find
+out where it points to. For directories the field is a slight
+misnomer. It actually records the last time that autofs checked if
+the directory or one of its descendents was busy and found that it
+was. This is just as useful and doesn't require updating the field so
+often.
+
+The daemon is able to ask autofs if anything is due to be expired,
+using an `ioctl` as discussed later. For a *direct* mount, autofs
+considers if the entire mount-tree can be unmounted or not. For an
+*indirect* mount, autofs considers each of the names in the top level
+directory to determine if any of those can be unmounted and cleaned
+up.
+
+There is an option with indirect mounts to consider each of the leaves
+that has been mounted on instead of considering the top-level names.
+This is intended for compatability with version 4 of autofs and should
+be considered as deprecated.
+
+When autofs considers a directory it checks the `last_used` time and
+compares it with the "timeout" value set when the filesystem was
+mounted, though this check is ignored in some cases. It also checks if
+the directory or anything below it is in use. For symbolic links,
+only the `last_used` time is ever considered.
+
+If both appear to support expiring the directory or symlink, an action
+is taken.
+
+There are two ways to ask autofs to consider expiry. The first is to
+use the **AUTOFS_IOC_EXPIRE** ioctl. This only works for indirect
+mounts. If it finds something in the root directory to expire it will
+return the name of that thing. Once a name has been returned the
+automount daemon needs to unmount any filesystems mounted below the
+name normally. As described above, this is unsafe for non-toplevel
+mounts in a version-5 autofs. For this reason the current `automountd`
+does not use this ioctl.
+
+The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or
+the **AUTOFS_IOC_EXPIRE_MULTI** ioctl. This will work for both direct and
+indirect mounts. If it selects an object to expire, it will notify
+the daemon using the notification mechanism described below. This
+will block until the daemon acknowledges the expiry notification.
+This implies that the "`EXPIRE`" ioctl must be sent from a different
+thread than the one which handles notification.
+
+While the ioctl is blocking, the entry is marked as "expiring" and
+`d_manage` will block until the daemon affirms that the unmount has
+completed (together with removing any directories that might have been
+necessary), or has been aborted.
+
+Communicating with autofs: detecting the daemon
+-----------------------------------------------
+
+There are several forms of communication between the automount daemon
+and the filesystem. As we have already seen, the daemon can create and
+remove directories and symlinks using normal filesystem operations.
+autofs knows whether a process requesting some operation is the daemon
+or not based on its process-group id number (see getpgid(1)).
+
+When an autofs filesystem it mounted the pgid of the mounting
+processes is recorded unless the "pgrp=" option is given, in which
+case that number is recorded instead. Any request arriving from a
+process in that process group is considered to come from the daemon.
+If the daemon ever has to be stopped and restarted a new pgid can be
+provided through an ioctl as will be described below.
+
+Communicating with autofs: the event pipe
+-----------------------------------------
+
+When an autofs filesystem is mounted, the 'write' end of a pipe must
+be passed using the 'fd=' mount option. autofs will write
+notification messages to this pipe for the daemon to respond to.
+For version 5, the format of the message is:
+
+ struct autofs_v5_packet {
+ int proto_version; /* Protocol version */
+ int type; /* Type of packet */
+ autofs_wqt_t wait_queue_token;
+ __u32 dev;
+ __u64 ino;
+ __u32 uid;
+ __u32 gid;
+ __u32 pid;
+ __u32 tgid;
+ __u32 len;
+ char name[NAME_MAX+1];
+ };
+
+where the type is one of
+
+ autofs_ptype_missing_indirect
+ autofs_ptype_expire_indirect
+ autofs_ptype_missing_direct
+ autofs_ptype_expire_direct
+
+so messages can indicate that a name is missing (something tried to
+access it but it isn't there) or that it has been selected for expiry.
+
+The pipe will be set to "packet mode" (equivalent to passing
+`O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at
+most one packet, and any unread portion of a packet will be discarded.
+
+The `wait_queue_token` is a unique number which can identify a
+particular request to be acknowledged. When a message is sent over
+the pipe the affected dentry is marked as either "active" or
+"expiring" and other accesses to it block until the message is
+acknowledged using one of the ioctls below and the relevant
+`wait_queue_token`.
+
+Communicating with autofs: root directory ioctls
+------------------------------------------------
+
+The root directory of an autofs filesystem will respond to a number of
+ioctls. The process issuing the ioctl must have the CAP_SYS_ADMIN
+capability, or must be the automount daemon.
+
+The available ioctl commands are:
+
+- **AUTOFS_IOC_READY**: a notification has been handled. The argument
+ to the ioctl command is the "wait_queue_token" number
+ corresponding to the notification being acknowledged.
+- **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with
+ the error code `ENOENT`.
+- **AUTOFS_IOC_CATATONIC**: Causes the autofs to enter "catatonic"
+ mode meaning that it stops sending notifications to the daemon.
+ This mode is also entered if a write to the pipe fails.
+- **AUTOFS_IOC_PROTOVER**: This returns the protocol version in use.
+- **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which
+ is really a version number for the implementation. It is
+ currently 2.
+- **AUTOFS_IOC_SETTIMEOUT**: This passes a pointer to an unsigned
+ long. The value is used to set the timeout for expiry, and
+ the current timeout value is stored back through the pointer.
+- **AUTOFS_IOC_ASKUMOUNT**: Returns, in the pointed-to `int`, 1 if
+ the filesystem could be unmounted. This is only a hint as
+ the situation could change at any instant. This call can be
+ use to avoid a more expensive full unmount attempt.
+- **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is
+ anything suitable to expire. A pointer to a packet:
+
+ struct autofs_packet_expire_multi {
+ int proto_version; /* Protocol version */
+ int type; /* Type of packet */
+ autofs_wqt_t wait_queue_token;
+ int len;
+ char name[NAME_MAX+1];
+ };
+
+ is required. This is filled in with the name of something
+ that can be unmounted or removed. If nothing can be expired,
+ `errno` is set to `EAGAIN`. Even though a `wait_queue_token`
+ is present in the structure, no "wait queue" is established
+ and no acknowledgment is needed.
+- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to
+ **AUTOFS_IOC_EXPIRE** except that it causes notification to be
+ sent to the daemon, and it blocks until the daemon acknowledges.
+ The argument is an integer which can contain two different flags.
+
+ **AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored
+ and objects are expired if the are not in use.
+
+ **AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level
+ name to expire. This is only safe when *maxproto* is 4.
+
+Communicating with autofs: char-device ioctls
+---------------------------------------------
+
+It is not always possible to open the root of an autofs filesystem,
+particularly a *direct* mounted filesystem. If the automount daemon
+is restarted there is no way for it to regain control of existing
+mounts using any of the above communication channels. To address this
+need there is a "miscellaneous" character device (major 10, minor 235)
+which can be used to communicate directly with the autofs filesystem.
+It requires CAP_SYS_ADMIN for access.
+
+The `ioctl`s that can be used on this device are described in a separate
+document `autofs4-mount-control.txt`, and are summarized briefly here.
+Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure:
+
+ struct autofs_dev_ioctl {
+ __u32 ver_major;
+ __u32 ver_minor;
+ __u32 size; /* total size of data passed in
+ * including this struct */
+ __s32 ioctlfd; /* automount command fd */
+
+ __u32 arg1; /* Command parameters */
+ __u32 arg2;
+
+ char path[0];
+ };
+
+For the **OPEN_MOUNT** and **IS_MOUNTPOINT** commands, the target
+filesystem is identified by the `path`. All other commands identify
+the filesystem by the `ioctlfd` which is a file descriptor open on the
+root, and which can be returned by **OPEN_MOUNT**.
+
+The `ver_major` and `ver_minor` are in/out parameters which check that
+the requested version is supported, and report the maximum version
+that the kernel module can support.
+
+Commands are:
+
+- **AUTOFS_DEV_IOCTL_VERSION_CMD**: does nothing, except validate and
+ set version numbers.
+- **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD**: return an open file descriptor
+ on the root of an autofs filesystem. The filesystem is identified
+ by name and device number, which is stored in `arg1`. Device
+ numbers for existing filesystems can be found in
+ `/proc/self/mountinfo`.
+- **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`.
+- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in
+ catatonic mode, this can provide the write end of a new pipe
+ in `arg1` to re-establish communication with a daemon. The
+ process group of the calling process is used to identify the
+ daemon.
+- **AUTOFS_DEV_IOCTL_REQUESTER_CMD**: `path` should be a
+ name within the filesystem that has been auto-mounted on.
+ arg1 is the dev number of the underlying autofs. On successful
+ return, `arg1` and `arg2` will be the UID and GID of the process
+ which triggered that mount.
+
+- **AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD**: Check if path is a
+ mountpoint of a particular type - see separate documentation for
+ details.
+
+- **AUTOFS_DEV_IOCTL_PROTOVER_CMD**:
+- **AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD**:
+- **AUTOFS_DEV_IOCTL_READY_CMD**:
+- **AUTOFS_DEV_IOCTL_FAIL_CMD**:
+- **AUTOFS_DEV_IOCTL_CATATONIC_CMD**:
+- **AUTOFS_DEV_IOCTL_TIMEOUT_CMD**:
+- **AUTOFS_DEV_IOCTL_EXPIRE_CMD**:
+- **AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD**: These all have the same
+ function as the similarly named **AUTOFS_IOC** ioctls, except
+ that **FAIL** can be given an explicit error number in `arg1`
+ instead of assuming `ENOENT`, and this **EXPIRE** command
+ corresponds to **AUTOFS_IOC_EXPIRE_MULTI**.
+
+Catatonic mode
+--------------
+
+As mentioned, an autofs mount can enter "catatonic" mode. This
+happens if a write to the notification pipe fails, or if it is
+explicitly requested by an `ioctl`.
+
+When entering catatonic mode, the pipe is closed and any pending
+notifications are acknowledged with the error `ENOENT`.
+
+Once in catatonic mode attempts to access non-existing names will
+result in `ENOENT` while attempts to access existing directories will
+be treated in the same way as if they came from the daemon, so mount
+traps will not fire.
+
+When the filesystem is mounted a _uid_ and _gid_ can be given which
+set the ownership of directories and symbolic links. When the
+filesystem is in catatonic mode, any process with a matching UID can
+create directories or symlinks in the root directory, but not in other
+directories.
+
+Catatonic mode can only be left via the
+**AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on the `/dev/autofs`.
+
+autofs, name spaces, and shared mounts
+--------------------------------------
+
+With bind mounts and name spaces it is possible for an autofs
+filesystem to appear at multiple places in one or more filesystem
+name spaces. For this to work sensibly, the autofs filesystem should
+always be mounted "shared". e.g.
+
+> `mount --make-shared /autofs/mount/point`
+
+The automount daemon is only able to mange a single mount location for
+an autofs filesystem and if mounts on that are not 'shared', other
+locations will not behave as expected. In particular access to those
+other locations will likely result in the `ELOOP` error
+
+> Too many levels of symbolic links
diff --git a/Documentation/filesystems/cifs/AUTHORS b/Documentation/filesystems/cifs/AUTHORS
index ca4a67a0bb1..c98800df677 100644
--- a/Documentation/filesystems/cifs/AUTHORS
+++ b/Documentation/filesystems/cifs/AUTHORS
@@ -40,6 +40,7 @@ Gunter Kukkukk (testing and suggestions for support of old servers)
Igor Mammedov (DFS support)
Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
Scott Lovenberg
+Pavel Shilovsky (for great work adding SMB2 support, and various SMB3 features)
Test case and Bug Report contributors
-------------------------------------
diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO
index 355abcdcda9..066ffddc396 100644
--- a/Documentation/filesystems/cifs/TODO
+++ b/Documentation/filesystems/cifs/TODO
@@ -1,4 +1,4 @@
-Version 1.53 May 20, 2008
+Version 2.03 August 1, 2014
A Partial List of Missing Features
==================================
@@ -7,63 +7,49 @@ Contributions are welcome. There are plenty of opportunities
for visible, important contributions to this module. Here
is a partial list of the known problems and missing features:
-a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown
-so that these operations can be supported to Windows servers
+a) SMB3 (and SMB3.02) missing optional features:
+ - RDMA
+ - multichannel (started)
+ - directory leases (improved metadata caching)
+ - T10 copy offload (copy chunk is only mechanism supported)
+ - encrypted shares
-b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS
-SecurityDescriptors
+b) improved sparse file support
-c) Better pam/winbind integration (e.g. to handle uid mapping
-better)
-
-d) Cleanup now unneeded SessSetup code in
-fs/cifs/connect.c and add back in NTLMSSP code if any servers
-need it
-
-e) fix NTLMv2 signing when two mounts with different users to same
-server.
-
-f) Directory entry caching relies on a 1 second timer, rather than
+c) Directory entry caching relies on a 1 second timer, rather than
using FindNotify or equivalent. - (started)
-g) quota support (needs minor kernel change since quota calls
+d) quota support (needs minor kernel change since quota calls
to make it to network filesystems or deviceless filesystems)
-h) investigate sync behavior (including syncpage) and check
-for proper behavior of intr/nointr
-
-i) improve support for very old servers (OS/2 and Win9x for example)
+e) improve support for very old servers (OS/2 and Win9x for example)
Including support for changing the time remotely (utimes command).
-j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
+f) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
extra copy in/out of the socket buffers in some cases.
-k) Better optimize open (and pathbased setfilesize) to reduce the
+g) Better optimize open (and pathbased setfilesize) to reduce the
oplock breaks coming from windows srv. Piggyback identical file
opens on top of each other by incrementing reference count rather
than resending (helps reduce server resource utilization and avoid
spurious oplock breaks).
-l) Improve performance of readpages by sending more than one read
-at a time when 8 pages or more are requested. In conjuntion
-add support for async_cifs_readpages.
-
-m) Add support for storing symlink info to Windows servers
+h) Add support for storing symlink info to Windows servers
in the Extended Attribute format their SFU clients would recognize.
-n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
+i) Finish inotify support so kde and gnome file list windows
will autorefresh (partially complete by Asser). Needs minor kernel
vfs change to support removing D_NOTIFY on a file.
-o) Add GUI tool to configure /proc/fs/cifs settings and for display of
+j) Add GUI tool to configure /proc/fs/cifs settings and for display of
the CIFS statistics (started)
-p) implement support for security and trusted categories of xattrs
+k) implement support for security and trusted categories of xattrs
(requires minor protocol extension) to enable better support for SELINUX
-q) Implement O_DIRECT flag on open (already supported on mount)
+l) Implement O_DIRECT flag on open (already supported on mount)
-r) Create UID mapping facility so server UIDs can be mapped on a per
+m) Create UID mapping facility so server UIDs can be mapped on a per
mount or a per server basis to client UIDs or nobody if no mapping
exists. This is helpful when Unix extensions are negotiated to
allow better permission checking when UIDs differ on the server
@@ -71,28 +57,29 @@ and client. Add new protocol request to the CIFS protocol
standard for asking the server for the corresponding name of a
particular uid.
-s) Add support for CIFS Unix and also the newer POSIX extensions to the
-server side for Samba 4.
+n) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
+
+o) mount check for unmatched uids
-t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
-need to add ability to set time to server (utimes command)
+p) Add support for new vfs entry point for fallocate
-u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
+q) Add tools to take advantage of cifs/smb3 specific ioctls and features
+such as "CopyChunk" (fast server side file copy)
-v) mount check for unmatched uids
+r) encrypted file support
-w) Add support for new vfs entry point for fallocate
+s) improved stats gathering, tools (perhaps integration with nfsometer?)
-x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
-processes can proceed better in parallel (on the server)
+t) allow setting more NTFS/SMB3 file attributes remotely (currently limited to compressed
+file attribute via chflags)
-y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
-restriction of wsize max being 127K)
+u) mount helper GUI (to simplify the various configuration options on mount)
-KNOWN BUGS (updated April 24, 2007)
+
+KNOWN BUGS
====================================
See http://bugzilla.samba.org - search on product "CifsVFS" for
-current bug list.
+current bug list. Also check http://bugzilla.kernel.org (Product = File System, Component = CIFS)
1) existing symbolic links (Windows reparse points) are recognized but
can not be created remotely. They are implemented for Samba and those that
@@ -100,30 +87,18 @@ support the CIFS Unix extensions, although earlier versions of Samba
overly restrict the pathnames.
2) follow_link and readdir code does not follow dfs junctions
but recognizes them
-3) create of new files to FAT partitions on Windows servers can
-succeed but still return access denied (appears to be Windows
-server not cifs client problem) and has not been reproduced recently.
-NTFS partitions do not have this problem.
-4) Unix/POSIX capabilities are reset after reconnection, and affect
-a few fields in the tree connection but we do do not know which
-superblocks to apply these changes to. We should probably walk
-the list of superblocks to set these. Also need to check the
-flags on the second mount to the same share, and see if we
-can do the same trick that NFS does to remount duplicate shares.
Misc testing to do
==================
1) check out max path names and max path name components against various server
types. Try nested symlinks (8 deep). Return max path name in stat -f information
-2) Modify file portion of ltp so it can run against a mounted network
-share and run it against cifs vfs in automated fashion.
+2) Improve xfstest's cifs enablement and adapt xfstests where needed to test
+cifs better
3) Additional performance testing and optimization using iozone and similar -
there are some easy changes that can be done to parallelize sequential writes,
and when signing is disabled to request larger read sizes (larger than
negotiated size) and send larger write sizes to modern servers.
-4) More exhaustively test against less common servers. More testing
-against Windows 9x, Windows ME servers.
-
+4) More exhaustively test against less common servers
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index a2046a7d0a9..2cca5a25ef8 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -192,15 +192,22 @@ Files in /sys/fs/f2fs/<devname>
ipu_policy This parameter controls the policy of in-place
updates in f2fs. There are five policies:
- 0: F2FS_IPU_FORCE, 1: F2FS_IPU_SSR,
- 2: F2FS_IPU_UTIL, 3: F2FS_IPU_SSR_UTIL,
- 4: F2FS_IPU_DISABLE.
+ 0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR,
+ 0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL,
+ 0x10: F2FS_IPU_FSYNC.
min_ipu_util This parameter controls the threshold to trigger
in-place-updates. The number indicates percentage
of the filesystem utilization, and used by
F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies.
+ min_fsync_blocks This parameter controls the threshold to trigger
+ in-place-updates when F2FS_IPU_FSYNC mode is set.
+ The number indicates the number of dirty pages
+ when fsync needs to flush on its call path. If
+ the number is less than this value, it triggers
+ in-place-updates.
+
max_victim_search This parameter controls the number of trials to
find a victim segment when conducting SSR and
cleaning operations. The default value is 4096
diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index e543b1a619c..c8f036a9b13 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -66,23 +66,31 @@ b/ A per-superblock list "s_anon" of dentries which are the roots of
c/ Helper routines to allocate anonymous dentries, and to help attach
loose directory dentries at lookup time. They are:
- d_alloc_anon(inode) will return a dentry for the given inode.
+ d_obtain_alias(inode) will return a dentry for the given inode.
If the inode already has a dentry, one of those is returned.
If it doesn't, a new anonymous (IS_ROOT and
DCACHE_DISCONNECTED) dentry is allocated and attached.
In the case of a directory, care is taken that only one dentry
can ever be attached.
- d_splice_alias(inode, dentry) will make sure that there is a
- dentry with the same name and parent as the given dentry, and
- which refers to the given inode.
- If the inode is a directory and already has a dentry, then that
- dentry is d_moved over the given dentry.
- If the passed dentry gets attached, care is taken that this is
- mutually exclusive to a d_alloc_anon operation.
- If the passed dentry is used, NULL is returned, else the used
- dentry is returned. This corresponds to the calling pattern of
- ->lookup.
-
+ d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode)
+ will introduce a new dentry into the tree; either the passed-in
+ dentry or a preexisting alias for the given inode (such as an
+ anonymous one created by d_obtain_alias), if appropriate. The two
+ functions differ in their handling of directories with preexisting
+ aliases:
+ d_splice_alias will use any existing IS_ROOT dentry, but it will
+ return -EIO rather than try to move a dentry with a different
+ parent. This is appropriate for local filesystems, which
+ should never see such an alias unless the filesystem is
+ corrupted somehow (for example, if two on-disk directory
+ entries refer to the same directory.)
+ d_materialise_unique will attempt to move any dentry. This is
+ appropriate for distributed filesystems, where finding a
+ directory other than where we last cached it may be a normal
+ consequence of concurrent operations on other hosts.
+ Both functions return NULL when the passed-in dentry is used,
+ following the calling convention of ->lookup.
+
Filesystem Issues
-----------------
@@ -120,12 +128,12 @@ struct which has the following members:
fh_to_dentry (mandatory)
Given a filehandle fragment, this should find the implied object and
- create a dentry for it (possibly with d_alloc_anon).
+ create a dentry for it (possibly with d_obtain_alias).
fh_to_parent (optional but strongly recommended)
Given a filehandle fragment, this should find the parent of the
- implied object and create a dentry for it (possibly with d_alloc_anon).
- May fail if the filehandle fragment is too small.
+ implied object and create a dentry for it (possibly with
+ d_obtain_alias). May fail if the filehandle fragment is too small.
get_parent (optional but strongly recommended)
When given a dentry for a directory, this should return a dentry for
diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt
index e386f7e4bce..724043858b0 100644
--- a/Documentation/filesystems/nfs/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
@@ -138,9 +138,9 @@ Installation
- Build, install, reboot
The NFS/RDMA code will be enabled automatically if NFS and RDMA
- are turned on. The NFS/RDMA client and server are configured via the hidden
- SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
- value of SUNRPC_XPRT_RDMA will be:
+ are turned on. The NFS/RDMA client and server are configured via the
+ SUNRPC_XPRT_RDMA_CLIENT and SUNRPC_XPRT_RDMA_SERVER config options that both
+ depend on SUNRPC and INFINIBAND. The default value of both options will be:
- N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
and server will not be built
@@ -235,8 +235,9 @@ NFS/RDMA Setup
- Start the NFS server
- If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
- kernel config), load the RDMA transport module:
+ If the NFS/RDMA server was built as a module
+ (CONFIG_SUNRPC_XPRT_RDMA_SERVER=m in kernel config), load the RDMA
+ transport module:
$ modprobe svcrdma
@@ -255,8 +256,9 @@ NFS/RDMA Setup
- On the client system
- If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
- kernel config), load the RDMA client module:
+ If the NFS/RDMA client was built as a module
+ (CONFIG_SUNRPC_XPRT_RDMA_CLIENT=m in kernel config), load the RDMA client
+ module:
$ modprobe xprtrdma.ko
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index 61947facfc0..553f10d0307 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -14,7 +14,6 @@ Table of contents
- The Device-Mapper driver
- The Software RAID / MD driver
- Limitations when using the MD driver
-- ChangeLog
Overview
@@ -450,270 +449,3 @@ number of sectors BEFORE attempting to use it. You have been warned!
Even better is to simply use the Device-Mapper for linear raid and then you do
not have this problem with odd numbers of sectors.
-
-
-ChangeLog
-=========
-
-2.1.30:
- - Fix writev() (it kept writing the first segment over and over again
- instead of moving onto subsequent segments).
- - Fix crash in ntfs_mft_record_alloc() when mapping the new extent mft
- record failed.
-2.1.29:
- - Fix a deadlock when mounting read-write.
-2.1.28:
- - Fix a deadlock.
-2.1.27:
- - Implement page migration support so the kernel can move memory used
- by NTFS files and directories around for management purposes.
- - Add support for writing to sparse files created with Windows XP SP2.
- - Many minor improvements and bug fixes.
-2.1.26:
- - Implement support for sector sizes above 512 bytes (up to the maximum
- supported by NTFS which is 4096 bytes).
- - Enhance support for NTFS volumes which were supported by Windows but
- not by Linux due to invalid attribute list attribute flags.
- - A few minor updates and bug fixes.
-2.1.25:
- - Write support is now extended with write(2) being able to both
- overwrite existing file data and to extend files. Also, if a write
- to a sparse region occurs, write(2) will fill in the hole. Note,
- mmap(2) based writes still do not support writing into holes or
- writing beyond the initialized size.
- - Write support has a new feature and that is that truncate(2) and
- open(2) with O_TRUNC are now implemented thus files can be both made
- smaller and larger.
- - Note: Both write(2) and truncate(2)/open(2) with O_TRUNC still have
- limitations in that they
- - only provide limited support for highly fragmented files.
- - only work on regular, i.e. uncompressed and unencrypted files.
- - never create sparse files although this will change once directory
- operations are implemented.
- - Lots of bug fixes and enhancements across the board.
-2.1.24:
- - Support journals ($LogFile) which have been modified by chkdsk. This
- means users can boot into Windows after we marked the volume dirty.
- The Windows boot will run chkdsk and then reboot. The user can then
- immediately boot into Linux rather than having to do a full Windows
- boot first before rebooting into Linux and we will recognize such a
- journal and empty it as it is clean by definition.
- - Support journals ($LogFile) with only one restart page as well as
- journals with two different restart pages. We sanity check both and
- either use the only sane one or the more recent one of the two in the
- case that both are valid.
- - Lots of bug fixes and enhancements across the board.
-2.1.23:
- - Stamp the user space journal, aka transaction log, aka $UsnJrnl, if
- it is present and active thus telling Windows and applications using
- the transaction log that changes can have happened on the volume
- which are not recorded in $UsnJrnl.
- - Detect the case when Windows has been hibernated (suspended to disk)
- and if this is the case do not allow (re)mounting read-write to
- prevent data corruption when you boot back into the suspended
- Windows session.
- - Implement extension of resident files using the normal file write
- code paths, i.e. most very small files can be extended to be a little
- bit bigger but not by much.
- - Add new mount option "disable_sparse". (See list of mount options
- above for details.)
- - Improve handling of ntfs volumes with errors and strange boot sectors
- in particular.
- - Fix various bugs including a nasty deadlock that appeared in recent
- kernels (around 2.6.11-2.6.12 timeframe).
-2.1.22:
- - Improve handling of ntfs volumes with errors.
- - Fix various bugs and race conditions.
-2.1.21:
- - Fix several race conditions and various other bugs.
- - Many internal cleanups, code reorganization, optimizations, and mft
- and index record writing code rewritten to fit in with the changes.
- - Update Documentation/filesystems/ntfs.txt with instructions on how to
- use the Device-Mapper driver with NTFS ftdisk/LDM raid.
-2.1.20:
- - Fix two stupid bugs introduced in 2.1.18 release.
-2.1.19:
- - Minor bugfix in handling of the default upcase table.
- - Many internal cleanups and improvements. Many thanks to Linus
- Torvalds and Al Viro for the help and advice with the sparse
- annotations and cleanups.
-2.1.18:
- - Fix scheduling latencies at mount time. (Ingo Molnar)
- - Fix endianness bug in a little traversed portion of the attribute
- lookup code.
-2.1.17:
- - Fix bugs in mount time error code paths.
-2.1.16:
- - Implement access time updates (including mtime and ctime).
- - Implement fsync(2), fdatasync(2), and msync(2) system calls.
- - Enable the readv(2) and writev(2) system calls.
- - Enable access via the asynchronous io (aio) API by adding support for
- the aio_read(3) and aio_write(3) functions.
-2.1.15:
- - Invalidate quotas when (re)mounting read-write.
- NOTE: This now only leave user space journalling on the side. (See
- note for version 2.1.13, below.)
-2.1.14:
- - Fix an NFSd caused deadlock reported by several users.
-2.1.13:
- - Implement writing of inodes (access time updates are not implemented
- yet so mounting with -o noatime,nodiratime is enforced).
- - Enable writing out of resident files so you can now overwrite any
- uncompressed, unencrypted, nonsparse file as long as you do not
- change the file size.
- - Add housekeeping of ntfs system files so that ntfsfix no longer needs
- to be run after writing to an NTFS volume.
- NOTE: This still leaves quota tracking and user space journalling on
- the side but they should not cause data corruption. In the worst
- case the charged quotas will be out of date ($Quota) and some
- userspace applications might get confused due to the out of date
- userspace journal ($UsnJrnl).
-2.1.12:
- - Fix the second fix to the decompression engine from the 2.1.9 release
- and some further internals cleanups.
-2.1.11:
- - Driver internal cleanups.
-2.1.10:
- - Force read-only (re)mounting of volumes with unsupported volume
- flags and various cleanups.
-2.1.9:
- - Fix two bugs in handling of corner cases in the decompression engine.
-2.1.8:
- - Read the $MFT mirror and compare it to the $MFT and if the two do not
- match, force a read-only mount and do not allow read-write remounts.
- - Read and parse the $LogFile journal and if it indicates that the
- volume was not shutdown cleanly, force a read-only mount and do not
- allow read-write remounts. If the $LogFile indicates a clean
- shutdown and a read-write (re)mount is requested, empty $LogFile to
- ensure that Windows cannot cause data corruption by replaying a stale
- journal after Linux has written to the volume.
- - Improve time handling so that the NTFS time is fully preserved when
- converted to kernel time and only up to 99 nano-seconds are lost when
- kernel time is converted to NTFS time.
-2.1.7:
- - Enable NFS exporting of mounted NTFS volumes.
-2.1.6:
- - Fix minor bug in handling of compressed directories that fixes the
- erroneous "du" and "stat" output people reported.
-2.1.5:
- - Minor bug fix in attribute list attribute handling that fixes the
- I/O errors on "ls" of certain fragmented files found by at least two
- people running Windows XP.
-2.1.4:
- - Minor update allowing compilation with all gcc versions (well, the
- ones the kernel can be compiled with anyway).
-2.1.3:
- - Major bug fixes for reading files and volumes in corner cases which
- were being hit by Windows 2k/XP users.
-2.1.2:
- - Major bug fixes alleviating the hangs in statfs experienced by some
- users.
-2.1.1:
- - Update handling of compressed files so people no longer get the
- frequently reported warning messages about initialized_size !=
- data_size.
-2.1.0:
- - Add configuration option for developmental write support.
- - Initial implementation of file overwriting. (Writes to resident files
- are not written out to disk yet, so avoid writing to files smaller
- than about 1kiB.)
- - Intercept/abort changes in file size as they are not implemented yet.
-2.0.25:
- - Minor bugfixes in error code paths and small cleanups.
-2.0.24:
- - Small internal cleanups.
- - Support for sendfile system call. (Christoph Hellwig)
-2.0.23:
- - Massive internal locking changes to mft record locking. Fixes
- various race conditions and deadlocks.
- - Fix ntfs over loopback for compressed files by adding an
- optimization barrier. (gcc was screwing up otherwise ?)
- Thanks go to Christoph Hellwig for pointing these two out:
- - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
- - Fix ntfs_free() for ia64 and parisc.
-2.0.22:
- - Small internal cleanups.
-2.0.21:
- These only affect 32-bit architectures:
- - Check for, and refuse to mount too large volumes (maximum is 2TiB).
- - Check for, and refuse to open too large files and directories
- (maximum is 16TiB).
-2.0.20:
- - Support non-resident directory index bitmaps. This means we now cope
- with huge directories without problems.
- - Fix a page leak that manifested itself in some cases when reading
- directory contents.
- - Internal cleanups.
-2.0.19:
- - Fix race condition and improvements in block i/o interface.
- - Optimization when reading compressed files.
-2.0.18:
- - Fix race condition in reading of compressed files.
-2.0.17:
- - Cleanups and optimizations.
-2.0.16:
- - Fix stupid bug introduced in 2.0.15 in new attribute inode API.
- - Big internal cleanup replacing the mftbmp access hacks by using the
- new attribute inode API instead.
-2.0.15:
- - Bug fix in parsing of remount options.
- - Internal changes implementing attribute (fake) inodes allowing all
- attribute i/o to go via the page cache and to use all the normal
- vfs/mm functionality.
-2.0.14:
- - Internal changes improving run list merging code and minor locking
- change to not rely on BKL in ntfs_statfs().
-2.0.13:
- - Internal changes towards using iget5_locked() in preparation for
- fake inodes and small cleanups to ntfs_volume structure.
-2.0.12:
- - Internal cleanups in address space operations made possible by the
- changes introduced in the previous release.
-2.0.11:
- - Internal updates and cleanups introducing the first step towards
- fake inode based attribute i/o.
-2.0.10:
- - Microsoft says that the maximum number of inodes is 2^32 - 1. Update
- the driver accordingly to only use 32-bits to store inode numbers on
- 32-bit architectures. This improves the speed of the driver a little.
-2.0.9:
- - Change decompression engine to use a single buffer. This should not
- affect performance except perhaps on the most heavy i/o on SMP
- systems when accessing multiple compressed files from multiple
- devices simultaneously.
- - Minor updates and cleanups.
-2.0.8:
- - Remove now obsolete show_inodes and posix mount option(s).
- - Restore show_sys_files mount option.
- - Add new mount option case_sensitive, to determine if the driver
- treats file names as case sensitive or not.
- - Mostly drop support for short file names (for backwards compatibility
- we only support accessing files via their short file name if one
- exists).
- - Fix dcache aliasing issues wrt short/long file names.
- - Cleanups and minor fixes.
-2.0.7:
- - Just cleanups.
-2.0.6:
- - Major bugfix to make compatible with other kernel changes. This fixes
- the hangs/oopses on umount.
- - Locking cleanup in directory operations (remove BKL usage).
-2.0.5:
- - Major buffer overflow bug fix.
- - Minor cleanups and updates for kernel 2.5.12.
-2.0.4:
- - Cleanups and updates for kernel 2.5.11.
-2.0.3:
- - Small bug fixes, cleanups, and performance improvements.
-2.0.2:
- - Use default fmask of 0177 so that files are no executable by default.
- If you want owner executable files, just use fmask=0077.
- - Update for kernel 2.5.9 but preserve backwards compatibility with
- kernel 2.5.7.
- - Minor bug fixes, cleanups, and updates.
-2.0.1:
- - Minor updates, primarily set the executable bit by default on files
- so they can be executed.
-2.0.0:
- - Started ChangeLog.
-
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
new file mode 100644
index 00000000000..530850a7273
--- /dev/null
+++ b/Documentation/filesystems/overlayfs.txt
@@ -0,0 +1,198 @@
+Written by: Neil Brown <neilb@suse.de>
+
+Overlay Filesystem
+==================
+
+This document describes a prototype for a new approach to providing
+overlay-filesystem functionality in Linux (sometimes referred to as
+union-filesystems). An overlay-filesystem tries to present a
+filesystem which is the result over overlaying one filesystem on top
+of the other.
+
+The result will inevitably fail to look exactly like a normal
+filesystem for various technical reasons. The expectation is that
+many use cases will be able to ignore these differences.
+
+This approach is 'hybrid' because the objects that appear in the
+filesystem do not all appear to belong to that filesystem. In many
+cases an object accessed in the union will be indistinguishable
+from accessing the corresponding object from the original filesystem.
+This is most obvious from the 'st_dev' field returned by stat(2).
+
+While directories will report an st_dev from the overlay-filesystem,
+all non-directory objects will report an st_dev from the lower or
+upper filesystem that is providing the object. Similarly st_ino will
+only be unique when combined with st_dev, and both of these can change
+over the lifetime of a non-directory object. Many applications and
+tools ignore these values and will not be affected.
+
+Upper and Lower
+---------------
+
+An overlay filesystem combines two filesystems - an 'upper' filesystem
+and a 'lower' filesystem. When a name exists in both filesystems, the
+object in the 'upper' filesystem is visible while the object in the
+'lower' filesystem is either hidden or, in the case of directories,
+merged with the 'upper' object.
+
+It would be more correct to refer to an upper and lower 'directory
+tree' rather than 'filesystem' as it is quite possible for both
+directory trees to be in the same filesystem and there is no
+requirement that the root of a filesystem be given for either upper or
+lower.
+
+The lower filesystem can be any filesystem supported by Linux and does
+not need to be writable. The lower filesystem can even be another
+overlayfs. The upper filesystem will normally be writable and if it
+is it must support the creation of trusted.* extended attributes, and
+must provide valid d_type in readdir responses, so NFS is not suitable.
+
+A read-only overlay of two read-only filesystems may use any
+filesystem type.
+
+Directories
+-----------
+
+Overlaying mainly involves directories. If a given name appears in both
+upper and lower filesystems and refers to a non-directory in either,
+then the lower object is hidden - the name refers only to the upper
+object.
+
+Where both upper and lower objects are directories, a merged directory
+is formed.
+
+At mount time, the two directories given as mount options "lowerdir" and
+"upperdir" are combined into a merged directory:
+
+ mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper,\
+workdir=/work /merged
+
+The "workdir" needs to be an empty directory on the same filesystem
+as upperdir.
+
+Then whenever a lookup is requested in such a merged directory, the
+lookup is performed in each actual directory and the combined result
+is cached in the dentry belonging to the overlay filesystem. If both
+actual lookups find directories, both are stored and a merged
+directory is created, otherwise only one is stored: the upper if it
+exists, else the lower.
+
+Only the lists of names from directories are merged. Other content
+such as metadata and extended attributes are reported for the upper
+directory only. These attributes of the lower directory are hidden.
+
+whiteouts and opaque directories
+--------------------------------
+
+In order to support rm and rmdir without changing the lower
+filesystem, an overlay filesystem needs to record in the upper filesystem
+that files have been removed. This is done using whiteouts and opaque
+directories (non-directories are always opaque).
+
+A whiteout is created as a character device with 0/0 device number.
+When a whiteout is found in the upper level of a merged directory, any
+matching name in the lower level is ignored, and the whiteout itself
+is also hidden.
+
+A directory is made opaque by setting the xattr "trusted.overlay.opaque"
+to "y". Where the upper filesystem contains an opaque directory, any
+directory in the lower filesystem with the same name is ignored.
+
+readdir
+-------
+
+When a 'readdir' request is made on a merged directory, the upper and
+lower directories are each read and the name lists merged in the
+obvious way (upper is read first, then lower - entries that already
+exist are not re-added). This merged name list is cached in the
+'struct file' and so remains as long as the file is kept open. If the
+directory is opened and read by two processes at the same time, they
+will each have separate caches. A seekdir to the start of the
+directory (offset 0) followed by a readdir will cause the cache to be
+discarded and rebuilt.
+
+This means that changes to the merged directory do not appear while a
+directory is being read. This is unlikely to be noticed by many
+programs.
+
+seek offsets are assigned sequentially when the directories are read.
+Thus if
+ - read part of a directory
+ - remember an offset, and close the directory
+ - re-open the directory some time later
+ - seek to the remembered offset
+
+there may be little correlation between the old and new locations in
+the list of filenames, particularly if anything has changed in the
+directory.
+
+Readdir on directories that are not merged is simply handled by the
+underlying directory (upper or lower).
+
+
+Non-directories
+---------------
+
+Objects that are not directories (files, symlinks, device-special
+files etc.) are presented either from the upper or lower filesystem as
+appropriate. When a file in the lower filesystem is accessed in a way
+the requires write-access, such as opening for write access, changing
+some metadata etc., the file is first copied from the lower filesystem
+to the upper filesystem (copy_up). Note that creating a hard-link
+also requires copy_up, though of course creation of a symlink does
+not.
+
+The copy_up may turn out to be unnecessary, for example if the file is
+opened for read-write but the data is not modified.
+
+The copy_up process first makes sure that the containing directory
+exists in the upper filesystem - creating it and any parents as
+necessary. It then creates the object with the same metadata (owner,
+mode, mtime, symlink-target etc.) and then if the object is a file, the
+data is copied from the lower to the upper filesystem. Finally any
+extended attributes are copied up.
+
+Once the copy_up is complete, the overlay filesystem simply
+provides direct access to the newly created file in the upper
+filesystem - future operations on the file are barely noticed by the
+overlay filesystem (though an operation on the name of the file such as
+rename or unlink will of course be noticed and handled).
+
+
+Non-standard behavior
+---------------------
+
+The copy_up operation essentially creates a new, identical file and
+moves it over to the old name. The new file may be on a different
+filesystem, so both st_dev and st_ino of the file may change.
+
+Any open files referring to this inode will access the old data and
+metadata. Similarly any file locks obtained before copy_up will not
+apply to the copied up file.
+
+On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and
+fsetxattr(2) will fail with EROFS.
+
+If a file with multiple hard links is copied up, then this will
+"break" the link. Changes will not be propagated to other names
+referring to the same inode.
+
+Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory
+object in overlayfs will not contain valid absolute paths, only
+relative paths leading up to the filesystem's root. This will be
+fixed in the future.
+
+Some operations are not atomic, for example a crash during copy_up or
+rename will leave the filesystem in an inconsistent state. This will
+be addressed in the future.
+
+Changes to underlying filesystems
+---------------------------------
+
+Offline changes, when the overlay is not mounted, are allowed to either
+the upper or the lower trees.
+
+Changes to the underlying filesystems while part of a mounted overlay
+filesystem are not allowed. If the underlying filesystem is changed,
+the behavior of the overlay is undefined, though it will not result in
+a crash or deadlock.
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 1fe0ccb1af5..8ea3e90ace0 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -235,6 +235,39 @@ be used for more than one file, you can store an arbitrary pointer in the
private field of the seq_file structure; that value can then be retrieved
by the iterator functions.
+There is also a wrapper function to seq_open() called seq_open_private(). It
+kmallocs a zero filled block of memory and stores a pointer to it in the
+private field of the seq_file structure, returning 0 on success. The
+block size is specified in a third parameter to the function, e.g.:
+
+ static int ct_open(struct inode *inode, struct file *file)
+ {
+ return seq_open_private(file, &ct_seq_ops,
+ sizeof(struct mystruct));
+ }
+
+There is also a variant function, __seq_open_private(), which is functionally
+identical except that, if successful, it returns the pointer to the allocated
+memory block, allowing further initialisation e.g.:
+
+ static int ct_open(struct inode *inode, struct file *file)
+ {
+ struct mystruct *p =
+ __seq_open_private(file, &ct_seq_ops, sizeof(*p));
+
+ if (!p)
+ return -ENOMEM;
+
+ p->foo = bar; /* initialize my stuff */
+ ...
+ p->baz = true;
+
+ return 0;
+ }
+
+A corresponding close function, seq_release_private() is available which
+frees the memory allocated in the corresponding open.
+
The other operations of interest - read(), llseek(), and release() - are
all implemented by the seq_file code itself. So a virtual file's
file_operations structure will look like:
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index a1d0d7a3016..20bf204426c 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -237,7 +237,7 @@ noted. This means that most methods can block safely. All methods are
only called from a process context (i.e. not from an interrupt handler
or bottom half).
- alloc_inode: this method is called by inode_alloc() to allocate memory
+ alloc_inode: this method is called by alloc_inode() to allocate memory
for struct inode and initialize it. If this function is not
defined, a simple 'struct inode' is allocated. Normally
alloc_inode will be used to allocate a larger structure which
@@ -364,6 +364,7 @@ struct inode_operations {
int (*atomic_open)(struct inode *, struct dentry *, struct file *,
unsigned open_flag, umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+ int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
};
Again, all methods are called without any locks being held, unless
@@ -696,6 +697,12 @@ struct address_space_operations {
but instead uses bmap to find out where the blocks in the file
are and uses those addresses directly.
+ dentry_open: *WARNING: probably going away soon, do not use!* This is an
+ alternative to f_op->open(), the difference is that this method may open
+ a file not necessarily originating from the same filesystem as the one
+ i_op->open() was called on. It may be useful for stacking filesystems
+ which want to allow native I/O directly on underlying files.
+
invalidatepage: If a page has PagePrivate set, then invalidatepage
will be called when part or all of the page is to be removed
@@ -826,7 +833,7 @@ struct file_operations {
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
- int (*setlease)(struct file *, long arg, struct file_lock **);
+ int (*setlease)(struct file *, long arg, struct file_lock **, void **);
long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len);
int (*show_fdinfo)(struct seq_file *m, struct file *f);
};
@@ -895,8 +902,9 @@ otherwise noted.
splice_read: called by the VFS to splice data from file to a pipe. This
method is used by the splice(2) system call
- setlease: called by the VFS to set or release a file lock lease.
- setlease has the file_lock_lock held and must not sleep.
+ setlease: called by the VFS to set or release a file lock lease. setlease
+ implementations should call generic_setlease to record or remove
+ the lease in the inode after setting it.
fallocate: called by the VFS to preallocate blocks or punch a hole.
@@ -1053,7 +1061,8 @@ struct dentry_operations {
If the 'rcu_walk' parameter is true, then the caller is doing a
pathwalk in RCU-walk mode. Sleeping is not permitted in this mode,
and the caller can be asked to leave it and call again by returning
- -ECHILD.
+ -ECHILD. -EISDIR may also be returned to tell pathwalk to
+ ignore d_automount or any mounts.
This function is only used if DCACHE_MANAGE_TRANSIT is set on the
dentry being transited from.