10 files changed, 365 insertions, 58 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 875d49696b6..3bae418c6ad 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -32,6 +32,8 @@ dlmfs.txt
 	- info on the userspace interface to the OCFS2 DLM.
 dnotify.txt
 	- info about directory notification in Linux.
+dnotify_test.c
+	- example program for dnotify
 ecryptfs.txt
 	- docs on eCryptfs: stacked cryptographic filesystem for Linux.
 exofs.txt
@@ -62,6 +64,8 @@ jfs.txt
 	- info and mount options for the JFS filesystem.
 locks.txt
 	- info on file locking implementations, flock() vs. fcntl(), etc.
+logfs.txt
+	- info on the LogFS flash filesystem.
 mandatory-locking.txt
 	- info on the Linux implementation of Sys V mandatory file locking.
 ncpfs.txt
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 18b9d0ca063..06bbbed7120 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -460,13 +460,6 @@ in sys_read() and friends.
 
 --------------------------- dquot_operations -------------------------------
 prototypes:
-	int (*initialize) (struct inode *, int);
-	int (*drop) (struct inode *);
-	int (*alloc_space) (struct inode *, qsize_t, int);
-	int (*alloc_inode) (const struct inode *, unsigned long);
-	int (*free_space) (struct inode *, qsize_t);
-	int (*free_inode) (const struct inode *, unsigned long);
-	int (*transfer) (struct inode *, struct iattr *);
 	int (*write_dquot) (struct dquot *);
 	int (*acquire_dquot) (struct dquot *);
 	int (*release_dquot) (struct dquot *);
@@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
 What filesystem should expect from the generic quota functions:
 
 		FS recursion	Held locks when called
-initialize:	yes		maybe dqonoff_sem
-drop:		yes		-
-alloc_space:	->mark_dirty()	-
-alloc_inode:	->mark_dirty()	-
-free_space:	->mark_dirty()	-
-free_inode:	->mark_dirty()	-
-transfer:	yes		-
 write_dquot:	yes		dqonoff_sem or dqptr_sem
 acquire_dquot:	yes		dqonoff_sem or dqptr_sem
 release_dquot:	yes		dqonoff_sem or dqptr_sem
@@ -495,10 +481,6 @@ write_info:	yes		dqonoff_sem
 FS recursion means calling ->quota_read() and ->quota_write() from superblock
 operations.
 
-->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
-only directly by the filesystem and do not call any fs functions only
-the ->mark_dirty() operation.
-
 More details about quota locking can be found in fs/dquot.c.
 
 --------------------------- vm_operations_struct -----------------------------
diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile
new file mode 100644
index 00000000000..a5dd114da14
--- /dev/null
+++ b/Documentation/filesystems/Makefile
@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := dnotify_test
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt
index 9f5d338ddbb..6baf88f4685 100644
--- a/Documentation/filesystems/dnotify.txt
+++ b/Documentation/filesystems/dnotify.txt
@@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.
 
 Example
 -------
+See Documentation/filesystems/dnotify_test.c for an example.
 
-	#define _GNU_SOURCE	/* needed to get the defines */
-	#include <fcntl.h>	/* in glibc 2.2 this has the needed
-					   values defined */
-	#include <signal.h>
-	#include <stdio.h>
-	#include <unistd.h>
-
-	static volatile int event_fd;
-
-	static void handler(int sig, siginfo_t *si, void *data)
-	{
-		event_fd = si->si_fd;
-	}
-
-	int main(void)
-	{
-		struct sigaction act;
-		int fd;
-
-		act.sa_sigaction = handler;
-		sigemptyset(&act.sa_mask);
-		act.sa_flags = SA_SIGINFO;
-		sigaction(SIGRTMIN + 1, &act, NULL);
-
-		fd = open(".", O_RDONLY);
-		fcntl(fd, F_SETSIG, SIGRTMIN + 1);
-		fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
-		/* we will now be notified if any of the files
-		   in "." is modified or new files are created */
-		while (1) {
-			pause();
-			printf("Got event on fd=%d\n", event_fd);
-		}
-	}
+NOTE
+----
+Beginning with Linux 2.6.13, dnotify has been replaced by inotify.
+See Documentation/filesystems/inotify.txt for more information on it.
diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c
new file mode 100644
index 00000000000..8b37b4a1e18
--- /dev/null
+++ b/Documentation/filesystems/dnotify_test.c
@@ -0,0 +1,34 @@
+#define _GNU_SOURCE	/* needed to get the defines */
+#include <fcntl.h>	/* in glibc 2.2 this has the needed
+				   values defined */
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+
+static volatile int event_fd;
+
+static void handler(int sig, siginfo_t *si, void *data)
+{
+	event_fd = si->si_fd;
+}
+
+int main(void)
+{
+	struct sigaction act;
+	int fd;
+
+	act.sa_sigaction = handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO;
+	sigaction(SIGRTMIN + 1, &act, NULL);
+
+	fd = open(".", O_RDONLY);
+	fcntl(fd, F_SETSIG, SIGRTMIN + 1);
+	fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
+	/* we will now be notified if any of the files
+	   in "." is modified or new files are created */
+	while (1) {
+		pause();
+		printf("Got event on fd=%d\n", event_fd);
+	}
+}
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
new file mode 100644
index 00000000000..e64c94ba401
--- /dev/null
+++ b/Documentation/filesystems/logfs.txt
@@ -0,0 +1,241 @@
+
+The LogFS Flash Filesystem
+==========================
+
+Specification
+=============
+
+Superblocks
+-----------
+
+Two superblocks exist at the beginning and end of the filesystem.
+Each superblock is 256 Bytes large, with another 3840 Bytes reserved
+for future purposes, making a total of 4096 Bytes.
+
+Superblock locations may differ for MTD and block devices.  On MTD the
+first non-bad block contains a superblock in the first 4096 Bytes and
+the last non-bad block contains a superblock in the last 4096 Bytes.
+On block devices, the first 4096 Bytes of the device contain the first
+superblock and the last aligned 4096 Byte-block contains the second
+superblock.
+
+For the most part, the superblocks can be considered read-only.  They
+are written only to correct errors detected within the superblocks,
+move the journal and change the filesystem parameters through tunefs.
+As a result, the superblock does not contain any fields that require
+constant updates, like the amount of free space, etc.
+
+Segments
+--------
+
+The space in the device is split up into equal-sized segments.
+Segments are the primary write unit of LogFS.  Within each segments,
+writes happen from front (low addresses) to back (high addresses.  If
+only a partial segment has been written, the segment number, the
+current position within and optionally a write buffer are stored in
+the journal.
+
+Segments are erased as a whole.  Therefore Garbage Collection may be
+required to completely free a segment before doing so.
+
+Journal
+--------
+
+The journal contains all global information about the filesystem that
+is subject to frequent change.  At mount time, it has to be scanned
+for the most recent commit entry, which contains a list of pointers to
+all currently valid entries.
+
+Object Store
+------------
+
+All space except for the superblocks and journal is part of the object
+store.  Each segment contains a segment header and a number of
+objects, each consisting of the object header and the payload.
+Objects are either inodes, directory entries (dentries), file data
+blocks or indirect blocks.
+
+Levels
+------
+
+Garbage collection (GC) may fail if all data is written
+indiscriminately.  One requirement of GC is that data is seperated
+roughly according to the distance between the tree root and the data.
+Effectively that means all file data is on level 0, indirect blocks
+are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
+respectively.  Inode file data is on level 6 for the inodes and 7-11
+for indirect blocks.
+
+Each segment contains objects of a single level only.  As a result,
+each level requires its own seperate segment to be open for writing.
+
+Inode File
+----------
+
+All inodes are stored in a special file, the inode file.  Single
+exception is the inode file's inode (master inode) which for obvious
+reasons is stored in the journal instead.  Instead of data blocks, the
+leaf nodes of the inode files are inodes.
+
+Aliases
+-------
+
+Writes in LogFS are done by means of a wandering tree.  A naïve
+implementation would require that for each write or a block, all
+parent blocks are written as well, since the block pointers have
+changed.  Such an implementation would not be very efficient.
+
+In LogFS, the block pointer changes are cached in the journal by means
+of alias entries.  Each alias consists of its logical address - inode
+number, block index, level and child number (index into block) - and
+the changed data.  Any 8-byte word can be changes in this manner.
+
+Currently aliases are used for block pointers, file size, file used
+bytes and the height of an inodes indirect tree.
+
+Segment Aliases
+---------------
+
+Related to regular aliases, these are used to handle bad blocks.
+Initially, bad blocks are handled by moving the affected segment
+content to a spare segment and noting this move in the journal with a
+segment alias, a simple (to, from) tupel.  GC will later empty this
+segment and the alias can be removed again.  This is used on MTD only.
+
+Vim
+---
+
+By cleverly predicting the life time of data, it is possible to
+seperate long-living data from short-living data and thereby reduce
+the GC overhead later.  Each type of distinc life expectency (vim) can
+have a seperate segment open for writing.  Each (level, vim) tupel can
+be open just once.  If an open segment with unknown vim is encountered
+at mount time, it is closed and ignored henceforth.
+
+Indirect Tree
+-------------
+
+Inodes in LogFS are similar to FFS-style filesystems with direct and
+indirect block pointers.  One difference is that LogFS uses a single
+indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
+A height field in the inode defines the height of the indirect tree
+and thereby the indirection of the pointer.
+
+Another difference is the addressing of indirect blocks.  In LogFS,
+the first 16 pointers in the first indirect block are left empty,
+corresponding to the 16 direct pointers in the inode.  In ext2 (maybe
+others as well) the first pointer in the first indirect block
+corresponds to logical block 12, skipping the 12 direct pointers.
+So where ext2 is using arithmetic to better utilize space, LogFS keeps
+arithmetic simple and uses compression to save space.
+
+Compression
+-----------
+
+Both file data and metadata can be compressed.  Compression for file
+data can be enabled with chattr +c and disabled with chattr -c.  Doing
+so has no effect on existing data, but new data will be stored
+accordingly.  New inodes will inherit the compression flag of the
+parent directory.
+
+Metadata is always compressed.  However, the space accounting ignores
+this and charges for the uncompressed size.  Failing to do so could
+result in GC failures when, after moving some data, indirect blocks
+compress worse than previously.  Even on a 100% full medium, GC may
+not consume any extra space, so the compression gains are lost space
+to the user.
+
+However, they are not lost space to the filesystem internals.  By
+cheating the user for those bytes, the filesystem gained some slack
+space and GC will run less often and faster.
+
+Garbage Collection and Wear Leveling
+------------------------------------
+
+Garbage collection is invoked whenever the number of free segments
+falls below a threshold.  The best (known) candidate is picked based
+on the least amount of valid data contained in the segment.  All
+remaining valid data is copied elsewhere, thereby invalidating it.
+
+The GC code also checks for aliases and writes then back if their
+number gets too large.
+
+Wear leveling is done by occasionally picking a suboptimal segment for
+garbage collection.  If a stale segments erase count is significantly
+lower than the active segments' erase counts, it will be picked.  Wear
+leveling is rate limited, so it will never monopolize the device for
+more than one segment worth at a time.
+
+Values for "occasionally", "significantly lower" are compile time
+constants.
+
+Hashed directories
+------------------
+
+To satisfy efficient lookup(), directory entries are hashed and
+located based on the hash.  In order to both support large directories
+and not be overly inefficient for small directories, several hash
+tables of increasing size are used.  For each table, the hash value
+modulo the table size gives the table index.
+
+Tables sizes are chosen to limit the number of indirect blocks with a
+fully populated table to 0, 1, 2 or 3 respectively.  So the first
+table contains 16 entries, the second 512-16, etc.
+
+The last table is special in several ways.  First its size depends on
+the effective 32bit limit on telldir/seekdir cookies.  Since logfs
+uses the upper half of the address space for indirect blocks, the size
+is limited to 2^31.  Secondly the table contains hash buckets with 16
+entries each.
+
+Using single-entry buckets would result in birthday "attacks".  At
+just 2^16 used entries, hash collisions would be likely (P >= 0.5).
+My math skills are insufficient to do the combinatorics for the 17x
+collisions necessary to overflow a bucket, but testing showed that in
+10,000 runs the lowest directory fill before a bucket overflow was
+188,057,130 entries with an average of 315,149,915 entries.  So for
+directory sizes of up to a million, bucket overflows should be
+virtually impossible under normal circumstances.
+
+With carefully chosen filenames, it is obviously possible to cause an
+overflow with just 21 entries (4 higher tables + 16 entries + 1).  So
+there may be a security concern if a malicious user has write access
+to a directory.
+
+Open For Discussion
+===================
+
+Device Address Space
+--------------------
+
+A device address space is used for caching.  Both block devices and
+MTD provide functions to either read a single page or write a segment.
+Partial segments may be written for data integrity, but where possible
+complete segments are written for performance on simple block device
+flash media.
+
+Meta Inodes
+-----------
+
+Inodes are stored in the inode file, which is just a regular file for
+most purposes.  At umount time, however, the inode file needs to
+remain open until all dirty inodes are written.  So
+generic_shutdown_super() may not close this inode, but shouldn't
+complain about remaining inodes due to the inode file either.  Same
+goes for mapping inode of the device address space.
+
+Currently logfs uses a hack that essentially copies part of fs/inode.c
+code over.  A general solution would be preferred.
+
+Indirect block mapping
+----------------------
+
+With compression, the block device (or mapping inode) cannot be used
+to cache indirect blocks.  Some other place is required.  Currently
+logfs uses the top half of each inode's address space.  The low 8TB
+(on 32bit) are filled with file data, the high 8TB are used for
+indirect blocks.
+
+One problem is that 16TB files created on 64bit systems actually have
+data in the top 8TB.  But files >16TB would cause problems anyway, so
+only the limit has changed.
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 1bd0d0c0517..6a53a84afc7 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4
 on or off; rpc.nfsd does this correctly.)
 
 The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
-on the latest NFSv4.1 Internet Draft:
-http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
+on RFC 5661.
 
 From the many new features in NFSv4.1 the current implementation
 focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
@@ -44,7 +43,7 @@ interoperability problems with future clients.  Known issues:
 	  trunking, but this is a mandatory feature, and its use is
 	  recommended to clients in a number of places.  (E.g. to ensure
 	  timely renewal in case an existing connection's retry timeouts
-	  have gotten too long; see section 8.3 of the draft.)
+	  have gotten too long; see section 8.3 of the RFC.)
 	  Therefore, lack of this feature may cause future clients to
 	  fail.
 	- Incomplete backchannel support: incomplete backchannel gss
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 839efd8a8a8..cf6d0d85ca8 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -74,6 +74,9 @@ norecovery		Disable recovery of the filesystem on mount.
 			This disables every write access on the device for
 			read-only mounts or snapshots.  This option will fail
 			for r/w mounts on an unclean volume.
+discard			Issue discard/TRIM commands to the underlying block
+			device when blocks are freed.  This is useful for SSD
+			devices and sparse/thinly-provisioned LUNs.
 
 NILFS2 usage
 ============
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 0d07513a67a..a4f30faa4f1 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -164,6 +164,7 @@ read the file /proc/PID/status:
   VmExe:        68 kB
   VmLib:      1412 kB
   VmPTE:        20 kb
+  VmSwap:        0 kB
   Threads:        1
   SigQ:   0/28578
   SigPnd: 0000000000000000
@@ -188,7 +189,13 @@ memory usage. Its seven fields are explained in Table 1-3.  The stat file
 contains details information about the process itself.  Its fields are
 explained in Table 1-4.
 
-Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
+(for SMP CONFIG users)
+For making accounting scalable, RSS related information are handled in
+asynchronous manner and the vaule may not be very precise. To see a precise
+snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
+It's slow but very precise.
+
+Table 1-2: Contents of the status files (as of 2.6.30-rc7)
 ..............................................................................
  Field                       Content
  Name                        filename of the executable
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
  VmExe                       size of text segment
  VmLib                       size of shared library code
  VmPTE                       size of page table entries
+ VmSwap                      size of swap usage (the number of referred swapents)
  Threads                     number of threads
  SigQ                        number of signals queued/max. number for queue
  SigPnd                      bitmap of pending signals for the thread
@@ -430,6 +438,7 @@ Table 1-5: Kernel info in /proc
  modules     List of loaded modules                            
  mounts      Mounted filesystems                               
  net         Networking info (see text)                        
+ pagetypeinfo Additional page allocator information (see text)  (2.5)
  partitions  Table of partitions known to the system           
  pci	     Deprecated info of PCI bus (new way -> /proc/bus/pci/,
              decoupled by lspci					(2.4)
@@ -584,7 +593,7 @@ Node 0, zone      DMA      0      4      5      4      4      3 ...
 Node 0, zone   Normal      1      0      0      1    101      8 ...
 Node 0, zone  HighMem      2      0      0      1      1      0 ...
 
-Memory fragmentation is a problem under some workloads, and buddyinfo is a 
+External fragmentation is a problem under some workloads, and buddyinfo is a
 useful tool for helping diagnose these problems.  Buddyinfo will give you a 
 clue as to how big an area you can safely allocate, or why a previous
 allocation failed.
@@ -594,6 +603,48 @@ available.  In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
 ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE 
 available in ZONE_NORMAL, etc... 
 
+More information relevant to external fragmentation can be found in
+pagetypeinfo.
+
+> cat /proc/pagetypeinfo
+Page block order: 9
+Pages per block:  512
+
+Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
+Node    0, zone      DMA, type    Unmovable      0      0      0      1      1      1      1      1      1      1      0
+Node    0, zone      DMA, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
+Node    0, zone      DMA, type      Movable      1      1      2      1      2      1      1      0      1      0      2
+Node    0, zone      DMA, type      Reserve      0      0      0      0      0      0      0      0      0      1      0
+Node    0, zone      DMA, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
+Node    0, zone    DMA32, type    Unmovable    103     54     77      1      1      1     11      8      7      1      9
+Node    0, zone    DMA32, type  Reclaimable      0      0      2      1      0      0      0      0      1      0      0
+Node    0, zone    DMA32, type      Movable    169    152    113     91     77     54     39     13      6      1    452
+Node    0, zone    DMA32, type      Reserve      1      2      2      2      2      0      1      1      1      1      0
+Node    0, zone    DMA32, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
+
+Number of blocks type     Unmovable  Reclaimable      Movable      Reserve      Isolate
+Node 0, zone      DMA            2            0            5            1            0
+Node 0, zone    DMA32           41            6          967            2            0
+
+Fragmentation avoidance in the kernel works by grouping pages of different
+migrate types into the same contiguous regions of memory called page blocks.
+A page block is typically the size of the default hugepage size e.g. 2MB on
+X86-64. By keeping pages grouped based on their ability to move, the kernel
+can reclaim pages within a page block to satisfy a high-order allocation.
+
+The pagetypinfo begins with information on the size of a page block. It
+then gives the same type of information as buddyinfo except broken down
+by migrate-type and finishes with details on how many page blocks of each
+type exist.
+
+If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
+from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
+make an estimate of the likely number of huge pages that can be allocated
+at a given point in time. All the "Movable" blocks should be allocatable
+unless memory has been mlock()'d. Some of the Reclaimable blocks should
+also be allocatable although a lot of filesystem metadata may have to be
+reclaimed to achieve this.
+
 ..............................................................................
 
 meminfo:
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 23a181074f9..fc0e39af43c 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -837,6 +837,9 @@ replicas continue to be exactly same.
 	 individual lists does not affect propagation or the way propagation
 	 tree is modified by operations.
 
+	All vfsmounts in a peer group have the same ->mnt_master.  If it is
+	non-NULL, they form a contiguous (ordered) segment of slave list.
+
 	A example propagation tree looks as shown in the figure below.
 	[ NOTE: Though it looks like a forest, if we consider all the shared
 	mounts as a conceptual entity called 'pnode', it becomes a tree]
@@ -874,8 +877,19 @@ replicas continue to be exactly same.
 
 	NOTE: The propagation tree is orthogonal to the mount tree.
 
+8B Locking:
+
+	->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected
+	by namespace_sem (exclusive for modifications, shared for reading).
+
+	Normally we have ->mnt_flags modifications serialized by vfsmount_lock.
+	There are two exceptions: do_add_mount() and clone_mnt().
+	The former modifies a vfsmount that has not been visible in any shared
+	data structures yet.
+	The latter holds namespace_sem and the only references to vfsmount
+	are in lists that can't be traversed without namespace_sem.
 
-8B Algorithm:
+8C Algorithm:
 
 	The crux of the implementation resides in rbind/move operation.