summaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/Locking9
-rw-r--r--Documentation/filesystems/bfs.txt10
-rw-r--r--Documentation/filesystems/configfs/configfs_example.c4
-rw-r--r--Documentation/filesystems/ext4.txt137
-rw-r--r--Documentation/filesystems/gfs2-glocks.txt114
-rw-r--r--Documentation/filesystems/nfs-rdma.txt103
-rw-r--r--Documentation/filesystems/omfs.txt106
-rw-r--r--Documentation/filesystems/proc.txt77
-rw-r--r--Documentation/filesystems/relay.txt10
-rw-r--r--Documentation/filesystems/sysfs-pci.txt1
-rw-r--r--Documentation/filesystems/sysfs.txt6
-rw-r--r--Documentation/filesystems/ubifs.txt164
-rw-r--r--Documentation/filesystems/vfat.txt8
-rw-r--r--Documentation/filesystems/vfs.txt10
14 files changed, 634 insertions, 125 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index c2992bc54f2..680fb566b92 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -92,7 +92,6 @@ prototypes:
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *);
int (*write_inode) (struct inode *, int);
- void (*put_inode) (struct inode *);
void (*drop_inode) (struct inode *);
void (*delete_inode) (struct inode *);
void (*put_super) (struct super_block *);
@@ -115,7 +114,6 @@ alloc_inode: no no no
destroy_inode: no
dirty_inode: no (must not sleep)
write_inode: no
-put_inode: no
drop_inode: no !!!inode_lock!!!
delete_inode: no
put_super: yes yes no
@@ -512,6 +510,7 @@ prototypes:
void (*close)(struct vm_area_struct*);
int (*fault)(struct vm_area_struct*, struct vm_fault *);
int (*page_mkwrite)(struct vm_area_struct *, struct page *);
+ int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
locking rules:
BKL mmap_sem PageLocked(page)
@@ -519,6 +518,7 @@ open: no yes
close: no yes
fault: no yes
page_mkwrite: no yes no
+access: no yes
->page_mkwrite() is called when a previously read-only page is
about to become writeable. The file system is responsible for
@@ -527,6 +527,11 @@ taking to lock out truncate, the page range should be verified to be
within i_size. The page mapping should also be checked that it is not
NULL.
+ ->access() is called when get_user_pages() fails in
+acces_process_vm(), typically used to debug a process through
+/proc/pid/mem or ptrace. This function is needed only for
+VM_IO | VM_PFNMAP VMAs.
+
================================================================================
Dubious stuff
diff --git a/Documentation/filesystems/bfs.txt b/Documentation/filesystems/bfs.txt
index ea825e178e7..78043d5a8fc 100644
--- a/Documentation/filesystems/bfs.txt
+++ b/Documentation/filesystems/bfs.txt
@@ -26,11 +26,11 @@ You can simplify mounting by just typing:
this will allocate the first available loopback device (and load loop.o
kernel module if necessary) automatically. If the loopback driver is not
-loaded automatically, make sure that your kernel is compiled with kmod
-support (CONFIG_KMOD) enabled. Beware that umount will not
-deallocate /dev/loopN device if /etc/mtab file on your system is a
-symbolic link to /proc/mounts. You will need to do it manually using
-"-d" switch of losetup(8). Read losetup(8) manpage for more info.
+loaded automatically, make sure that you have compiled the module and
+that modprobe is functioning. Beware that umount will not deallocate
+/dev/loopN device if /etc/mtab file on your system is a symbolic link to
+/proc/mounts. You will need to do it manually using "-d" switch of
+losetup(8). Read losetup(8) manpage for more info.
To create the BFS image under UnixWare you need to find out first which
slice contains it. The command prtvtoc(1M) is your friend:
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
index 25151fd5c2c..03964879170 100644
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -279,7 +279,7 @@ static struct config_item *simple_children_make_item(struct config_group *group,
simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL);
if (!simple_child)
- return NULL;
+ return ERR_PTR(-ENOMEM);
config_item_init_type_name(&simple_child->item, name,
@@ -366,7 +366,7 @@ static struct config_group *group_children_make_group(struct config_group *group
simple_children = kzalloc(sizeof(struct simple_children),
GFP_KERNEL);
if (!simple_children)
- return NULL;
+ return ERR_PTR(-ENOMEM);
config_group_init_type_name(&simple_children->group, name,
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 560f88dc709..80e193d82e2 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org
1. Quick usage instructions:
===========================
- - Grab updated e2fsprogs from
- ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
- This is a patchset on top of e2fsprogs-1.39, which can be found at
+ - Compile and install the latest version of e2fsprogs (as of this
+ writing version 1.41) from:
+
+ http://sourceforge.net/project/showfiles.php?group_id=2406
+
+ or
+
ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
- - It's still mke2fs -j /dev/hda1
+ or grab the latest git repository from:
+
+ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+ - Create a new filesystem using the ext4dev filesystem type:
+
+ # mke2fs -t ext4dev /dev/hda1
+
+ Or configure an existing ext3 filesystem to support extents and set
+ the test_fs flag to indicate that it's ok for an in-development
+ filesystem to touch this filesystem:
- - mount /dev/hda1 /wherever -t ext4dev
+ # tune2fs -O extents -E test_fs /dev/hda1
- - To enable extents,
+ If the filesystem was created with 128 byte inodes, it can be
+ converted to use 256 byte for greater efficiency via:
- mount /dev/hda1 /wherever -t ext4dev -o extents
+ # tune2fs -I 256 /dev/hda1
- - The filesystem is compatible with the ext3 driver until you add a file
- which has extents (ie: `mount -o extents', then create a file).
+ (Note: we currently do not have tools to convert an ext4dev
+ filesystem back to ext3; so please do not do try this on production
+ filesystems.)
- NOTE: The "extents" mount flag is temporary. It will soon go away and
- extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+ - Mounting:
+
+ # mount -t ext4dev /dev/hda1 /wherever
- When comparing performance with other filesystems, remember that
- ext3/4 by default offers higher data integrity guarantees than most. So
- when comparing with a metadata-only journalling filesystem, use `mount -o
- data=writeback'. And you might as well use `mount -o nobh' too along
- with it. Making the journal larger than the mke2fs default often helps
- performance with metadata-intensive workloads.
+ ext3/4 by default offers higher data integrity guarantees than most.
+ So when comparing with a metadata-only journalling filesystem, such
+ as ext3, use `mount -o data=writeback'. And you might as well use
+ `mount -o nobh' too along with it. Making the journal larger than
+ the mke2fs default often helps performance with metadata-intensive
+ workloads.
2. Features
===========
2.1 Currently available
-* ability to use filesystems > 16TB
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
* extent format reduces metadata overhead (RAM, IO for access, transactions)
* extent format more robust in face of on-disk corruption due to magics,
* internal redunancy in tree
-
-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
-
-* dir_index and resize inode will be on by default
-* large inodes will be used by default for fast EAs, nsec timestamps, etc
+* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+ flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
+ the ordering)
2.2 Candidate features for future inclusion
-There are several under discussion, whether they all make it in is
-partly a function of how much time everyone has to work on them:
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjuction with
+ the uninit_bg feature (capability to do this is available in e2fsprogs
+ but a kernel thread to do lazy zeroing of unused inode table blocks
+ after filesystem is first mounted is required for safety)
-* improved file allocation (multi-block alloc, delayed alloc; basically done)
-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
- needs some e2fsck work)
-* inode version field on disk (NFSv4, Lustre; prototype exists)
-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
-* journal checksumming for robustness, performance (prototype exists)
-* persistent file preallocation (e.g for streaming media, databases)
+There are several others under discussion, whether they all make it in is
+partly a function of how much time everyone has to work on them. Features like
+metadata checksumming have been discussed and planned for a bit but no patches
+exist yet so I'm not sure they're in the near-term roadmap.
-Features like metadata checksumming have been discussed and planned for
-a bit but no patches exist yet so I'm not sure they're in the near-term
-roadmap.
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables. Some test results available here:
-The big performance win will come with mballoc and delalloc. CFS has
-been using mballoc for a few years already with Lustre, and IBM + Bull
-did a lot of benchmarking on it. The reason it isn't in the first set of
-patches is partly a manageability issue, and partly because it doesn't
-directly affect the on-disk format (outside of much better allocation)
-so it isn't critical to get into the first round of changes. I believe
-Alex is working on a new set of patches right now.
+ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
+ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
3. Options
==========
@@ -139,8 +160,16 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata
Setting it to very large values will improve
performance.
-barrier=1 This enables/disables barriers. barrier=0 disables
- it, barrier=1 enables it.
+barrier=<0|1(*)> This enables/disables the use of write barriers in
+ the jbd code. barrier=0 disables, barrier=1 enables.
+ This also requires an IO stack which can support
+ barriers, and if jbd gets an error on a barrier
+ write, it will disable again with a warning.
+ Write barriers enforce proper on-disk ordering
+ of journal commits, making volatile disk write caches
+ safe to use, at some performance penalty. If
+ your disks are battery-backed in one way or another,
+ disabling barriers may safely improve performance.
orlov (*) This enables the new Orlov block allocator. It is
enabled by default.
@@ -214,9 +243,11 @@ stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
disks * RAID chunk size in file system blocks.
-
+delalloc (*) Deferring block allocation until write-out time.
+nodelalloc Disable delayed allocation. Blocks are allocation
+ when data is copied from user to page cache.
Data Mode
----------
+=========
There are 3 different data modes:
* writeback mode
@@ -228,10 +259,10 @@ typically provide the best ext4 performance.
* ordered mode
In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction. When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first. In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction. When it's time to write the new metadata
+out to disk, the associated data blocks are written first. In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.
* journal mode
data=journal mode provides full data and metadata journaling. All new data is
@@ -239,7 +270,8 @@ written to the journal first, and then to its final location.
In the event of a crash, the journal can be replayed, bringing both data and
metadata into a consistent state. This mode is the slowest except when data
needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all others modes. Curently ext4 does not have delayed
+allocation support if this data journalling mode is selected.
References
==========
@@ -248,7 +280,8 @@ kernel source: <file:fs/ext4/>
<file:fs/jbd2/>
programs: http://e2fsprogs.sourceforge.net/
- http://ext2resize.sourceforge.net
useful links: http://fedoraproject.org/wiki/ext3-devel
http://www.bullopensource.org/ext4/
+ http://ext4.wiki.kernel.org/index.php/Main_Page
+ http://fedoraproject.org/wiki/Features/Ext4
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
new file mode 100644
index 00000000000..4dae9a3840b
--- /dev/null
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -0,0 +1,114 @@
+ Glock internal locking rules
+ ------------------------------
+
+This documents the basic principles of the glock state machine
+internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h)
+has two main (internal) locks:
+
+ 1. A spinlock (gl_spin) which protects the internal state such
+ as gl_state, gl_target and the list of holders (gl_holders)
+ 2. A non-blocking bit lock, GLF_LOCK, which is used to prevent other
+ threads from making calls to the DLM, etc. at the same time. If a
+ thread takes this lock, it must then call run_queue (usually via the
+ workqueue) when it releases it in order to ensure any pending tasks
+ are completed.
+
+The gl_holders list contains all the queued lock requests (not
+just the holders) associated with the glock. If there are any
+held locks, then they will be contiguous entries at the head
+of the list. Locks are granted in strictly the order that they
+are queued, except for those marked LM_FLAG_PRIORITY which are
+used only during recovery, and even then only for journal locks.
+
+There are three lock states that users of the glock layer can request,
+namely shared (SH), deferred (DF) and exclusive (EX). Those translate
+to the following DLM lock modes:
+
+Glock mode | DLM lock mode
+------------------------------
+ UN | IV/NL Unlocked (no DLM lock associated with glock) or NL
+ SH | PR (Protected read)
+ DF | CW (Concurrent write)
+ EX | EX (Exclusive)
+
+Thus DF is basically a shared mode which is incompatible with the "normal"
+shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O
+operations. The glocks are basically a lock plus some routines which deal
+with cache management. The following rules apply for the cache:
+
+Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata
+--------------------------------------------------------------------------
+ UN | No | No | No | No
+ SH | Yes | Yes | No | No
+ DF | No | Yes | No | No
+ EX | Yes | Yes | Yes | Yes
+
+These rules are implemented using the various glock operations which
+are defined for each type of glock. Not all types of glocks use
+all the modes. Only inode glocks use the DF mode for example.
+
+Table of glock operations and per type constants:
+
+Field | Purpose
+----------------------------------------------------------------------------
+go_xmote_th | Called before remote state change (e.g. to sync dirty data)
+go_xmote_bh | Called after remote state change (e.g. to refill cache)
+go_inval | Called if remote state change requires invalidating the cache
+go_demote_ok | Returns boolean value of whether its ok to demote a glock
+ | (e.g. checks timeout, and that there is no cached data)
+go_lock | Called for the first local holder of a lock
+go_unlock | Called on the final local unlock of a lock
+go_dump | Called to print content of object for debugfs file, or on
+ | error to dump glock to the log.
+go_type; | The type of the glock, LM_TYPE_.....
+go_min_hold_time | The minimum hold time
+
+The minimum hold time for each lock is the time after a remote lock
+grant for which we ignore remote demote requests. This is in order to
+prevent a situation where locks are being bounced around the cluster
+from node to node with none of the nodes making any progress. This
+tends to show up most with shared mmaped files which are being written
+to by multiple nodes. By delaying the demotion in response to a
+remote callback, that gives the userspace program time to make
+some progress before the pages are unmapped.
+
+There is a plan to try and remove the go_lock and go_unlock callbacks
+if possible, in order to try and speed up the fast path though the locking.
+Also, eventually we hope to make the glock "EX" mode locally shared
+such that any local locking will be done with the i_mutex as required
+rather than via the glock.
+
+Locking rules for glock operations:
+
+Operation | GLF_LOCK bit lock held | gl_spin spinlock held
+-----------------------------------------------------------------
+go_xmote_th | Yes | No
+go_xmote_bh | Yes | No
+go_inval | Yes | No
+go_demote_ok | Sometimes | Yes
+go_lock | Yes | No
+go_unlock | Yes | No
+go_dump | Sometimes | Yes
+
+N.B. Operations must not drop either the bit lock or the spinlock
+if its held on entry. go_dump and do_demote_ok must never block.
+Note that go_dump will only be called if the glock's state
+indicates that it is caching uptodate data.
+
+Glock locking order within GFS2:
+
+ 1. i_mutex (if required)
+ 2. Rename glock (for rename only)
+ 3. Inode glock(s)
+ (Parents before children, inodes at "same level" with same parent in
+ lock number order)
+ 4. Rgrp glock(s) (for (de)allocation operations)
+ 5. Transaction glock (via gfs2_trans_begin) for non-read operations
+ 6. Page lock (always last, very important!)
+
+There are two glocks per inode. One deals with access to the inode
+itself (locking order as above), and the other, known as the iopen
+glock is used in conjunction with the i_nlink field in the inode to
+determine the lifetime of the inode in question. Locking of inodes
+is on a per-inode basis. Locking of rgrps is on a per rgrp basis.
+
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt
index d0ec45ae4e7..44bd766f2e5 100644
--- a/Documentation/filesystems/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs-rdma.txt
@@ -5,7 +5,7 @@
################################################################################
Author: NetApp and Open Grid Computing
- Date: April 15, 2008
+ Date: May 29, 2008
Table of Contents
~~~~~~~~~~~~~~~~~
@@ -60,16 +60,18 @@ Installation
The procedures described in this document have been tested with
distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
- - Install nfs-utils-1.1.1 or greater on the client
+ - Install nfs-utils-1.1.2 or greater on the client
- An NFS/RDMA mount point can only be obtained by using the mount.nfs
- command in nfs-utils-1.1.1 or greater. To see which version of mount.nfs
- you are using, type:
+ An NFS/RDMA mount point can be obtained by using the mount.nfs command in
+ nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
+ version with support for NFS/RDMA mounts, but for various reasons we
+ recommend using nfs-utils-1.1.2 or greater). To see which version of
+ mount.nfs you are using, type:
- > /sbin/mount.nfs -V
+ $ /sbin/mount.nfs -V
- If the version is less than 1.1.1 or the command does not exist,
- then you will need to install the latest version of nfs-utils.
+ If the version is less than 1.1.2 or the command does not exist,
+ you should install the latest version of nfs-utils.
Download the latest package from:
@@ -77,22 +79,33 @@ Installation
Uncompress the package and follow the installation instructions.
- If you will not be using GSS and NFSv4, the installation process
- can be simplified by disabling these features when running configure:
+ If you will not need the idmapper and gssd executables (you do not need
+ these to create an NFS/RDMA enabled mount command), the installation
+ process can be simplified by disabling these features when running
+ configure:
- > ./configure --disable-gss --disable-nfsv4
+ $ ./configure --disable-gss --disable-nfsv4
- For more information on this see the package's README and INSTALL files.
+ To build nfs-utils you will need the tcp_wrappers package installed. For
+ more information on this see the package's README and INSTALL files.
After building the nfs-utils package, there will be a mount.nfs binary in
the utils/mount directory. This binary can be used to initiate NFS v2, v3,
- or v4 mounts. To initiate a v4 mount, the binary must be called mount.nfs4.
- The standard technique is to create a symlink called mount.nfs4 to mount.nfs.
+ or v4 mounts. To initiate a v4 mount, the binary must be called
+ mount.nfs4. The standard technique is to create a symlink called
+ mount.nfs4 to mount.nfs.
- NOTE: mount.nfs and therefore nfs-utils-1.1.1 or greater is only needed
+ This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
+
+ $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
+
+ In this location, mount.nfs will be invoked automatically for NFS mounts
+ by the system mount commmand.
+
+ NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
on the NFS client machine. You do not need this specific version of
nfs-utils on the server. Furthermore, only the mount.nfs command from
- nfs-utils-1.1.1 is needed on the client.
+ nfs-utils-1.1.2 is needed on the client.
- Install a Linux kernel with NFS/RDMA
@@ -156,8 +169,8 @@ Check RDMA and NFS Setup
this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
card:
- > modprobe ib_mthca
- > modprobe ib_ipoib
+ $ modprobe ib_mthca
+ $ modprobe ib_ipoib
If you are using InfiniBand, make sure there is a Subnet Manager (SM)
running on the network. If your IB switch has an embedded SM, you can
@@ -166,7 +179,7 @@ Check RDMA and NFS Setup
If an SM is running on your network, you should see the following:
- > cat /sys/class/infiniband/driverX/ports/1/state
+ $ cat /sys/class/infiniband/driverX/ports/1/state
4: ACTIVE
where driverX is mthca0, ipath5, ehca3, etc.
@@ -174,10 +187,10 @@ Check RDMA and NFS Setup
To further test the InfiniBand software stack, use IPoIB (this
assumes you have two IB hosts named host1 and host2):
- host1> ifconfig ib0 a.b.c.x
- host2> ifconfig ib0 a.b.c.y
- host1> ping a.b.c.y
- host2> ping a.b.c.x
+ host1$ ifconfig ib0 a.b.c.x
+ host2$ ifconfig ib0 a.b.c.y
+ host1$ ping a.b.c.y
+ host2$ ping a.b.c.x
For other device types, follow the appropriate procedures.
@@ -202,11 +215,11 @@ NFS/RDMA Setup
/vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
/vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
- The IP address(es) is(are) the client's IPoIB address for an InfiniBand HCA or the
- cleint's iWARP address(es) for an RNIC.
+ The IP address(es) is(are) the client's IPoIB address for an InfiniBand
+ HCA or the cleint's iWARP address(es) for an RNIC.
- NOTE: The "insecure" option must be used because the NFS/RDMA client does not
- use a reserved port.
+ NOTE: The "insecure" option must be used because the NFS/RDMA client does
+ not use a reserved port.
Each time a machine boots:
@@ -214,43 +227,45 @@ NFS/RDMA Setup
For InfiniBand using a Mellanox adapter:
- > modprobe ib_mthca
- > modprobe ib_ipoib
- > ifconfig ib0 a.b.c.d
+ $ modprobe ib_mthca
+ $ modprobe ib_ipoib
+ $ ifconfig ib0 a.b.c.d
NOTE: use unique addresses for the client and server
- Start the NFS server
- If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config),
- load the RDMA transport module:
+ If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+ kernel config), load the RDMA transport module:
- > modprobe svcrdma
+ $ modprobe svcrdma
- Regardless of how the server was built (module or built-in), start the server:
+ Regardless of how the server was built (module or built-in), start the
+ server:
- > /etc/init.d/nfs start
+ $ /etc/init.d/nfs start
or
- > service nfs start
+ $ service nfs start
Instruct the server to listen on the RDMA transport:
- > echo rdma 2050 > /proc/fs/nfsd/portlist
+ $ echo rdma 2050 > /proc/fs/nfsd/portlist
- On the client system
- If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config),
- load the RDMA client module:
+ If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+ kernel config), load the RDMA client module:
- > modprobe xprtrdma.ko
+ $ modprobe xprtrdma.ko
- Regardless of how the client was built (module or built-in), issue the mount.nfs command:
+ Regardless of how the client was built (module or built-in), use this
+ command to mount the NFS/RDMA server:
- > /path/to/your/mount.nfs <IPoIB-server-name-or-address>:/<export> /mnt -i -o rdma,port=2050
+ $ mount -o rdma,port=2050 <IPoIB-server-name-or-address>:/<export> /mnt
- To verify that the mount is using RDMA, run "cat /proc/mounts" and check the
- "proto" field for the given mount.
+ To verify that the mount is using RDMA, run "cat /proc/mounts" and check
+ the "proto" field for the given mount.
Congratulations! You're using NFS/RDMA!
diff --git a/Documentation/filesystems/omfs.txt b/Documentation/filesystems/omfs.txt
new file mode 100644
index 00000000000..1d0d41ff5c6
--- /dev/null
+++ b/Documentation/filesystems/omfs.txt
@@ -0,0 +1,106 @@
+Optimized MPEG Filesystem (OMFS)
+
+Overview
+========
+
+OMFS is a filesystem created by SonicBlue for use in the ReplayTV DVR
+and Rio Karma MP3 player. The filesystem is extent-based, utilizing
+block sizes from 2k to 8k, with hash-based directories. This
+filesystem driver may be used to read and write disks from these
+devices.
+
+Note, it is not recommended that this FS be used in place of a general
+filesystem for your own streaming media device. Native Linux filesystems
+will likely perform better.
+
+More information is available at:
+
+ http://linux-karma.sf.net/
+
+Various utilities, including mkomfs and omfsck, are included with
+omfsprogs, available at:
+
+ http://bobcopeland.com/karma/
+
+Instructions are included in its README.
+
+Options
+=======
+
+OMFS supports the following mount-time options:
+
+ uid=n - make all files owned by specified user
+ gid=n - make all files owned by specified group
+ umask=xxx - set permission umask to xxx
+ fmask=xxx - set umask to xxx for files
+ dmask=xxx - set umask to xxx for directories
+
+Disk format
+===========
+
+OMFS discriminates between "sysblocks" and normal data blocks. The sysblock
+group consists of super block information, file metadata, directory structures,
+and extents. Each sysblock has a header containing CRCs of the entire
+sysblock, and may be mirrored in successive blocks on the disk. A sysblock may
+have a smaller size than a data block, but since they are both addressed by the
+same 64-bit block number, any remaining space in the smaller sysblock is
+unused.
+
+Sysblock header information:
+
+struct omfs_header {
+ __be64 h_self; /* FS block where this is located */
+ __be32 h_body_size; /* size of useful data after header */
+ __be16 h_crc; /* crc-ccitt of body_size bytes */
+ char h_fill1[2];
+ u8 h_version; /* version, always 1 */
+ char h_type; /* OMFS_INODE_X */
+ u8 h_magic; /* OMFS_IMAGIC */
+ u8 h_check_xor; /* XOR of header bytes before this */
+ __be32 h_fill2;
+};
+
+Files and directories are both represented by omfs_inode:
+
+struct omfs_inode {
+ struct omfs_header i_head; /* header */
+ __be64 i_parent; /* parent containing this inode */
+ __be64 i_sibling; /* next inode in hash bucket */
+ __be64 i_ctime; /* ctime, in milliseconds */
+ char i_fill1[35];
+ char i_type; /* OMFS_[DIR,FILE] */
+ __be32 i_fill2;
+ char i_fill3[64];
+ char i_name[OMFS_NAMELEN]; /* filename */
+ __be64 i_size; /* size of file, in bytes */
+};
+
+Directories in OMFS are implemented as a large hash table. Filenames are
+hashed then prepended into the bucket list beginning at OMFS_DIR_START.
+Lookup requires hashing the filename, then seeking across i_sibling pointers
+until a match is found on i_name. Empty buckets are represented by block
+pointers with all-1s (~0).
+
+A file is an omfs_inode structure followed by an extent table beginning at
+OMFS_EXTENT_START:
+
+struct omfs_extent_entry {
+ __be64 e_cluster; /* start location of a set of blocks */
+ __be64 e_blocks; /* number of blocks after e_cluster */
+};
+
+struct omfs_extent {
+ __be64 e_next; /* next extent table location */
+ __be32 e_extent_count; /* total # extents in this table */
+ __be32 e_fill;
+ struct omfs_extent_entry e_entry; /* start of extent entries */
+};
+
+Each extent holds the block offset followed by number of blocks allocated to
+the extent. The final extent in each table is a terminator with e_cluster
+being ~0 and e_blocks being ones'-complement of the total number of blocks
+in the table.
+
+If this table overflows, a continuation inode is written and pointed to by
+e_next. These have a header but lack the rest of the inode structure.
+
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index dbc3c6a3650..64557821ee5 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -296,6 +296,7 @@ Table 1-4: Kernel info in /proc
uptime System uptime
version Kernel version
video bttv info of video resources (2.4)
+ vmallocinfo Show vmalloced areas
..............................................................................
You can, for example, check which interrupts are currently in use and what
@@ -380,28 +381,35 @@ i386 and x86_64 platforms support the new IRQ vector displays.
Of some interest is the introduction of the /proc/irq directory to 2.4.
It could be used to set IRQ to CPU affinity, this means that you can "hook" an
IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the
-irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask
+irq subdir is one subdir for each IRQ, and two files; default_smp_affinity and
+prof_cpu_mask.
For example
> ls /proc/irq/
0 10 12 14 16 18 2 4 6 8 prof_cpu_mask
- 1 11 13 15 17 19 3 5 7 9
+ 1 11 13 15 17 19 3 5 7 9 default_smp_affinity
> ls /proc/irq/0/
smp_affinity
-The contents of the prof_cpu_mask file and each smp_affinity file for each IRQ
-is the same by default:
+smp_affinity is a bitmask, in which you can specify which CPUs can handle the
+IRQ, you can set it by doing:
- > cat /proc/irq/0/smp_affinity
- ffffffff
+ > echo 1 > /proc/irq/10/smp_affinity
+
+This means that only the first CPU will handle the IRQ, but you can also echo
+5 which means that only the first and fourth CPU can handle the IRQ.
+
+The contents of each smp_affinity file is the same by default:
-It's a bitmask, in which you can specify which CPUs can handle the IRQ, you can
-set it by doing:
+ > cat /proc/irq/0/smp_affinity
+ ffffffff
- > echo 1 > /proc/irq/prof_cpu_mask
+The default_smp_affinity mask applies to all non-active IRQs, which are the
+IRQs which have not yet been allocated/activated, and hence which lack a
+/proc/irq/[0-9]* directory.
-This means that only the first CPU will handle the IRQ, but you can also echo 5
-which means that only the first and fourth CPU can handle the IRQ.
+prof_cpu_mask specifies which CPUs are to be profiled by the system wide
+profiler. Default value is ffffffff (all cpus).
The way IRQs are routed is handled by the IO-APIC, and it's Round Robin
between all the CPUs which are allowed to handle it. As usual the kernel has
@@ -550,6 +558,49 @@ VmallocTotal: total size of vmalloc memory area
VmallocUsed: amount of vmalloc area which is used
VmallocChunk: largest contigious block of vmalloc area which is free
+..............................................................................
+
+vmallocinfo:
+
+Provides information about vmalloced/vmaped areas. One line per area,
+containing the virtual address range of the area, size in bytes,
+caller information of the creator, and optional information depending
+on the kind of area :
+
+ pages=nr number of pages
+ phys=addr if a physical address was specified
+ ioremap I/O mapping (ioremap() and friends)
+ vmalloc vmalloc() area
+ vmap vmap()ed pages
+ user VM_USERMAP area
+ vpages buffer for pages pointers was vmalloced (huge area)
+ N<node>=nr (Only on NUMA kernels)
+ Number of pages allocated on memory node <node>
+
+> cat /proc/vmallocinfo
+0xffffc20000000000-0xffffc20000201000 2101248 alloc_large_system_hash+0x204 ...
+ /0x2c0 pages=512 vmalloc N0=128 N1=128 N2=128 N3=128
+0xffffc20000201000-0xffffc20000302000 1052672 alloc_large_system_hash+0x204 ...
+ /0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64
+0xffffc20000302000-0xffffc20000304000 8192 acpi_tb_verify_table+0x21/0x4f...
+ phys=7fee8000 ioremap
+0xffffc20000304000-0xffffc20000307000 12288 acpi_tb_verify_table+0x21/0x4f...
+ phys=7fee7000 ioremap
+0xffffc2000031d000-0xffffc2000031f000 8192 init_vdso_vars+0x112/0x210
+0xffffc2000031f000-0xffffc2000032b000 49152 cramfs_uncompress_init+0x2e ...
+ /0x80 pages=11 vmalloc N0=3 N1=3 N2=2 N3=3
+0xffffc2000033a000-0xffffc2000033d000 12288 sys_swapon+0x640/0xac0 ...
+ pages=2 vmalloc N1=2
+0xffffc20000347000-0xffffc2000034c000 20480 xt_alloc_table_info+0xfe ...
+ /0x130 [x_tables] pages=4 vmalloc N0=4
+0xffffffffa0000000-0xffffffffa000f000 61440 sys_init_module+0xc27/0x1d00 ...
+ pages=14 vmalloc N2=14
+0xffffffffa000f000-0xffffffffa0014000 20480 sys_init_module+0xc27/0x1d00 ...
+ pages=4 vmalloc N1=4
+0xffffffffa0014000-0xffffffffa0017000 12288 sys_init_module+0xc27/0x1d00 ...
+ pages=2 vmalloc N1=2
+0xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 ...
+ pages=10 vmalloc N0=10
1.3 IDE devices in /proc/ide
----------------------------
@@ -880,7 +931,7 @@ group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req
stats stream_req
mb_groups:
-This file gives the details of mutiblock allocator buddy cache of free blocks
+This file gives the details of multiblock allocator buddy cache of free blocks
mb_history:
Multiblock allocation history.
@@ -1423,7 +1474,7 @@ used because pages_free(1355) is smaller than watermark + protection[2]
normal page requirement. If requirement is DMA zone(index=0), protection[0]
(=0) is used.
-zone[i]'s protection[j] is calculated by following exprssion.
+zone[i]'s protection[j] is calculated by following expression.
(i < j):
zone[i]->protection[j]
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt
index 094f2d2f38b..510b722667a 100644
--- a/Documentation/filesystems/relay.txt
+++ b/Documentation/filesystems/relay.txt
@@ -294,6 +294,16 @@ user-defined data with a channel, and is immediately available
(including in create_buf_file()) via chan->private_data or
buf->chan->private_data.
+Buffer-only channels
+--------------------
+
+These channels have no files associated and can be created with
+relay_open(NULL, NULL, ...). Such channels are useful in scenarios such
+as when doing early tracing in the kernel, before the VFS is up. In these
+cases, one may open a buffer-only channel and then call
+relay_late_setup_files() when the kernel is ready to handle files,
+to expose the buffered data to the userspace.
+
Channel 'modes'
---------------
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 5daa2aaec2c..68ef48839c0 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -36,6 +36,7 @@ files, each with their own function.
local_cpus nearby CPU mask (cpumask, ro)
resource PCI resource host addresses (ascii, ro)
resource0..N PCI resource N, if present (binary, mmap)
+ resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap)
rom PCI ROM resource, if present (binary, ro)
subsystem_device PCI subsystem device (ascii, ro)
subsystem_vendor PCI subsystem vendor (ascii, ro)
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 7f27b8f840d..9e9c348275a 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -248,6 +248,7 @@ The top level sysfs directory looks like:
block/
bus/
class/
+dev/
devices/
firmware/
net/
@@ -274,6 +275,11 @@ fs/ contains a directory for some filesystems. Currently each
filesystem wanting to export attributes must create its own hierarchy
below fs/ (see ./fuse.txt for an example).
+dev/ contains two directories char/ and block/. Inside these two
+directories there are symlinks named <major>:<minor>. These symlinks
+point to the sysfs directory for the given device. /sys/dev provides a
+quick way to lookup the sysfs interface for a device from the result of
+a stat(2) operation.
More information can driver-model specific features can be found in
Documentation/driver-model/.
diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt
new file mode 100644
index 00000000000..540e9e7f59c
--- /dev/null
+++ b/Documentation/filesystems/ubifs.txt
@@ -0,0 +1,164 @@
+Introduction
+=============
+
+UBIFS file-system stands for UBI File System. UBI stands for "Unsorted
+Block Images". UBIFS is a flash file system, which means it is designed
+to work with flash devices. It is important to understand, that UBIFS
+is completely different to any traditional file-system in Linux, like
+Ext2, XFS, JFS, etc. UBIFS represents a separate class of file-systems
+which work with MTD devices, not block devices. The other Linux
+file-system of this class is JFFS2.
+
+To make it more clear, here is a small comparison of MTD devices and
+block devices.
+
+1 MTD devices represent flash devices and they consist of eraseblocks of
+ rather large size, typically about 128KiB. Block devices consist of
+ small blocks, typically 512 bytes.
+2 MTD devices support 3 main operations - read from some offset within an
+ eraseblock, write to some offset within an eraseblock, and erase a whole
+ eraseblock. Block devices support 2 main operations - read a whole
+ block and write a whole block.
+3 The whole eraseblock has to be erased before it becomes possible to
+ re-write its contents. Blocks may be just re-written.
+4 Eraseblocks become worn out after some number of erase cycles -
+ typically 100K-1G for SLC NAND and NOR flashes, and 1K-10K for MLC
+ NAND flashes. Blocks do not have the wear-out property.
+5 Eraseblocks may become bad (only on NAND flashes) and software should
+ deal with this. Blocks on hard drives typically do not become bad,
+ because hardware has mechanisms to substitute bad blocks, at least in
+ modern LBA disks.
+
+It should be quite obvious why UBIFS is very different to traditional
+file-systems.
+
+UBIFS works on top of UBI. UBI is a separate software layer which may be
+found in drivers/mtd/ubi. UBI is basically a volume management and
+wear-leveling layer. It provides so called UBI volumes which is a higher
+level abstraction than a MTD device. The programming model of UBI devices
+is very similar to MTD devices - they still consist of large eraseblocks,
+they have read/write/erase operations, but UBI devices are devoid of
+limitations like wear and bad blocks (items 4 and 5 in the above list).
+
+In a sense, UBIFS is a next generation of JFFS2 file-system, but it is
+very different and incompatible to JFFS2. The following are the main
+differences.
+
+* JFFS2 works on top of MTD devices, UBIFS depends on UBI and works on
+ top of UBI volumes.
+* JFFS2 does not have on-media index and has to build it while mounting,
+ which requires full media scan. UBIFS maintains the FS indexing
+ information on the flash media and does not require full media scan,
+ so it mounts many times faster than JFFS2.
+* JFFS2 is a write-through file-system, while UBIFS supports write-back,
+ which makes UBIFS much faster on writes.
+
+Similarly to JFFS2, UBIFS supports on-the-flight compression which makes
+it possible to fit quite a lot of data to the flash.
+
+Similarly to JFFS2, UBIFS is tolerant of unclean reboots and power-cuts.
+It does not need stuff like ckfs.ext2. UBIFS automatically replays its
+journal and recovers from crashes, ensuring that the on-flash data
+structures are consistent.
+
+UBIFS scales logarithmically (most of the data structures it uses are
+trees), so the mount time and memory consumption do not linearly depend
+on the flash size, like in case of JFFS2. This is because UBIFS
+maintains the FS index on the flash media. However, UBIFS depends on
+UBI, which scales linearly. So overall UBI/UBIFS stack scales linearly.
+Nevertheless, UBI/UBIFS scales considerably better than JFFS2.
+
+The authors of UBIFS believe, that it is possible to develop UBI2 which
+would scale logarithmically as well. UBI2 would support the same API as UBI,
+but it would be binary incompatible to UBI. So UBIFS would not need to be
+changed to use UBI2
+
+
+Mount options
+=============
+
+(*) == default.
+
+norm_unmount (*) commit on unmount; the journal is committed
+ when the file-system is unmounted so that the
+ next mount does not have to replay the journal
+ and it becomes very fast;
+fast_unmount do not commit on unmount; this option makes
+ unmount faster, but the next mount slower
+ because of the need to replay the journal.
+
+
+Quick usage instructions
+========================
+
+The UBI volume to mount is specified using "ubiX_Y" or "ubiX:NAME" syntax,
+where "X" is UBI device number, "Y" is UBI volume number, and "NAME" is
+UBI volume name.
+
+Mount volume 0 on UBI device 0 to /mnt/ubifs:
+$ mount -t ubifs ubi0_0 /mnt/ubifs
+
+Mount "rootfs" volume of UBI device 0 to /mnt/ubifs ("rootfs" is volume
+name):
+$ mount -t ubifs ubi0:rootfs /mnt/ubifs
+
+The following is an example of the kernel boot arguments to attach mtd0
+to UBI and mount volume "rootfs":
+ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs
+
+
+Module Parameters for Debugging
+===============================
+
+When UBIFS has been compiled with debugging enabled, there are 3 module
+parameters that are available to control aspects of testing and debugging.
+The parameters are unsigned integers where each bit controls an option.
+The parameters are:
+
+debug_msgs Selects which debug messages to display, as follows:
+
+ Message Type Flag value
+
+ General messages 1
+ Journal messages 2
+ Mount messages 4
+ Commit messages 8
+ LEB search messages 16
+ Budgeting messages 32
+ Garbage collection messages 64
+ Tree Node Cache (TNC) messages 128
+ LEB properties (lprops) messages 256
+ Input/output messages 512
+ Log messages 1024
+ Scan messages 2048
+ Recovery messages 4096
+
+debug_chks Selects extra checks that UBIFS can do while running:
+
+ Check Flag value
+
+ General checks 1
+ Check Tree Node Cache (TNC) 2
+ Check indexing tree size 4
+ Check orphan area 8
+ Check old indexing tree 16
+ Check LEB properties (lprops) 32
+ Check leaf nodes and inodes 64
+
+debug_tsts Selects a mode of testing, as follows:
+
+ Test mode Flag value
+
+ Force in-the-gaps method 2
+ Failure mode for recovery testing 4
+
+For example, set debug_msgs to 5 to display General messages and Mount
+messages.
+
+
+References
+==========
+
+UBIFS documentation and FAQ/HOWTO at the MTD web site:
+http://www.linux-mtd.infradead.org/doc/ubifs.html
+http://www.linux-mtd.infradead.org/faq/ubifs.html
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 2d5e1e582e1..bbac4f1d905 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -96,6 +96,14 @@ shortname=lower|win95|winnt|mixed
emulate the Windows 95 rule for create.
Default setting is `lower'.
+tz=UTC -- Interpret timestamps as UTC rather than local time.
+ This option disables the conversion of timestamps
+ between local time (as used by Windows on FAT) and UTC
+ (which Linux uses internally). This is particuluarly
+ useful when mounting devices (like digital cameras)
+ that are set to UTC in order to avoid the pitfalls of
+ local time.
+
<bool>: 0,1,yes,no,true,false
TODO
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 81e5be6e6e3..c4d348dabe9 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -143,7 +143,7 @@ struct file_system_type {
The get_sb() method has the following arguments:
- struct file_system_type *fs_type: decribes the filesystem, partly initialized
+ struct file_system_type *fs_type: describes the filesystem, partly initialized
by the specific filesystem code
int flags: mount flags
@@ -205,7 +205,6 @@ struct super_operations {
void (*dirty_inode) (struct inode *);
int (*write_inode) (struct inode *, int);
- void (*put_inode) (struct inode *);
void (*drop_inode) (struct inode *);
void (*delete_inode) (struct inode *);
void (*put_super) (struct super_block *);
@@ -246,9 +245,6 @@ or bottom half).
inode to disc. The second parameter indicates whether the write
should be synchronous or not, not all filesystems check this flag.
- put_inode: called when the VFS inode is removed from the inode
- cache.
-
drop_inode: called when the last access to the inode is dropped,
with the inode_lock spinlock held.
@@ -899,9 +895,9 @@ struct dentry_operations {
iput() yourself
d_dname: called when the pathname of a dentry should be generated.
- Usefull for some pseudo filesystems (sockfs, pipefs, ...) to delay
+ Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay
pathname generation. (Instead of doing it when dentry is created,
- its done only when the path is needed.). Real filesystems probably
+ it's done only when the path is needed.). Real filesystems probably
dont want to use it, because their dentries are present in global
dcache hash, so their hash should be an invariant. As no lock is
held, d_dname() should not try to modify the dentry itself, unless