From b5cbc369db39d9080f4932db8607aea1e1654d4d Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Thu, 26 Mar 2009 17:45:27 +1100 Subject: Document /proc/fs/nfsd/pool_stats Document the format and semantics of the /proc/fs/nfsd/pool_stats file. Signed-off-by: Greg Banks Signed-off-by: J. Bruce Fields --- Documentation/filesystems/knfsd-stats.txt | 159 ++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 Documentation/filesystems/knfsd-stats.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/knfsd-stats.txt new file mode 100644 index 00000000000..64ced5149d3 --- /dev/null +++ b/Documentation/filesystems/knfsd-stats.txt @@ -0,0 +1,159 @@ + +Kernel NFS Server Statistics +============================ + +This document describes the format and semantics of the statistics +which the kernel NFS server makes available to userspace. These +statistics are available in several text form pseudo files, each of +which is described separately below. + +In most cases you don't need to know these formats, as the nfsstat(8) +program from the nfs-utils distribution provides a helpful command-line +interface for extracting and printing them. + +All the files described here are formatted as a sequence of text lines, +separated by newline '\n' characters. Lines beginning with a hash +'#' character are comments intended for humans and should be ignored +by parsing routines. All other lines contain a sequence of fields +separated by whitespace. + +/proc/fs/nfsd/pool_stats +------------------------ + +This file is available in kernels from 2.6.30 onwards, if the +/proc/fs/nfsd filesystem is mounted (it almost always should be). + +The first line is a comment which describes the fields present in +all the other lines. The other lines present the following data as +a sequence of unsigned decimal numeric fields. One line is shown +for each NFS thread pool. 
+ +All counters are 64 bits wide and wrap naturally. There is no way +to zero these counters, instead applications should do their own +rate conversion. + +pool + The id number of the NFS thread pool to which this line applies. + This number does not change. + + Thread pool ids are a contiguous set of small integers starting + at zero. The maximum value depends on the thread pool mode, but + currently cannot be larger than the number of CPUs in the system. + Note that in the default case there will be a single thread pool + which contains all the nfsd threads and all the CPUs in the system, + and thus this file will have a single line with a pool id of "0". + +packets-arrived + Counts how many NFS packets have arrived. More precisely, this + is the number of times that the network stack has notified the + sunrpc server layer that new data may be available on a transport + (e.g. an NFS or UDP socket or an NFS/RDMA endpoint). + + Depending on the NFS workload patterns and various network stack + effects (such as Large Receive Offload) which can combine packets + on the wire, this may be either more or less than the number + of NFS calls received (which statistic is available elsewhere). + However this is a more accurate and less workload-dependent measure + of how much CPU load is being placed on the sunrpc server layer + due to NFS network traffic. + +sockets-enqueued + Counts how many times an NFS transport is enqueued to wait for + an nfsd thread to service it, i.e. no nfsd thread was considered + available. + + The circumstance this statistic tracks indicates that there was NFS + network-facing work to be done but it couldn't be done immediately, + thus introducing a small delay in servicing NFS calls. The ideal + rate of change for this counter is zero; significantly non-zero + values may indicate a performance limitation. 
+ + This can happen either because there are too few nfsd threads in the + thread pool for the NFS workload (the workload is thread-limited), + or because the NFS workload needs more CPU time than is available in + the thread pool (the workload is CPU-limited). In the former case, + configuring more nfsd threads will probably improve the performance + of the NFS workload. In the latter case, the sunrpc server layer is + already choosing not to wake idle nfsd threads because there are too + many nfsd threads which want to run but cannot, so configuring more + nfsd threads will make no difference whatsoever. The overloads-avoided + statistic (see below) can be used to distinguish these cases. + +threads-woken + Counts how many times an idle nfsd thread is woken to try to + receive some data from an NFS transport. + + This statistic tracks the circumstance where incoming + network-facing NFS work is being handled quickly, which is a good + thing. The ideal rate of change for this counter will be close + to but less than the rate of change of the packets-arrived counter. + +overloads-avoided + Counts how many times the sunrpc server layer chose not to wake an + nfsd thread, despite the presence of idle nfsd threads, because + too many nfsd threads had been recently woken but could not get + enough CPU time to actually run. + + This statistic counts a circumstance where the sunrpc layer + heuristically avoids overloading the CPU scheduler with too many + runnable nfsd threads. The ideal rate of change for this counter + is zero. Significant non-zero values indicate that the workload + is CPU limited. Usually this is associated with heavy CPU usage + on all the CPUs in the nfsd thread pool. + + If a sustained large overloads-avoided rate is detected on a pool, + the top(1) utility should be used to check for the following + pattern of CPU usage on all the CPUs associated with the given + nfsd thread pool. 
+ + - %us ~= 0 (as you're *NOT* running applications on your NFS server) + + - %wa ~= 0 + + - %id ~= 0 + + - %sy + %hi + %si ~= 100 + + If this pattern is seen, configuring more nfsd threads will *not* + improve the performance of the workload. If this pattern is not + seen, then something more subtle is wrong. + +threads-timedout + Counts how many times an nfsd thread triggered an idle timeout, + i.e. was not woken to handle any incoming network packets for + some time. + + This statistic counts a circumstance where there are more nfsd + threads configured than can be used by the NFS workload. This is + a clue that the number of nfsd threads can be reduced without + affecting performance. Unfortunately, it's only a clue and not + a strong indication, for a couple of reasons: + + - Currently the rate at which the counter is incremented is quite + slow; the idle timeout is 60 minutes. Unless the NFS workload + remains constant for hours at a time, this counter is unlikely + to be providing information that is still useful. + + - It is usually a wise policy to provide some slack, + i.e. configure a few more nfsds than are currently needed, + to allow for future spikes in load. + + +Note that incoming packets on NFS transports will be dealt with in +one of three ways. An nfsd thread can be woken (threads-woken counts +this case), or the transport can be enqueued for later attention +(sockets-enqueued counts this case), or the packet can be temporarily +deferred because the transport is currently being used by an nfsd +thread. This last case is not very interesting and is not explicitly +counted, but can be inferred from the other counters thus: + +packets-deferred = packets-arrived - ( sockets-enqueued + threads-woken ) + + +More +---- +Descriptions of the other statistics files should go here. 
+ + +Greg Banks +26 Mar 2009 -- cgit v1.2.3-70-g09d2 From b8523c40d57f5996a467f83825cb05583a5a7da4 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Mon, 9 Feb 2009 17:02:34 +0300 Subject: Staging: pohmelfs: documentation. This patch includes POHMELFS design and implementation description. Separate file includes mount options, default parameters and usage examples. Signed-off-by: Eveniy Polyakov Signed-off-by: Greg Kroah-Hartman --- .../filesystems/pohmelfs/design_notes.txt | 70 +++++++ Documentation/filesystems/pohmelfs/info.txt | 86 ++++++++ .../filesystems/pohmelfs/network_protocol.txt | 227 +++++++++++++++++++++ 3 files changed, 383 insertions(+) create mode 100644 Documentation/filesystems/pohmelfs/design_notes.txt create mode 100644 Documentation/filesystems/pohmelfs/info.txt create mode 100644 Documentation/filesystems/pohmelfs/network_protocol.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt new file mode 100644 index 00000000000..6d6db60d567 --- /dev/null +++ b/Documentation/filesystems/pohmelfs/design_notes.txt @@ -0,0 +1,70 @@ +POHMELFS: Parallel Optimized Host Message Exchange Layered File System. + + Evgeniy Polyakov + +Homepage: http://www.ioremap.net/projects/pohmelfs + +POHMELFS first began as a network filesystem with coherent local data and +metadata caches but is now evolving into a parallel distributed filesystem. + +Main features of this FS include: + * Locally coherent cache for data and metadata with (potentially) byte-range locks. + Since all Linux filesystems lock the whole inode during writing, algorithm + is very simple and does not use byte-ranges, although they are sent in + locking messages. + * Completely async processing of all events except creation of hard and symbolic + links, and rename events. + Object creation and data reading and writing are processed asynchronously. 
+ * Flexible object architecture optimized for network processing. + Ability to create long paths to objects and remove arbitrarily huge + directories with a single network command. + (like removing the whole kernel tree via a single network command). + * Very high performance. + * Fast and scalable multithreaded userspace server. Being in userspace it works + with any underlying filesystem and still is much faster than async in-kernel NFS one. + * Client is able to switch between different servers (if one goes down, client + automatically reconnects to second and so on). + * Transactions support. Full failover for all operations. + Resending transactions to different servers on timeout or error. + * Read request (data read, directory listing, lookup requests) balancing between multiple servers. + * Write requests are replicated to multiple servers and completed only when all of them are acked. + * Ability to add and/or remove servers from the working set at run-time. + * Strong authentication and possible data encryption in network channel. + * Extended attributes support. + +POHMELFS is based on transactions, which are potentially long-standing objects that live +in the client's memory. Each transaction contains all the information needed to process a given +command (or set of commands, which is frequently used during data writing: single transactions +can contain creation and data writing commands). Transactions are committed by all the servers +to which they are sent and, in case of failures, are eventually resent or dropped with an error. +For example, reading will return an error if no servers are available. + +POHMELFS uses an asynchronous approach to data processing. Courtesy of transactions, it is +possible to detach replies from requests and, if the command requires data to be received, the +caller sleeps waiting for it. 
Thus, it is possible to issue multiple read commands to different +servers and async threads will pick up replies in parallel, find appropriate transactions in the +system and put the data where it belongs (like the page or inode cache). + +The main feature of POHMELFS is writeback data and the metadata cache. +Only a few non-performance critical operations use the write-through cache and +are synchronous: hard and symbolic link creation, and object rename. Creation, +removal of objects and data writing are asynchronous and are sent to +the server during system writeback. Only one writer at a time is allowed for any +given inode, which is guarded by an appropriate locking protocol. +Because of this feature, POHMELFS is extremely fast at metadata intensive +workloads and can fully utilize the bandwidth to the servers when doing bulk +data transfers. + +POHMELFS clients operate with a working set of servers and are capable of balancing read-only +operations (like lookups or directory listings) between them. +Administrators can add or remove servers from the set at run-time via special commands (described +in Documentation/filesystems/pohmelfs/info.txt file). Writes are replicated to all servers. + +POHMELFS is capable of full data channel encryption and/or strong crypto hashing. +One can select any kernel supported cipher, encryption mode, hash type and operation mode +(hmac or digest). It is also possible to use both or neither (default). Crypto configuration +is checked during mount time and, if the server does not support it, appropriate capabilities +will be disabled or mount will fail (if 'crypto_fail_unsupported' mount option is specified). +Crypto performance heavily depends on the number of crypto threads, which asynchronously perform +crypto operations and send the resulting data to server or submit it up the stack. This number +can be controlled via a mount option. 
diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt new file mode 100644 index 00000000000..4e3d5015708 --- /dev/null +++ b/Documentation/filesystems/pohmelfs/info.txt @@ -0,0 +1,86 @@ +POHMELFS usage information. + +Mount options: +idx=%u + Each mountpoint is associated with a special index via this option. + Administrator can add or remove servers from the given index, so all mounts, + which were attached to it, are updated. + Default is 0. + +trans_scan_timeout=%u + This timeout, expressed in milliseconds, specifies time to scan transaction + trees looking for stale requests, which have to be resent, or if number of + retries exceeds specified limit, dropped with error. + Default is 5 seconds. + +drop_scan_timeout=%u + Internal timeout, expressed in milliseconds, which specifies how frequently + inodes marked to be dropped are freed. It also specifies how frequently + the system checks that servers have to be added or removed from current working set. + Default is 1 second. + +wait_on_page_timeout=%u + Number of milliseconds to wait for reply from remote server for data reading command. + If this timeout is exceeded, reading returns an error. + Default is 5 seconds. + +trans_retries=%u + This is the number of times that a transaction will be resent to a server that did + not answer for the last @trans_scan_timeout milliseconds. + When the number of resends exceeds this limit, the transaction is completed with error. + Default is 5 resends. + +crypto_thread_num=%u + Number of crypto processing threads. Threads are used both for RX and TX traffic. + Default is 2, or no threads if crypto operations are not supported. + +trans_max_pages=%u + Maximum number of pages in a single transaction. This parameter also controls + the number of pages, allocated for crypto processing (each crypto thread has + pool of pages, the number of which is equal to 'trans_max_pages'). + Default is 100 pages. 
+ +crypto_fail_unsupported + If specified, mount will fail if the server does not support requested crypto operations. + By default mount will disable non-matching crypto operations. + +mcache_timeout=%u + Maximum number of milliseconds to wait for the mcache objects to be processed. + Mcache includes locks (given lock should be granted by server), attributes (they should be + fully received in the given timeframe). + Default is 5 seconds. + +Usage examples. + +Add (or remove if it already exists) server server1.net:1025 into the working set with index $idx +with appropriate hash algorithm and key file and cipher algorithm, mode and key file: +$cfg -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key + +Mount filesystem with given index $idx to /mnt mountpoint. +Client will connect to all servers specified in the working set via previous command: +mount -t pohmel -o idx=$idx q /mnt + +One can add or remove servers from working set after mounting too. + + +Server installation. + +Creating a server, which listens at port 1025 and 0.0.0.0 address. +Working root directory (note, that server chroots there, so you have to have appropriate permissions) +is set to /mnt, server will negotiate hash/cipher with client, in case client requested it, there +are appropriate key files. +Number of working threads is set to 10. + +# ./fserver -a 0.0.0.0 -p 1025 -r /mnt -w 10 -K hash_key -k cipher_key + + -A 6 - listen on ipv6 address. Default: Disabled. + -r root - path to root directory. Default: /tmp. + -a addr - listen address. Default: 0.0.0.0. + -p port - listen port. Default: 1025. + -w workers - number of workers per connected client. Default: 1. + -K file - hash key size. Default: none. + -k file - cipher key size. Default: none. + -h - this help. + +Number of worker threads specifies how many workers will be created for each client. +Bulk single-client transfers usually are better handled with smaller number (like 1-3). 
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt new file mode 100644 index 00000000000..40ea6c295af --- /dev/null +++ b/Documentation/filesystems/pohmelfs/network_protocol.txt @@ -0,0 +1,227 @@ +POHMELFS network protocol. + +Basic structure used in network communication is following command: + +struct netfs_cmd +{ + __u16 cmd; /* Command number */ + __u16 csize; /* Attached crypto information size */ + __u16 cpad; /* Attached padding size */ + __u16 ext; /* External flags */ + __u32 size; /* Size of the attached data */ + __u32 trans; /* Transaction id */ + __u64 id; /* Object ID to operate on. Used for feedback.*/ + __u64 start; /* Start of the object. */ + __u64 iv; /* IV sequence */ + __u8 data[0]; +}; + +Commands can be embedded into transaction command (which in turn has own command), +so one can extend protocol as needed without breaking backward compatibility as long +as old commands are supported. All string lengths include tail 0 byte. + +All commands are transferred over the network in big-endian. CPU endianness is used at the end peers. + +@cmd - command number, which specifies command to be processed. 
Following + commands are used currently: + + NETFS_READDIR = 1, /* Read directory for given inode number */ + NETFS_READ_PAGE, /* Read data page from the server */ + NETFS_WRITE_PAGE, /* Write data page to the server */ + NETFS_CREATE, /* Create directory entry */ + NETFS_REMOVE, /* Remove directory entry */ + NETFS_LOOKUP, /* Lookup single object */ + NETFS_LINK, /* Create a link */ + NETFS_TRANS, /* Transaction */ + NETFS_OPEN, /* Open intent */ + NETFS_INODE_INFO, /* Metadata cache coherency synchronization message */ + NETFS_PAGE_CACHE, /* Page cache invalidation message */ + NETFS_READ_PAGES, /* Read multiple contiguous pages in one go */ + NETFS_RENAME, /* Rename object */ + NETFS_CAPABILITIES, /* Capabilities of the client, for example supported crypto */ + NETFS_LOCK, /* Distributed lock message */ + NETFS_XATTR_SET, /* Set extended attribute */ + NETFS_XATTR_GET, /* Get extended attribute */ + +@ext - external flags. Used by different commands to specify some extra arguments + like partial size of the embedded objects or creation flags. + +@size - size of the attached data. For NETFS_READ_PAGE and NETFS_READ_PAGES no data is attached, + but size of the requested data is incorporated here. It does not include size of the command + header (struct netfs_cmd) itself. + +@id - id of the object this command operates on. Each command can use it for own purpose. + +@start - start of the object this command operates on. Each command can use it for own purpose. + +@csize, @cpad - size and padding size of the (attached if needed) crypto information. + +Command specifications. + +@NETFS_READDIR +This command is used to sync content of the remote dir to the client. + +@ext - length of the path to object. +@size - the same. +@id - local inode number of the directory to read. +@start - zero. + + +@NETFS_READ_PAGE +This command is used to read data from remote server. +Data size does not exceed local page cache size. + +@id - inode number. +@start - first byte offset. 
+@size - number of bytes to read plus length of the path to object. +@ext - object path length. + + +@NETFS_CREATE +Used to create object. +It does not require that all directories on top of the object were +already created, it will create them automatically. Each object has +associated @netfs_path_entry data structure, which contains creation +mode (permissions and type) and length of the name as well as the name itself. + +@start - 0 +@size - size of all the data structures needed to create a path +@id - local inode number +@ext - 0 + + +@NETFS_REMOVE +Used to remove object. + +@ext - length of the path to object. +@size - the same. +@id - local inode number. +@start - zero. + + +@NETFS_LOOKUP +Lookup information about object on server. + +@ext - length of the path to object. +@size - the same. +@id - local inode number of the directory to look object in. +@start - local inode number of the object to look at. + + +@NETFS_LINK +Create hard link or symlink. +Command is sent as "object_path|target_path". + +@size - size of the above string. +@id - parent local inode number. +@start - 1 for symlink, 0 for hardlink. +@ext - size of the "object_path" above. + + +@NETFS_TRANS +Transaction header. + +@size - incorporates all embedded command sizes including their header sizes. +@start - transaction generation number - unique id used to find transaction. +@ext - transaction flags. Unused at the moment. +@id - 0. + + +@NETFS_OPEN +Open intent for given transaction. + +@id - local inode number. +@start - 0. +@size - path length to the object. +@ext - open flags (O_RDWR and so on). + + +@NETFS_INODE_INFO +Metadata update command. +It is sent to servers when attributes of the object are changed and received +when data or metadata were updated. 
It operates with the following structure: + +struct netfs_inode_info +{ + unsigned int mode; + unsigned int nlink; + unsigned int uid; + unsigned int gid; + unsigned int blocksize; + unsigned int padding; + __u64 ino; + __u64 blocks; + __u64 rdev; + __u64 size; + __u64 version; +}; + +It effectively mirrors stat(2) returned data. + + +@ext - path length to the object. +@size - the same plus size of the netfs_inode_info structure. +@id - local inode number. +@start - 0. + + +@NETFS_PAGE_CACHE +Command is only received by clients. It contains information about +page to be marked as not up-to-date. + +@id - client's inode number. +@start - last byte of the page to be invalidated. If it is not equal to + current inode size, it will be vmtruncated(). +@size - 0 +@ext - 0 + + +@NETFS_READ_PAGES +Used to read multiple contiguous pages in one go. + +@start - first byte of the contiguous region to read. +@size - consists of two fields: lower 8 bits are used to represent page cache shift + used by client, another 3 bytes are used to get number of pages. +@id - local inode number. +@ext - path length to the object. + + +@NETFS_RENAME +Used to rename object. +Attached data is formed into following string: "old_path|new_path". + +@id - local inode number. +@start - parent inode number. +@size - length of the above string. +@ext - length of the old path part. + + +@NETFS_CAPABILITIES +Used to exchange crypto capabilities with server. +If crypto capabilities are not supported by server, then client will disable it +or fail (if 'crypto_fail_unsupported' mount option was specified). + +@id - superblock index. Used to specify crypto information for group of servers. +@size - size of the attached capabilities structure. +@start - 0. +@size - 0. +@scsize - 0. + +@NETFS_LOCK +Used to send lock request/release messages. Although it sends byte range request +and is capable of flushing pages based on that, it is not used, since all Linux +filesystems lock the whole inode. 
+ +@id - lock generation number. +@start - start of the locked range. +@size - size of the locked range. +@ext - lock type: read/write. Not used actually. 15'th bit is used to determine, + if it is lock request (1) or release (0). + +@NETFS_XATTR_SET +@NETFS_XATTR_GET +Used to set/get extended attributes for given inode. +@id - attribute generation number or xattr setting type +@start - size of the attribute (request or attached) +@size - name length, path len and data size for given attribute +@ext - path length for given object -- cgit v1.2.3-70-g09d2 From 3ef1728898f0285c13aa6901f412b52835e23578 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 3 Apr 2009 08:29:20 +0300 Subject: nfsd41: Documentation/filesystems/nfs41-server.txt Initial nfs41 server write up describing the status of the linux server implementation. [nfsd41: document unenforced nfs41 compound ordering rules.] [get rid of CONFIG_NFSD_V4_1] Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- Documentation/filesystems/nfs41-server.txt | 161 +++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 Documentation/filesystems/nfs41-server.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt new file mode 100644 index 00000000000..05d81cbcb2e --- /dev/null +++ b/Documentation/filesystems/nfs41-server.txt @@ -0,0 +1,161 @@ +NFSv4.1 Server Implementation + +Server support for minorversion 1 can be controlled using the +/proc/fs/nfsd/versions control file. The string output returned +by reading this file will contain either "+4.1" or "-4.1" +correspondingly. + +Currently, server support for minorversion 1 is disabled by default. +It can be enabled at run time by writing the string "+4.1" to +the /proc/fs/nfsd/versions control file. Note that to write this +control file, the nfsd service must be taken down. 
Use your user-mode +nfs-utils to set this up; see rpc.nfsd(8) + +The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based +on the latest NFSv4.1 Internet Draft: +http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 + +From the many new features in NFSv4.1 the current implementation +focuses on the mandatory-to-implement NFSv4.1 Sessions, providing +"exactly once" semantics and better control and throttling of the +resources allocated for each client. + +Other NFSv4.1 features, Parallel NFS operations in particular, +are still under development out of tree. +See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design +for more information. + +The table below, taken from the NFSv4.1 document, lists +the operations that are mandatory to implement (REQ), optional +(OPT), and NFSv4.0 operations that are required not to implement (MNI) +in minor version 1. The first column indicates the operations that +are not supported yet by the linux server implementation. + +The OPTIONAL features identified and their abbreviations are as follows: + pNFS Parallel NFS + FDELG File Delegations + DDELG Directory Delegations + +The following abbreviations indicate the linux server implementation status. + I Implemented NFSv4.1 operations. + NS Not Supported. + NS* unimplemented optional feature. + P pNFS features implemented out of tree. + PNS pNFS features that are not supported yet (out of tree). 
+ +Operations + + +----------------------+------------+--------------+----------------+ + | Operation | REQ, REC, | Feature | Definition | + | | OPT, or | (REQ, REC, | | + | | MNI | or OPT) | | + +----------------------+------------+--------------+----------------+ + | ACCESS | REQ | | Section 18.1 | +NS | BACKCHANNEL_CTL | REQ | | Section 18.33 | +NS | BIND_CONN_TO_SESSION | REQ | | Section 18.34 | + | CLOSE | REQ | | Section 18.2 | + | COMMIT | REQ | | Section 18.3 | + | CREATE | REQ | | Section 18.4 | +I | CREATE_SESSION | REQ | | Section 18.36 | +NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 | + | DELEGRETURN | OPT | FDELG, | Section 18.6 | + | | | DDELG, pNFS | | + | | | (REQ) | | +NS | DESTROY_CLIENTID | REQ | | Section 18.50 | +I | DESTROY_SESSION | REQ | | Section 18.37 | +I | EXCHANGE_ID | REQ | | Section 18.35 | +NS | FREE_STATEID | REQ | | Section 18.38 | + | GETATTR | REQ | | Section 18.7 | +P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | +P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | + | GETFH | REQ | | Section 18.8 | +NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 | +P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | +P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | +P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | + | LINK | OPT | | Section 18.9 | + | LOCK | REQ | | Section 18.10 | + | LOCKT | REQ | | Section 18.11 | + | LOCKU | REQ | | Section 18.12 | + | LOOKUP | REQ | | Section 18.13 | + | LOOKUPP | REQ | | Section 18.14 | + | NVERIFY | REQ | | Section 18.15 | + | OPEN | REQ | | Section 18.16 | +NS*| OPENATTR | OPT | | Section 18.17 | + | OPEN_CONFIRM | MNI | | N/A | + | OPEN_DOWNGRADE | REQ | | Section 18.18 | + | PUTFH | REQ | | Section 18.19 | + | PUTPUBFH | REQ | | Section 18.20 | + | PUTROOTFH | REQ | | Section 18.21 | + | READ | REQ | | Section 18.22 | + | READDIR | REQ | | Section 18.23 | + | READLINK | OPT | | Section 18.24 | +NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | + | RELEASE_LOCKOWNER 
| MNI | | N/A | + | REMOVE | REQ | | Section 18.25 | + | RENAME | REQ | | Section 18.26 | + | RENEW | MNI | | N/A | + | RESTOREFH | REQ | | Section 18.27 | + | SAVEFH | REQ | | Section 18.28 | + | SECINFO | REQ | | Section 18.29 | +NS | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, | + | | | layout (REQ) | Section 13.12 | +I | SEQUENCE | REQ | | Section 18.46 | + | SETATTR | REQ | | Section 18.30 | + | SETCLIENTID | MNI | | N/A | + | SETCLIENTID_CONFIRM | MNI | | N/A | +NS | SET_SSV | REQ | | Section 18.47 | +NS | TEST_STATEID | REQ | | Section 18.48 | + | VERIFY | REQ | | Section 18.31 | +NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 | + | WRITE | REQ | | Section 18.32 | + +Callback Operations + + +-------------------------+-----------+-------------+---------------+ + | Operation | REQ, REC, | Feature | Definition | + | | OPT, or | (REQ, REC, | | + | | MNI | or OPT) | | + +-------------------------+-----------+-------------+---------------+ + | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 | +P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | +NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 | +P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | +NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 | +NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 | + | CB_RECALL | OPT | FDELG, | Section 20.2 | + | | | DDELG, pNFS | | + | | | (REQ) | | +NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 | + | | | DDELG, pNFS | | + | | | (REQ) | | +NS | CB_RECALL_SLOT | REQ | | Section 20.8 | +NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 | + | | | (REQ) | | +I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 | + | | | DDELG, pNFS | | + | | | (REQ) | | +NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 | + | | | DDELG, pNFS | | + | | | (REQ) | | + +-------------------------+-----------+-------------+---------------+ + +Implementation notes: + +EXCHANGE_ID: +* only SP4_NONE state protection supported +* implementation ids are ignored + 
+CREATE_SESSION: +* backchannel attributes are ignored +* backchannel security parameters are ignored + +SEQUENCE: +* no support for dynamic slot table renegotiation (optional) + +nfsv4.1 COMPOUND rules: +The following cases aren't supported yet: +* Enforcing of NFS4ERR_NOT_ONLY_OP for: BIND_CONN_TO_SESSION, CREATE_SESSION, + DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID. +* DESTROY_SESSION MUST be the final operation in the COMPOUND request. + -- cgit v1.2.3-70-g09d2 From 962281a7ab3aeb97eed004b8f0bdb0979cf35347 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:20 -0700 Subject: nilfs2: add document This adds a document describing the features, mount options, userland tools, usage, disk format, and related URLs for the nilfs2 file system. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/00-INDEX | 2 + Documentation/filesystems/nilfs2.txt | 202 +++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 Documentation/filesystems/nilfs2.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 52cd611277a..8dd6db76171 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -68,6 +68,8 @@ ncpfs.txt - info on Novell Netware(tm) filesystem using NCP protocol. nfsroot.txt - short guide on setting up a diskless box with NFS root filesystem. +nilfs2.txt + - info and mount options for the NILFS2 filesystem. ntfs.txt - info and mount options for the NTFS filesystem (Windows NT). ocfs2.txt diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt new file mode 100644 index 00000000000..3367fc44388 --- /dev/null +++ b/Documentation/filesystems/nilfs2.txt @@ -0,0 +1,202 @@ +NILFS2 +------ + +NILFS2 is a log-structured file system (LFS) supporting continuous +snapshotting. 
In addition to versioning capability of the entire file +system, users can even restore files mistakenly overwritten or +destroyed just a few seconds ago. Since NILFS2 can keep consistency +like conventional LFS, it achieves quick recovery after system +crashes. + +NILFS2 creates a number of checkpoints every few seconds or per +synchronous write basis (unless there is no change). Users can select +significant versions among continuously created checkpoints, and can +change them into snapshots which will be preserved until they are +changed back to checkpoints. + +There is no limit on the number of snapshots until the volume gets +full. Each snapshot is mountable as a read-only file system +concurrently with its writable mount, and this feature is convenient +for online backup. + +The userland tools are included in nilfs-utils package, which is +available from the following download page. At least "mkfs.nilfs2", +"mount.nilfs2", "umount.nilfs2", and "nilfs_cleanerd" (so called +cleaner or garbage collector) are required. Details on the tools are +described in the man pages included in the package. + +Project web page: http://www.nilfs.org/en/ +Download page: http://www.nilfs.org/en/download.html +Git tree web page: http://www.nilfs.org/git/ +NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users + +Caveats +======= + +Features which NILFS2 does not support yet: + + - atime + - extended attributes + - POSIX ACLs + - quotas + - writable snapshots + - remote backup (CDP) + - data integrity + - defragmentation + +Mount options +============= + +NILFS2 supports the following mount options: +(*) == default + +barrier=on(*) This enables/disables barriers. barrier=off disables + it, barrier=on enables it. +errors=continue(*) Keep going on a filesystem error. +errors=remount-ro Remount the filesystem read-only on an error. +errors=panic Panic and halt the machine if an error occurs. +cp=n Specify the checkpoint-number of the snapshot to be + mounted. 
Checkpoints and snapshots are listed by lscp + user command. Only the checkpoints marked as snapshot + are mountable with this option. Snapshot is read-only, + so a read-only mount option must be specified together. +order=relaxed(*) Apply relaxed order semantics that allows modified data + blocks to be written to disk without making a + checkpoint if no metadata update is going. This mode + is equivalent to the ordered data mode of the ext3 + filesystem except for the updates on data blocks still + conserve atomicity. This will improve synchronous + write performance for overwriting. +order=strict Apply strict in-order semantics that preserves sequence + of all file operations including overwriting of data + blocks. That means, it is guaranteed that no + overtaking of events occurs in the recovered file + system after a crash. + +NILFS2 usage +============ + +To use nilfs2 as a local file system, simply: + + # mkfs -t nilfs2 /dev/block_device + # mount -t nilfs2 /dev/block_device /dir + +This will also invoke the cleaner through the mount helper program +(mount.nilfs2). + +Checkpoints and snapshots are managed by the following commands. +Their manpages are included in the nilfs-utils package above. + + lscp list checkpoints or snapshots. + mkcp make a checkpoint or a snapshot. + chcp change an existing checkpoint to a snapshot or vice versa. + rmcp invalidate specified checkpoint(s). + +To mount a snapshot, + + # mount -t nilfs2 -r -o cp= /dev/block_device /snap_dir + +where is the checkpoint number of the snapshot. + +To unmount the NILFS2 mount point or snapshot, simply: + + # umount /dir + +Then, the cleaner daemon is automatically shut down by the umount +helper program (umount.nilfs2). + +Disk format +=========== + +A nilfs2 volume is equally divided into a number of segments except +for the super block (SB) and segment #0. A segment is the container +of logs. 
Each log is composed of summary information blocks, payload +blocks, and an optional super root block (SR): + + ______________________________________________________ + | |SB| | Segment | Segment | Segment | ... | Segment | | + |_|__|_|____0____|____1____|____2____|_____|____N____|_| + 0 +1K +4K +8M +16M +24M +(8MB x N) + . . (Typical offsets for 4KB-block) + . . + .______________________. + | log | log |... | log | + |__1__|__2__|____|__m__| + . . + . . + . . + .______________________________. + | Summary | Payload blocks |SR| + |_blocks__|_________________|__| + +The payload blocks are organized per file, and each file consists of +data blocks and B-tree node blocks: + + |<--- File-A --->|<--- File-B --->| + _______________________________________________________________ + | Data blocks | B-tree blocks | Data blocks | B-tree blocks | ... + _|_____________|_______________|_____________|_______________|_ + + +Since only the modified blocks are written in the log, it may have +files without data blocks or B-tree node blocks. + +The organization of the blocks is recorded in the summary information +blocks, which contain a header structure (nilfs_segment_summary), per +file structures (nilfs_finfo), and per block structures (nilfs_binfo): + + _________________________________________________________________________ + | Summary | finfo | binfo | ... | binfo | finfo | binfo | ... | binfo |... + |_blocks__|___A___|_(A,1)_|_____|(A,Na)_|___B___|_(B,1)_|_____|(B,Nb)_|___ + + +The logs include regular files, directory files, symbolic link files +and several meta data files. The meta data files are the files used +to maintain file system meta data. 
The current version of NILFS2 uses +the following meta data files: + + 1) Inode file (ifile) -- Stores on-disk inodes + 2) Checkpoint file (cpfile) -- Stores checkpoints + 3) Segment usage file (sufile) -- Stores allocation state of segments + 4) Data address translation file -- Maps virtual block numbers to usual + (DAT) block numbers. This file serves to + make on-disk blocks relocatable. + 5) Sketch file (sketch) -- Keeps read-only data which can be + associated with checkpoints (optional) + +The following figure shows a typical organization of the logs: + + _________________________________________________________________________ + | Summary | regular file | file | ... | ifile | cpfile | sufile | DAT |SR| + |_blocks__|_or_directory_|_______|_____|_______|________|________|_____|__| + + +To stride over segment boundaries, this sequence of files may be split +into multiple logs. The sequence of logs that should be treated as +logically one log, is delimited with flags marked in the segment +summary. The recovery code of nilfs2 looks at this boundary information +to ensure atomicity of updates. + +The super root block is inserted for every checkpoint. It includes +three special inodes, inodes for the DAT, cpfile, and sufile. Inodes +of regular files, directories, symlinks and other special files, are +included in the ifile. The inode of ifile itself is included in the +corresponding checkpoint entry in the cpfile. Thus, the hierarchy +among NILFS2 files can be depicted as follows: + + Super block (SB) + | + v + Super root block (the latest cno=xx) + |-- DAT + |-- sufile + `-- cpfile + |-- ifile (cno=c1) + |-- ifile (cno=c2) ---- file (ino=i1) + : : |-- file (ino=i2) + `-- ifile (cno=xx) |-- file (ino=i3) + : : + `-- file (ino=yy) + ( regular file, directory, or symlink ) + +For details on the format of each file, please see include/linux/nilfs2_fs.h. 
-- cgit v1.2.3-70-g09d2 From 458c5b0822a669d170fdb7bb16c9145f652ebe06 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 6 Apr 2009 19:01:56 -0700 Subject: nilfs2: clean up sketch file The sketch file is a file to mark checkpoints with user data. It was experimentally introduced in the original implementation, and now obsolete. The file was handled differently with regular files; the file size got truncated when a checkpoint was created. This stops the special treatment and will treat it as a regular file. Most users are not affected because mkfs.nilfs2 no longer makes this file. Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 2 -- fs/nilfs2/inode.c | 35 ++------------------------ fs/nilfs2/segment.c | 49 +----------------------------------- fs/nilfs2/segment.h | 8 ------ include/linux/nilfs2_fs.h | 2 -- 5 files changed, 3 insertions(+), 93 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 3367fc44388..55c4300abfc 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -161,8 +161,6 @@ the following meta data files: 4) Data address translation file -- Maps virtual block numbers to usual (DAT) block numbers. This file serves to make on-disk blocks relocatable. 
- 5) Sketch file (sketch) -- Keeps read-only data which can be - associated with checkpoints (optional) The following figure shows a typical organization of the logs: diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b6536bb2a32..a1922b17662 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -418,30 +418,6 @@ int nilfs_read_inode_common(struct inode *inode, return 0; } -static int nilfs_read_sketch_inode(struct inode *inode) -{ - struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); - int err = 0; - - if (sbi->s_snapshot_cno) { - struct the_nilfs *nilfs = sbi->s_nilfs; - struct buffer_head *bh_cp; - struct nilfs_checkpoint *raw_cp; - - err = nilfs_cpfile_get_checkpoint( - nilfs->ns_cpfile, sbi->s_snapshot_cno, 0, &raw_cp, - &bh_cp); - if (likely(!err)) { - if (!nilfs_checkpoint_sketch(raw_cp)) - inode->i_size = 0; - nilfs_cpfile_put_checkpoint( - nilfs->ns_cpfile, sbi->s_snapshot_cno, bh_cp); - } - inode->i_flags |= S_NOCMTIME; - } - return err; -} - static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, struct inode *inode) { @@ -469,11 +445,6 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, inode->i_op = &nilfs_file_inode_operations; inode->i_fop = &nilfs_file_operations; inode->i_mapping->a_ops = &nilfs_aops; - if (unlikely(inode->i_ino == NILFS_SKETCH_INO)) { - err = nilfs_read_sketch_inode(inode); - if (unlikely(err)) - goto failed_unmap; - } } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &nilfs_dir_inode_operations; inode->i_fop = &nilfs_dir_operations; @@ -742,8 +713,7 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); - if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state) || - unlikely(inode->i_ino == NILFS_SKETCH_INO)) + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) return 0; spin_lock(&sbi->s_inode_lock); @@ -811,7 +781,6 @@ void nilfs_dirty_inode(struct inode *inode) return; } nilfs_transaction_begin(inode->i_sb, &ti, 
0); - if (likely(inode->i_ino != NILFS_SKETCH_INO)) - nilfs_mark_inode_dirty(inode); + nilfs_mark_inode_dirty(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9a87410985b..981c34a0cd6 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -67,7 +67,6 @@ enum { NILFS_ST_INIT = 0, NILFS_ST_GC, /* Collecting dirty blocks for GC */ NILFS_ST_FILE, - NILFS_ST_SKETCH, NILFS_ST_IFILE, NILFS_ST_CPFILE, NILFS_ST_SUFILE, @@ -887,8 +886,7 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); - if (sci->sc_sketch_inode && i_size_read(sci->sc_sketch_inode) > 0) - nilfs_checkpoint_set_sketch(raw_cp); + nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); return 0; @@ -923,11 +921,6 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, nilfs_fill_in_file_bmap(ifile, ii); set_bit(NILFS_I_COLLECTED, &ii->i_state); } - if (sci->sc_sketch_inode) { - ii = NILFS_I(sci->sc_sketch_inode); - if (test_bit(NILFS_I_DIRTY, &ii->i_state)) - nilfs_fill_in_file_bmap(ifile, ii); - } } /* @@ -1228,26 +1221,6 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_stage.scnt = NILFS_ST_DONE; return 0; } - sci->sc_stage.scnt++; /* Fall through */ - case NILFS_ST_SKETCH: - if (mode == SC_LSEG_SR && sci->sc_sketch_inode) { - ii = NILFS_I(sci->sc_sketch_inode); - if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { - sci->sc_sketch_inode->i_ctime.tv_sec - = sci->sc_seg_ctime; - sci->sc_sketch_inode->i_mtime.tv_sec - = sci->sc_seg_ctime; - err = nilfs_mark_inode_dirty( - sci->sc_sketch_inode); - if (unlikely(err)) - goto break_or_fail; - } - err = nilfs_segctor_scan_file(sci, - sci->sc_sketch_inode, - &nilfs_sc_file_ops); - 
if (unlikely(err)) - goto break_or_fail; - } sci->sc_stage.scnt++; sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; /* Fall through */ @@ -2385,13 +2358,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) } while (sci->sc_stage.scnt != NILFS_ST_DONE); - /* Clearing sketch data */ - if (has_sr && sci->sc_sketch_inode) { - if (i_size_read(sci->sc_sketch_inode) == 0) - clear_bit(NILFS_I_DIRTY, - &NILFS_I(sci->sc_sketch_inode)->i_state); - i_size_write(sci->sc_sketch_inode, 0); - } out: nilfs_segctor_destroy_segment_buffers(sci); nilfs_segctor_check_out_files(sci, sbi); @@ -2971,11 +2937,6 @@ static int nilfs_segctor_init(struct nilfs_sc_info *sci, struct nilfs_recovery_info *ri) { int err; - struct inode *inode = nilfs_iget(sci->sc_super, NILFS_SKETCH_INO); - - sci->sc_sketch_inode = IS_ERR(inode) ? NULL : inode; - if (sci->sc_sketch_inode) - i_size_write(sci->sc_sketch_inode, 0); sci->sc_seq_done = sci->sc_seq_request; if (ri) @@ -2987,10 +2948,6 @@ static int nilfs_segctor_init(struct nilfs_sc_info *sci, if (ri) list_splice_init(&sci->sc_active_segments, ri->ri_used_segments.prev); - if (sci->sc_sketch_inode) { - iput(sci->sc_sketch_inode); - sci->sc_sketch_inode = NULL; - } } return err; } @@ -3090,10 +3047,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) WARN_ON(!list_empty(&sci->sc_segbufs)); - if (sci->sc_sketch_inode) { - iput(sci->sc_sketch_inode); - sci->sc_sketch_inode = NULL; - } down_write(&sbi->s_nilfs->ns_segctor_sem); kfree(sci); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 2dd39da9f38..fbd162d7170 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -108,7 +108,6 @@ struct nilfs_segsum_pointer { * @sc_nblk_this_inc: Number of blocks included in the current logical segment * @sc_seg_ctime: Creation time * @sc_flags: Internal flags - * @sc_sketch_inode: Inode of the sketch file * @sc_state_lock: spinlock for sc_state and so on * @sc_state: Segctord state flags * @sc_flush_request: inode 
bitmap of metadata files to be flushed @@ -158,13 +157,6 @@ struct nilfs_sc_info { unsigned long sc_flags; - /* - * Pointer to an inode of the sketch. - * This pointer is kept only while it contains data. - * We protect it with a semaphore of the segment constructor. - */ - struct inode *sc_sketch_inode; - spinlock_t sc_state_lock; unsigned long sc_state; unsigned long sc_flush_request; diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index aa93f0ee29d..e9c84aa4a8e 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -494,7 +494,6 @@ nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) NILFS_CHECKPOINT_FNS(INVALID, invalid) -NILFS_CHECKPOINT_FNS(SKETCH, sketch) /** * struct nilfs_cpinfo - checkpoint information @@ -527,7 +526,6 @@ nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ NILFS_CPINFO_FNS(SNAPSHOT, snapshot) NILFS_CPINFO_FNS(INVALID, invalid) -NILFS_CPINFO_FNS(SKETCH, sketch) /** -- cgit v1.2.3-70-g09d2 From e0ca87391694dfacd01465d5c01c579c3b8b63e0 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Fri, 27 Mar 2009 15:04:29 +0300 Subject: Staging: Pohmelfs: Added IO permissions and priorities. 
Signed-off-by: Evgeniy Polyakov Signed-off-by: Greg Kroah-Hartman --- .../filesystems/pohmelfs/design_notes.txt | 5 +- Documentation/filesystems/pohmelfs/info.txt | 21 ++++++-- drivers/staging/pohmelfs/config.c | 61 ++++++++++++++++++++-- drivers/staging/pohmelfs/netfs.h | 1 + drivers/staging/pohmelfs/trans.c | 30 ++++------- 5 files changed, 86 insertions(+), 32 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt index 6d6db60d567..dcf83358716 100644 --- a/Documentation/filesystems/pohmelfs/design_notes.txt +++ b/Documentation/filesystems/pohmelfs/design_notes.txt @@ -56,9 +56,10 @@ workloads and can fully utilize the bandwidth to the servers when doing bulk data transfers. POHMELFS clients operate with a working set of servers and are capable of balancing read-only -operations (like lookups or directory listings) between them. +operations (like lookups or directory listings) between them according to IO priorities. Administrators can add or remove servers from the set at run-time via special commands (described -in Documentation/pohmelfs/info.txt file). Writes are replicated to all servers. +in Documentation/pohmelfs/info.txt file). Writes are replicated to all servers, which are connected +with write permission turned on. IO priority and permissions can be changed at run-time. POHMELFS is capable of full data channel encryption and/or strong crypto hashing. One can select any kernel supported cipher, encryption mode, hash type and operation mode diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt index 4e3d5015708..db2e4139362 100644 --- a/Documentation/filesystems/pohmelfs/info.txt +++ b/Documentation/filesystems/pohmelfs/info.txt @@ -1,6 +1,8 @@ POHMELFS usage information. -Mount options: +Mount options. +All but index, number of crypto threads and maximum IO size can be changed via remount. 
+ idx=%u Each mountpoint is associated with a special index via this option. Administrator can add or remove servers from the given index, so all mounts, @@ -52,16 +54,27 @@ mcache_timeout=%u Usage examples. -Add (or remove if it already exists) server server1.net:1025 into the working set with index $idx +Add server server1.net:1025 into the working set with index $idx with appropriate hash algorithm and key file and cipher algorithm, mode and key file: -$cfg -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key +$cfg A add -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key Mount filesystem with given index $idx to /mnt mountpoint. Client will connect to all servers specified in the working set via previous command: mount -t pohmel -o idx=$idx q /mnt -One can add or remove servers from working set after mounting too. +Change permissions to read-only (-I 1 option, '-I 2' - write-only, 3 - rw): +$cfg A modify -a server1.net -p 1025 -i $idx -I 1 + +Change IO priority to 123 (node with the highest priority gets read requests). +$cfg A modify -a server1.net -p 1025 -i $idx -P 123 +One can check current status of all connections in the mountstats file: +# cat /proc/$PID/mountstats +... +device none mounted on /mnt with fstype pohmel +idx addr(:port) socket_type protocol active priority permissions +0 server1.net:1026 1 6 1 250 1 +0 server2.net:1025 1 6 1 123 3 Server installation. 
diff --git a/drivers/staging/pohmelfs/config.c b/drivers/staging/pohmelfs/config.c index 3e67da9ea38..a6eaa42fb66 100644 --- a/drivers/staging/pohmelfs/config.c +++ b/drivers/staging/pohmelfs/config.c @@ -81,6 +81,45 @@ static struct pohmelfs_config_group *pohmelfs_find_create_config_group(unsigned return g; } +static inline void pohmelfs_insert_config_entry(struct pohmelfs_sb *psb, struct pohmelfs_config *dst) +{ + struct pohmelfs_config *tmp; + + INIT_LIST_HEAD(&dst->config_entry); + + list_for_each_entry(tmp, &psb->state_list, config_entry) { + if (dst->state.ctl.prio > tmp->state.ctl.prio) + list_add_tail(&dst->config_entry, &tmp->config_entry); + } + if (list_empty(&dst->config_entry)) + list_add_tail(&dst->config_entry, &psb->state_list); +} + +static int pohmelfs_move_config_entry(struct pohmelfs_sb *psb, + struct pohmelfs_config *dst, struct pohmelfs_config *new) +{ + if ((dst->state.ctl.prio == new->state.ctl.prio) && + (dst->state.ctl.perm == new->state.ctl.perm)) + return 0; + + dprintk("%s: dst: prio: %d, perm: %x, new: prio: %d, perm: %d.\n", + __func__, dst->state.ctl.prio, dst->state.ctl.perm, + new->state.ctl.prio, new->state.ctl.perm); + dst->state.ctl.prio = new->state.ctl.prio; + dst->state.ctl.perm = new->state.ctl.perm; + + list_del_init(&dst->config_entry); + pohmelfs_insert_config_entry(psb, dst); + return 0; +} + +/* + * pohmelfs_copy_config() is used to copy new state configs from the + * config group (controlled by the netlink messages) into the superblock. + * This happens either at startup time where no transactions can access + * the list of the configs (and thus list of the network states), or at + * run-time, where it is protected by the psb->state_lock. 
+ */ int pohmelfs_copy_config(struct pohmelfs_sb *psb) { struct pohmelfs_config_group *g; @@ -103,7 +142,9 @@ int pohmelfs_copy_config(struct pohmelfs_sb *psb) err = 0; list_for_each_entry(dst, &psb->state_list, config_entry) { if (pohmelfs_config_eql(&dst->state.ctl, &c->state.ctl)) { - err = -EEXIST; + err = pohmelfs_move_config_entry(psb, dst, c); + if (!err) + err = -EEXIST; break; } } @@ -119,7 +160,7 @@ int pohmelfs_copy_config(struct pohmelfs_sb *psb) memcpy(&dst->state.ctl, &c->state.ctl, sizeof(struct pohmelfs_ctl)); - list_add_tail(&dst->config_entry, &psb->state_list); + pohmelfs_insert_config_entry(psb, dst); err = pohmelfs_state_init_one(psb, dst); if (err) { @@ -248,6 +289,13 @@ out_unlock: return err; } +static int pohmelfs_modify_config(struct pohmelfs_ctl *old, struct pohmelfs_ctl *new) +{ + old->perm = new->perm; + old->prio = new->prio; + return 0; +} + static int pohmelfs_cn_ctl(struct cn_msg *msg, int action) { struct pohmelfs_config_group *g; @@ -278,6 +326,9 @@ static int pohmelfs_cn_ctl(struct cn_msg *msg, int action) g->num_entry--; kfree(c); goto out_unlock; + } else if (action == POHMELFS_FLAGS_MODIFY) { + err = pohmelfs_modify_config(sc, ctl); + goto out_unlock; } else { err = -EEXIST; goto out_unlock; @@ -296,6 +347,7 @@ static int pohmelfs_cn_ctl(struct cn_msg *msg, int action) } memcpy(&c->state.ctl, ctl, sizeof(struct pohmelfs_ctl)); g->num_entry++; + list_add_tail(&c->config_entry, &g->config_list); out_unlock: @@ -401,10 +453,9 @@ static void pohmelfs_cn_callback(void *data) switch (msg->flags) { case POHMELFS_FLAGS_ADD: - err = pohmelfs_cn_ctl(msg, POHMELFS_FLAGS_ADD); - break; case POHMELFS_FLAGS_DEL: - err = pohmelfs_cn_ctl(msg, POHMELFS_FLAGS_DEL); + case POHMELFS_FLAGS_MODIFY: + err = pohmelfs_cn_ctl(msg, msg->flags); break; case POHMELFS_FLAGS_SHOW: err = pohmelfs_cn_disp(msg); diff --git a/drivers/staging/pohmelfs/netfs.h b/drivers/staging/pohmelfs/netfs.h index 7700e2bf3cc..c78cfcb042f 100644 --- 
a/drivers/staging/pohmelfs/netfs.h +++ b/drivers/staging/pohmelfs/netfs.h @@ -87,6 +87,7 @@ enum { POHMELFS_FLAGS_DEL, /* Network state control message for DEL */ POHMELFS_FLAGS_SHOW, /* Network state control message for SHOW */ POHMELFS_FLAGS_CRYPTO, /* Crypto data control message */ + POHMELFS_FLAGS_MODIFY, /* Network state modification message */ }; /* diff --git a/drivers/staging/pohmelfs/trans.c b/drivers/staging/pohmelfs/trans.c index b89f9f36b35..168fc8998c3 100644 --- a/drivers/staging/pohmelfs/trans.c +++ b/drivers/staging/pohmelfs/trans.c @@ -456,34 +456,22 @@ int netfs_trans_finish_send(struct netfs_trans *t, struct pohmelfs_sb *psb) __func__, t, t->gen, t->iovec.iov_len, t->page_num, psb->active_state); #endif mutex_lock(&psb->state_lock); - - if ((t->flags & NETFS_TRANS_SINGLE_DST) && psb->active_state) { - st = &psb->active_state->state; - - err = -EPIPE; - if (netfs_state_poll(st) & POLLOUT) { - err = netfs_trans_push_dst(t, st); - if (!err) { - err = netfs_trans_send(t, st); - if (err) { - netfs_trans_drop_last(t, st); - } else { - pohmelfs_switch_active(psb); - goto out; - } - } - } - pohmelfs_switch_active(psb); - } - list_for_each_entry(c, &psb->state_list, config_entry) { st = &c->state; + if (t->flags & NETFS_TRANS_SINGLE_DST) { + if (!(st->ctl.perm & POHMELFS_IO_PERM_READ)) + continue; + } else { + if (!(st->ctl.perm & POHMELFS_IO_PERM_WRITE)) + continue; + } + err = netfs_trans_push(t, st); if (!err && (t->flags & NETFS_TRANS_SINGLE_DST)) break; } -out: + mutex_unlock(&psb->state_lock); #if 0 dprintk("%s: fully sent t: %p, gen: %u, size: %u, page_num: %u, err: %d.\n", -- cgit v1.2.3-70-g09d2 From 66672fefaa91802fec51c3fe0cc55bc9baea5a2d Mon Sep 17 00:00:00 2001 From: Adrian McMenamin Date: Mon, 20 Apr 2009 18:38:28 -0700 Subject: Documentation/filesystems: remove out of date reference to BKL being held Documentation/filesystems/vfs.txt incorrectly states that the kernel is locked during the call to statfs (Documentation/filesystems/Locking 
correctly says it is not). This patch removes the offending sentence. remove reference to BKL being held in statfs Signed-off-by: Adrian McMenamin Signed-off-by: Randy Dunlap Cc: Alexander Viro Signed-off-by: Al Viro --- Documentation/filesystems/vfs.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index deeeed0faa8..f49eecf2e57 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -277,8 +277,7 @@ or bottom half). unfreeze_fs: called when VFS is unlocking a filesystem and making it writable again. - statfs: called when the VFS needs to get filesystem statistics. This - is called with the kernel lock held + statfs: called when the VFS needs to get filesystem statistics. remount_fs: called when the filesystem is remounted. This is called with the kernel lock held -- cgit v1.2.3-70-g09d2 From 91ac033d8377552d3654501a105ab55bf546940e Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 23 Apr 2009 11:21:55 +0100 Subject: CacheFiles: Fix the documentation to use the correct credential pointer names Adjust the CacheFiles documentation to use the correct names of the credential pointers in task_struct. The documentation was using names from the old versions of the credentials patches. Signed-off-by: Marc Dionne Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- Documentation/filesystems/caching/cachefiles.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/caching/cachefiles.txt b/Documentation/filesystems/caching/cachefiles.txt index c78a49b7bba..748a1ae49e1 100644 --- a/Documentation/filesystems/caching/cachefiles.txt +++ b/Documentation/filesystems/caching/cachefiles.txt @@ -407,7 +407,7 @@ A NOTE ON SECURITY ================== CacheFiles makes use of the split security in the task_struct. 
It allocates -its own task_security structure, and redirects current->act_as to point to it +its own task_security structure, and redirects current->cred to point to it when it acts on behalf of another process, in that process's context. The reason it does this is that it calls vfs_mkdir() and suchlike rather than @@ -429,9 +429,9 @@ This means it may lose signals or ptrace events for example, and affects what the process looks like in /proc. So CacheFiles makes use of a logical split in the security between the -objective security (task->sec) and the subjective security (task->act_as). The -objective security holds the intrinsic security properties of a process and is -never overridden. This is what appears in /proc, and is what is used when a +objective security (task->real_cred) and the subjective security (task->cred). +The objective security holds the intrinsic security properties of a process and +is never overridden. This is what appears in /proc, and is what is used when a process is the target of an operation by some other process (SIGKILL for example). -- cgit v1.2.3-70-g09d2 From b827e496c893de0c0f142abfaeb8730a2fd6b37f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 30 Apr 2009 15:08:16 -0700 Subject: mm: close page_mkwrite races Change page_mkwrite to allow implementations to return with the page locked, and also change its callers (in page fault paths) to hold the lock until the page is marked dirty. This allows the filesystem to have full control of page dirtying events coming from the VM. Rather than simply hold the page locked over the page_mkwrite call, we call page_mkwrite with the page unlocked and allow callers to return with it locked, so filesystems can avoid LOR conditions with page lock. The problem with the current scheme is this: a filesystem that wants to associate some metadata with a page as long as the page is dirty, will perform this manipulation in its ->page_mkwrite. 
It currently then must return with the page unlocked and may not hold any other locks (according to existing page_mkwrite convention). In this window, the VM could write out the page, clearing page-dirty. The filesystem has no good way to detect that a dirty pte is about to be attached, so it will happily write out the page, at which point, the filesystem may manipulate the metadata to reflect that the page is no longer dirty. It is not always possible to perform the required metadata manipulation in ->set_page_dirty, because that function cannot block or fail. The filesystem may need to allocate some data structure, for example. And the VM cannot mark the pte dirty before page_mkwrite, because page_mkwrite is allowed to fail, so we must not allow any window where the page could be written to if page_mkwrite does fail. This solution of holding the page locked over the 3 critical operations (page_mkwrite, setting the pte dirty, and finally setting the page dirty) closes out races nicely, preventing page cleaning for writeout being initiated in that window. This provides the filesystem with a strong synchronisation against the VM here. - Sage needs this race closed for ceph filesystem. - Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913). - I need it for fsblock. - I suspect other filesystems may need it too (eg. btrfs). - I have converted buffer.c to the new locking. Even simple block allocation under dirty pages might be susceptible to i_size changing under partial page at the end of file (we also have a buffer.c-side problem here, but it cannot be fixed properly without this patch). - Other filesystems (eg. NFS, maybe btrfs) will need to change their page_mkwrite functions themselves. [ This also moves page_mkwrite another step closer to fault, which should eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a filesystem calldown and page lock/unlock cycle in __do_fault. 
] [akpm@linux-foundation.org: fix derefs of NULL ->mapping] Cc: Sage Weil Cc: Trond Myklebust Signed-off-by: Nick Piggin Cc: Valdis Kletnieks Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 24 ++++++--- fs/buffer.c | 10 ++-- mm/memory.c | 108 +++++++++++++++++++++++++++----------- 3 files changed, 98 insertions(+), 44 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 76efe5b71d7..3120f8dd2c3 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -512,16 +512,24 @@ locking rules: BKL mmap_sem PageLocked(page) open: no yes close: no yes -fault: no yes -page_mkwrite: no yes no +fault: no yes can return with page locked +page_mkwrite: no yes can return with page locked access: no yes - ->page_mkwrite() is called when a previously read-only page is -about to become writeable. The file system is responsible for -protecting against truncate races. Once appropriate action has been -taking to lock out truncate, the page range should be verified to be -within i_size. The page mapping should also be checked that it is not -NULL. + ->fault() is called when a previously not present pte is about +to be faulted in. The filesystem must find and return the page associated +with the passed in "pgoff" in the vm_fault structure. If it is possible that +the page may be truncated and/or invalidated, then the filesystem must lock +the page, then ensure it is not already truncated (the page lock will block +subsequent truncate), and then return with VM_FAULT_LOCKED, and the page +locked. The VM will unlock the page. + + ->page_mkwrite() is called when a previously read-only pte is +about to become writeable. The filesystem again must ensure that there are +no truncate/invalidate races, and then return with the page locked. 
If +the page has been truncated, the filesystem should not look up a new page +like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which +will cause the VM to retry the fault. ->access() is called when get_user_pages() fails in acces_process_vm(), typically used to debug a process through diff --git a/fs/buffer.c b/fs/buffer.c index b3e5be7514f..aed297739eb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2397,7 +2397,8 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if ((page->mapping != inode->i_mapping) || (page_offset(page) > size)) { /* page got truncated out from underneath us */ - goto out_unlock; + unlock_page(page); + goto out; } /* page is wholly or partially inside EOF */ @@ -2411,14 +2412,15 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, ret = block_commit_write(page, 0, end); if (unlikely(ret)) { + unlock_page(page); if (ret == -ENOMEM) ret = VM_FAULT_OOM; else /* -ENOSPC, -EIO, etc */ ret = VM_FAULT_SIGBUS; - } + } else + ret = VM_FAULT_LOCKED; -out_unlock: - unlock_page(page); +out: return ret; } diff --git a/mm/memory.c b/mm/memory.c index 6a4ef0fd071..4126dd16778 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = tmp; goto unwritable_page; } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(old_page); + if (!old_page->mapping) { + ret = 0; /* retry the fault */ + unlock_page(old_page); + goto unwritable_page; + } + } else + VM_BUG_ON(!PageLocked(old_page)); /* * Since we dropped the lock we need to revalidate @@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - page_cache_release(old_page); - if (!pte_same(*page_table, orig_pte)) + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + page_cache_release(old_page); goto unlock; + } page_mkwrite = 1; } @@ -2094,9 +2105,6 @@ 
gotten: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); - /* * Yes, Virginia, this is actually required to prevent a race * with clear_page_dirty_for_io() from clearing the page dirty @@ -2105,16 +2113,41 @@ unlock: * * do_no_page is protected similarly. */ - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); } return ret; oom_free_new: page_cache_release(new_page); oom: - if (old_page) + if (old_page) { + if (page_mkwrite) { + unlock_page(old_page); + page_cache_release(old_page); + } page_cache_release(old_page); + } return VM_FAULT_OOM; unwritable_page: @@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, int tmp; unlock_page(page); - vmf.flags |= FAULT_FLAG_MKWRITE; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; tmp = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret = tmp; - anon = 1; /* no anon but release vmf.page */ - goto out_unlocked; - } - lock_page(page); - /* - * XXX: this is not quite right (racy vs - * invalidate) to unlock and relock the page - * like this, however a better fix requires - * reworking page_mkwrite locking API, which - * is better done later. 
- */ - if (!page->mapping) { - ret = 0; - anon = 1; /* no anon but release vmf.page */ - goto out; + goto unwritable_page; } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + ret = 0; /* retry the fault */ + unlock_page(page); + goto unwritable_page; + } + } else + VM_BUG_ON(!PageLocked(page)); page_mkwrite = 1; } } @@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); out: - unlock_page(vmf.page); -out_unlocked: - if (anon) - page_cache_release(vmf.page); - else if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); + if (dirty_page) { + struct address_space *mapping = page->mapping; - set_page_dirty_balance(dirty_page, page_mkwrite); + if (set_page_dirty(dirty_page)) + page_mkwrite = 1; + unlock_page(dirty_page); put_page(dirty_page); + if (page_mkwrite && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + } else { + unlock_page(vmf.page); + if (anon) + page_cache_release(vmf.page); } return ret; + +unwritable_page: + page_cache_release(page); + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3-70-g09d2