diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/Locking | 30 | ||||
-rw-r--r-- | Documentation/filesystems/caching/fscache.txt | 10 | ||||
-rw-r--r-- | Documentation/filesystems/nilfs2.txt | 12 | ||||
-rw-r--r-- | Documentation/filesystems/porting | 45 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 97 | ||||
-rw-r--r-- | Documentation/filesystems/squashfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/sysfs-pci.txt | 7 | ||||
-rw-r--r-- | Documentation/filesystems/sysfs.txt | 46 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 6 |
9 files changed, 164 insertions, 91 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 96d4293607e..2db4283efa8 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -92,8 +92,8 @@ prototypes: void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*drop_inode) (struct inode *); - void (*delete_inode) (struct inode *); + int (*drop_inode) (struct inode *); + void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); @@ -101,14 +101,13 @@ prototypes: int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); - void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct vfsmount *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); locking rules: - All may block. + All may block [not true, see below] None have BKL s_umount alloc_inode: @@ -116,22 +115,25 @@ destroy_inode: dirty_inode: (must not sleep) write_inode: drop_inode: !!!inode_lock!!! -delete_inode: +evict_inode: put_super: write write_super: read sync_fs: read freeze_fs: read unfreeze_fs: read -statfs: no -remount_fs: maybe (see below) -clear_inode: +statfs: maybe(read) (see below) +remount_fs: write umount_begin: no show_options: no (namespace_sem) quota_read: no (see below) quota_write: no (see below) -->remount_fs() will have the s_umount exclusive lock if it's already mounted. -When called from get_sb_single, it does NOT have the s_umount lock. 
+->statfs() has s_umount (shared) when called by ustat(2) (native or +compat), but that's an accident of bad API; s_umount is used to pin +the superblock down when we only have dev_t given us by userland to +identify the superblock. Everything else (statfs(), fstatfs(), etc.) +doesn't hold it when calling ->statfs() - superblock is pinned down +by resolving the pathname passed to syscall. ->quota_read() and ->quota_write() functions are both guaranteed to be the only ones operating on the quota file by the quota code (via dqio_sem) (unless an admin really wants to screw up something and @@ -372,8 +374,6 @@ prototypes: ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); - int (*ioctl) (struct inode *, struct file *, unsigned int, - unsigned long); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -407,8 +407,7 @@ write: no aio_write: no readdir: no poll: no -ioctl: yes (see below) -unlocked_ioctl: no (see below) +unlocked_ioctl: no compat_ioctl: no mmap: no open: no @@ -451,9 +450,6 @@ move ->readdir() to inode_operations and use a separate method for directory anything that resembles union-mount we won't have a struct file for all components. And there are other reasons why the current interface is a mess... -->ioctl() on regular files is superceded by the ->unlocked_ioctl() that -doesn't take the BKL. - ->read on directories probably must go away - we should just enforce -EISDIR in sys_read() and friends. 
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index a91e2e2095b..770267af5b3 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt @@ -343,8 +343,8 @@ This will look something like: [root@andromeda ~]# head /proc/fs/fscache/objects OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================ - 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a - 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a + 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a + 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a where the first set of columns before the '|' describe the object: @@ -362,7 +362,7 @@ where the first set of columns before the '|' describe the object: EM Object's event mask EV Events raised on this object F Object flags - S Object slow-work work item flags + S Object work item busy state mask (1:pending 2:running) and the second set of columns describe the object's cookie, if present: @@ -395,8 +395,8 @@ and the following paired letters: w Show objects that don't have pending writes R Show objects that have outstanding 
reads r Show objects that don't have outstanding reads - S Show objects that have slow work queued - s Show objects that don't have slow work queued + S Show objects that have work queued + s Show objects that don't have work queued If neither side of a letter pair is given, then both are implied. For example: diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index d3e7673995e..d5c0cef38a7 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -49,7 +49,10 @@ Mount options NILFS2 supports the following mount options: (*) == default -nobarrier Disables barriers. +barrier(*) This enables/disables the use of write barriers. This +nobarrier requires an IO stack which can support barriers, and + if nilfs gets an error on a barrier write, it will + disable again with a warning. errors=continue Keep going on a filesystem error. errors=remount-ro(*) Remount the filesystem read-only on an error. errors=panic Panic and halt the machine if an error occurs. @@ -74,9 +77,10 @@ norecovery Disable recovery of the filesystem on mount. This disables every write access on the device for read-only mounts or snapshots. This option will fail for r/w mounts on an unclean volume. -discard Issue discard/TRIM commands to the underlying block - device when blocks are freed. This is useful for SSD - devices and sparse/thinly-provisioned LUNs. +discard This enables/disables the use of discard/TRIM commands. +nodiscard(*) The discard/TRIM commands are sent to the underlying + block device when blocks are freed. This is useful + for SSD devices and sparse/thinly-provisioned LUNs. NILFS2 usage ============ diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index a7e9746ee7e..b12c8953868 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -273,3 +273,48 @@ it's safe to remove it. If you don't need it, remove it. 
deliberate; as soon as struct block_device * is propagated in a reasonable way by that code fixing will become trivial; until then nothing can be done. + +[mandatory] + + block truncation on error exit from ->write_begin, and ->direct_IO +moved from generic methods (block_write_begin, cont_write_begin, +nobh_write_begin, blockdev_direct_IO*) to callers. Take a look at +ext2_write_failed and callers for an example. + +[mandatory] + + ->truncate is going away. The whole truncate sequence needs to be +implemented in ->setattr, which is now mandatory for filesystems +implementing on-disk size changes. Start with a copy of the old inode_setattr +and vmtruncate, and then reorder the vmtruncate + foofs_vmtruncate sequence to +be in order of zeroing blocks using block_truncate_page or similar helpers, +size update and finally on-disk truncation which should not fail. +inode_change_ok now includes the size checks for ATTR_SIZE and must be called +in the beginning of ->setattr unconditionally. + +[mandatory] + + ->clear_inode() and ->delete_inode() are gone; ->evict_inode() should +be used instead. It gets called whenever the inode is evicted, whether it has +remaining links or not. Caller does *not* evict the pagecache or inode-associated +metadata buffers; getting rid of those is the responsibility of the method, as it had +been for ->delete_inode(). + ->drop_inode() returns int now; it's called on final iput() with inode_lock +held and it returns true if the filesystem wants the inode to be dropped. As before, +generic_drop_inode() is still the default and it's been updated appropriately. +generic_delete_inode() is also alive and it consists simply of return 1. Note that +all actual eviction work is done by caller after ->drop_inode() returns. + clear_inode() is gone; use end_writeback() instead. As before, it must +be called exactly once on each call of ->evict_inode() (as it used to be for +each call of ->delete_inode()). 
Unlike before, if you are using inode-associated +metadata buffers (i.e. mark_buffer_dirty_inode()), it's your responsibility to +call invalidate_inode_buffers() before end_writeback(). + No async writeback (and thus no calls of ->write_inode()) will happen +after end_writeback() returns, so actions that should not overlap with ->write_inode() +(e.g. freeing on-disk inode if i_nlink is 0) ought to be done after that call. + + NOTE: checking i_nlink in the beginning of ->write_inode() and bailing out +if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput() +may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly +free the on-disk inode, you may end up doing that while ->write_inode() is writing +to it. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 8fe8895894d..a6aca874088 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -33,7 +33,8 @@ Table of Contents 2 Modifying System Parameters 3 Per-Process Parameters - 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score + 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer + score 3.2 /proc/<pid>/oom_score - Display current oom-killer score 3.3 /proc/<pid>/io - Display the IO accounting fields 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings @@ -1234,42 +1235,64 @@ of the kernel. CHAPTER 3: PER-PROCESS PARAMETERS ------------------------------------------------------------------------------ -3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score ------------------------------------------------------- - -This file can be used to adjust the score used to select which processes -should be killed in an out-of-memory situation. Giving it a high score will -increase the likelihood of this process being killed by the oom-killer. Valid -values are in the range -16 to +15, plus the special value -17, which disables -oom-killing altogether for this process. 
- -The process to be killed in an out-of-memory situation is selected among all others -based on its badness score. This value equals the original memory size of the process -and is then updated according to its CPU time (utime + stime) and the -run time (uptime - start time). The longer it runs the smaller is the score. -Badness score is divided by the square root of the CPU time and then by -the double square root of the run time. - -Swapped out tasks are killed first. Half of each child's memory size is added to -the parent's score if they do not share the same memory. Thus forking servers -are the prime candidates to be killed. Having only one 'hungry' child will make -parent less preferable than the child. - -/proc/<pid>/oom_score shows process' current badness score. - -The following heuristics are then applied: - * if the task was reniced, its score doubles - * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE - or CAP_SYS_RAWIO) have their score divided by 4 - * if oom condition happened in one cpuset and checked process does not belong - to it, its score is divided by 8 - * the resulting score is multiplied by two to the power of oom_adj, i.e. - points <<= oom_adj when it is positive and - points >>= -(oom_adj) otherwise - -The task with the highest badness score is then selected and its children -are killed, process itself will be killed in an OOM situation when it does -not have children or some of them disabled oom like described above. +3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score +-------------------------------------------------------------------------------- + +These files can be used to adjust the badness heuristic used to select which +process gets killed in out of memory conditions. + +The badness heuristic assigns a value to each candidate task ranging from 0 +(never kill) to 1000 (always kill) to determine which process is targeted. 
The +units are roughly a proportion along that range of allowed memory the process +may allocate from based on an estimation of its current memory and swap use. +For example, if a task is using all allowed memory, its badness score will be +1000. If it is using half of its allowed memory, its score will be 500. + +There is an additional factor included in the badness score: root +processes are given 3% extra memory over other tasks. + +The amount of "allowed" memory depends on the context in which the oom killer +was called. If it is due to the memory assigned to the allocating task's cpuset +being exhausted, the allowed memory represents the set of mems assigned to that +cpuset. If it is due to a mempolicy's node(s) being exhausted, the allowed +memory represents the set of mempolicy nodes. If it is due to a memory +limit (or swap limit) being reached, the allowed memory is that configured +limit. Finally, if it is due to the entire system being out of memory, the +allowed memory represents all allocatable resources. + +The value of /proc/<pid>/oom_score_adj is added to the badness score before it +is used to determine which task to kill. Acceptable values range from -1000 +(OOM_SCORE_ADJ_MIN) to +1000 (OOM_SCORE_ADJ_MAX). This allows userspace to +polarize the preference for oom killing either by always preferring a certain +task or completely disabling it. The lowest possible value, -1000, is +equivalent to disabling oom killing entirely for that task since it will always +report a badness score of 0. + +Consequently, it is very simple for userspace to define the amount of memory to +consider for each task. Setting a /proc/<pid>/oom_score_adj value of +500, for +example, is roughly equivalent to allowing the remainder of tasks sharing the +same system, cpuset, mempolicy, or memory controller resources to use at least +50% more memory. 
A value of -500, on the other hand, would be roughly +equivalent to discounting 50% of the task's allowed memory from being considered +as scoring against the task. + +For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also +be used to tune the badness score. Its acceptable values range from -16 +(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 +(OOM_DISABLE) to disable oom killing entirely for that task. Its value is +scaled linearly with /proc/<pid>/oom_score_adj. + +Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the +other with its scaled value. + +NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see +Documentation/feature-removal-schedule.txt. + +Caveat: when a parent task is selected, the oom killer will sacrifice any first +generation children with separate address spaces instead, if possible. This +avoids servers and important system daemons from being killed and loses the +minimal amount of work. + 3.2 /proc/<pid>/oom_score - Display current oom-killer score ------------------------------------------------------------- diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index 203f7202cc9..66699afd66c 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt @@ -2,7 +2,7 @@ SQUASHFS 4.0 FILESYSTEM ======================= Squashfs is a compressed read-only filesystem for Linux. -It uses zlib compression to compress files, inodes and directories. +It uses zlib/lzo compression to compress files, inodes and directories. Inodes in the system are very small and all blocks are packed to minimise data overhead. Block sizes greater than 4K are supported up to a maximum of 1Mbytes (default block size 128K). 
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 85354b32d73..74eaac26f8b 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -39,7 +39,7 @@ files, each with their own function. local_cpus nearby CPU mask (cpumask, ro) remove remove device from kernel's list (ascii, wo) resource PCI resource host addresses (ascii, ro) - resource0..N PCI resource N, if present (binary, mmap) + resource0..N PCI resource N, if present (binary, mmap, rw[1]) resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) rom PCI ROM resource, if present (binary, ro) subsystem_device PCI subsystem device (ascii, ro) @@ -54,13 +54,16 @@ files, each with their own function. binary - file contains binary data cpumask - file contains a cpumask type +[1] rw for RESOURCE_IO (I/O port) regions only + The read only files are informational, writes to them will be ignored, with the exception of the 'rom' file. Writable files can be used to perform actions on the device (e.g. changing config space, detaching a device). mmapable files are available via an mmap of the file at offset 0 and can be used to do actual device programming from userspace. Note that some platforms don't support mmapping of certain resources, so be sure to check the return -value from any attempted mmap. +value from any attempted mmap. The most notable of these are I/O port +resources, which also provide read/write access. The 'enable' file provides a counter that indicates how many times the device has been enabled. If the 'enable' file currently returns '4', and a '1' is diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 931c806642c..5d1335faec2 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -4,7 +4,7 @@ sysfs - _The_ filesystem for exporting kernel objects. 
Patrick Mochel <mochel@osdl.org> Mike Murphy <mamurph@cs.clemson.edu> -Revised: 22 February 2009 +Revised: 15 July 2010 Original: 10 January 2003 @@ -124,7 +124,7 @@ show and store methods of the attribute owners. struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); - ssize_t (*store)(struct kobject *, struct attribute *, const char *); + ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; [ Subsystems should have already defined a struct kobj_type as a @@ -139,18 +139,22 @@ calls the associated methods. To illustrate: +#define to_dev(obj) container_of(obj, struct device, kobj) #define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) -#define to_dev(d) container_of(d, struct device, kobj) -static ssize_t -dev_attr_show(struct kobject * kobj, struct attribute * attr, char * buf) +static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct device_attribute * dev_attr = to_dev_attr(attr); - struct device * dev = to_dev(kobj); - ssize_t ret = 0; + struct device_attribute *dev_attr = to_dev_attr(attr); + struct device *dev = to_dev(kobj); + ssize_t ret = -EIO; if (dev_attr->show) - ret = dev_attr->show(dev, buf); + ret = dev_attr->show(dev, dev_attr, buf); + if (ret >= (ssize_t)PAGE_SIZE) { + print_symbol("dev_attr_show: %s returned bad count\n", + (unsigned long)dev_attr->show); + } return ret; } @@ -163,10 +167,9 @@ To read or write attributes, show() or store() methods must be specified when declaring the attribute. 
The method types should be as simple as those defined for device attributes: -ssize_t (*show)(struct device * dev, struct device_attribute * attr, - char * buf); -ssize_t (*store)(struct device * dev, struct device_attribute * attr, - const char * buf); +ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf); +ssize_t (*store)(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); IOW, they should take only an object, an attribute, and a buffer as parameters. @@ -209,8 +212,8 @@ Other notes: - show() should always use snprintf(). -- store() should return the number of bytes used from the buffer. This - can be done using strlen(). +- store() should return the number of bytes used from the buffer. If the + entire buffer has been used, just return the count argument. - show() or store() can always return errors. If a bad value comes through, be sure to return an error. @@ -223,15 +226,18 @@ Other notes: A very simple (and naive) implementation of a device attribute is: -static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t show_name(struct device *dev, struct device_attribute *attr, + char *buf) { return snprintf(buf, PAGE_SIZE, "%s\n", dev->name); } -static ssize_t store_name(struct device * dev, const char * buf) +static ssize_t store_name(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { - sscanf(buf, "%20s", dev->name); - return strnlen(buf, PAGE_SIZE); + snprintf(dev->name, sizeof(dev->name), "%.*s", + (int)min(count, sizeof(dev->name) - 1), buf); + return count; } static DEVICE_ATTR(name, S_IRUGO, show_name, store_name); @@ -327,7 +333,7 @@ Structure: struct bus_attribute { struct attribute attr; ssize_t (*show)(struct bus_type *, char * buf); - ssize_t (*store)(struct bus_type *, const char * buf); + ssize_t (*store)(struct bus_type *, const char * buf, size_t count); }; Declaring: diff --git 
a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 94677e7dcb1..ed7e5efc06d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -727,7 +727,6 @@ struct file_operations { ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); - int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -768,10 +767,7 @@ otherwise noted. activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls - ioctl: called by the ioctl(2) system call - - unlocked_ioctl: called by the ioctl(2) system call. Filesystems that do not - require the BKL should use this method instead of the ioctl() above. + unlocked_ioctl: called by the ioctl(2) system call. compat_ioctl: called by the ioctl(2) system call when 32 bit system calls are used on 64 bit kernels. |