diff options
author | Takashi Iwai <tiwai@suse.de> | 2009-03-09 15:21:57 +0100 |
---|---|---|
committer | Takashi Iwai <tiwai@suse.de> | 2009-03-09 15:21:57 +0100 |
commit | efdcd41b4e41560375bb34ad8b095bcaa1c3a93f (patch) | |
tree | 57200ef68bae530776c0c2621c51e9464e1e72f9 /Documentation/filesystems | |
parent | 33dbe6114047496a5b6fee0941c52dad41250043 (diff) | |
parent | 79c7cdd5441f5d3900c1632adcc8cd2bee35c8da (diff) |
Merge branch 'topic/vmaster-update' into topic/docbook-fix
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/nfs-rdma.txt | 4 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 316 | ||||
-rw-r--r-- | Documentation/filesystems/sysfs-pci.txt | 13 | ||||
-rw-r--r-- | Documentation/filesystems/sysfs.txt | 50 | ||||
-rw-r--r-- | Documentation/filesystems/ubifs.txt | 7 |
5 files changed, 72 insertions, 318 deletions
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt index 44bd766f2e5..85eaeaddd27 100644 --- a/Documentation/filesystems/nfs-rdma.txt +++ b/Documentation/filesystems/nfs-rdma.txt @@ -251,7 +251,7 @@ NFS/RDMA Setup Instruct the server to listen on the RDMA transport: - $ echo rdma 2050 > /proc/fs/nfsd/portlist + $ echo rdma 20049 > /proc/fs/nfsd/portlist - On the client system @@ -263,7 +263,7 @@ NFS/RDMA Setup Regardless of how the client was built (module or built-in), use this command to mount the NFS/RDMA server: - $ mount -o rdma,port=2050 <IPoIB-server-name-or-address>:/<export> /mnt + $ mount -o rdma,port=20049 <IPoIB-server-name-or-address>:/<export> /mnt To verify that the mount is using RDMA, run "cat /proc/mounts" and check the "proto" field for the given mount. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index d105eb45282..a87be42f821 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1371,292 +1371,8 @@ auto_msgmni default value is 1. 2.4 /proc/sys/vm - The virtual memory subsystem ----------------------------------------------- -The files in this directory can be used to tune the operation of the virtual -memory (VM) subsystem of the Linux kernel. - -vfs_cache_pressure ------------------- - -Controls the tendency of the kernel to reclaim the memory which is used for -caching of directory and inode objects. - -At the default value of vfs_cache_pressure=100 the kernel will attempt to -reclaim dentries and inodes at a "fair" rate with respect to pagecache and -swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer -to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 -causes the kernel to prefer to reclaim dentries and inodes. - -dirty_background_bytes ----------------------- - -Contains the amount of dirty memory at which the pdflush background writeback -daemon will start writeback. 
- -If dirty_background_bytes is written, dirty_background_ratio becomes a function -of its value (dirty_background_bytes / the amount of dirtyable system memory). - -dirty_background_ratio ----------------------- - -Contains, as a percentage of the dirtyable system memory (free pages + mapped -pages + file cache, not including locked pages and HugePages), the number of -pages at which the pdflush background writeback daemon will start writing out -dirty data. - -If dirty_background_ratio is written, dirty_background_bytes becomes a function -of its value (dirty_background_ratio * the amount of dirtyable system memory). - -dirty_bytes ------------ - -Contains the amount of dirty memory at which a process generating disk writes -will itself start writeback. - -If dirty_bytes is written, dirty_ratio becomes a function of its value -(dirty_bytes / the amount of dirtyable system memory). - -dirty_ratio ------------ - -Contains, as a percentage of the dirtyable system memory (free pages + mapped -pages + file cache, not including locked pages and HugePages), the number of -pages at which a process which is generating disk writes will itself start -writing out dirty data. - -If dirty_ratio is written, dirty_bytes becomes a function of its value -(dirty_ratio * the amount of dirtyable system memory). - -dirty_writeback_centisecs -------------------------- - -The pdflush writeback daemons will periodically wake up and write `old' data -out to disk. This tunable expresses the interval between those wakeups, in -100'ths of a second. - -Setting this to zero disables periodic writeback altogether. - -dirty_expire_centisecs ----------------------- - -This tunable is used to define when dirty data is old enough to be eligible -for writeout by the pdflush daemons. It is expressed in 100'ths of a second. -Data which has been dirty in-memory for longer than this interval will be -written out next time a pdflush daemon wakes up. 
- -highmem_is_dirtyable --------------------- - -Only present if CONFIG_HIGHMEM is set. - -This defaults to 0 (false), meaning that the ratios set above are calculated -as a percentage of lowmem only. This protects against excessive scanning -in page reclaim, swapping and general VM distress. - -Setting this to 1 can be useful on 32 bit machines where you want to make -random changes within an MMAPed file that is larger than your available -lowmem without causing large quantities of random IO. Is is safe if the -behavior of all programs running on the machine is known and memory will -not be otherwise stressed. - -legacy_va_layout ----------------- - -If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel -will use the legacy (2.4) layout for all processes. - -lowmem_reserve_ratio ---------------------- - -For some specialised workloads on highmem machines it is dangerous for -the kernel to allow process memory to be allocated from the "lowmem" -zone. This is because that memory could then be pinned via the mlock() -system call, or by unavailability of swapspace. - -And on large highmem machines this lack of reclaimable lowmem memory -can be fatal. - -So the Linux page allocator has a mechanism which prevents allocations -which _could_ use highmem from using too much lowmem. This means that -a certain amount of lowmem is defended from the possibility of being -captured into pinned user memory. - -(The same argument applies to the old 16 megabyte ISA DMA region. This -mechanism will also defend that region from allocations which could use -highmem or lowmem). - -The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is -in defending these lower zones. - -If you have a machine which uses highmem or ISA DMA and your -applications are using mlock(), or if you are running with no swap then -you probably should change the lowmem_reserve_ratio setting. - -The lowmem_reserve_ratio is an array. You can see them by reading this file. 
-- -% cat /proc/sys/vm/lowmem_reserve_ratio -256 256 32 -- -Note: # of this elements is one fewer than number of zones. Because the highest - zone's value is not necessary for following calculation. - -But, these values are not used directly. The kernel calculates # of protection -pages for each zones from them. These are shown as array of protection pages -in /proc/zoneinfo like followings. (This is an example of x86-64 box). -Each zone has an array of protection pages like this. - -- -Node 0, zone DMA - pages free 1355 - min 3 - low 3 - high 4 - : - : - numa_other 0 - protection: (0, 2004, 2004, 2004) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - pagesets - cpu: 0 pcp: 0 - : -- -These protections are added to score to judge whether this zone should be used -for page allocation or should be reclaimed. - -In this example, if normal pages (index=2) are required to this DMA zone and -pages_high is used for watermark, the kernel judges this zone should not be -used because pages_free(1355) is smaller than watermark + protection[2] -(4 + 2004 = 2008). If this protection value is 0, this zone would be used for -normal page requirement. If requirement is DMA zone(index=0), protection[0] -(=0) is used. - -zone[i]'s protection[j] is calculated by following expression. - -(i < j): - zone[i]->protection[j] - = (total sums of present_pages from zone[i+1] to zone[j] on the node) - / lowmem_reserve_ratio[i]; -(i = j): - (should not be protected. = 0; -(i > j): - (not necessary, but looks 0) - -The default values of lowmem_reserve_ratio[i] are - 256 (if zone[i] means DMA or DMA32 zone) - 32 (others). -As above expression, they are reciprocal number of ratio. -256 means 1/256. # of protection pages becomes about "0.39%" of total present -pages of higher zones on the node. - -If you would like to protect more pages, smaller values are effective. -The minimum value is 1 (1/1 -> 100%). 
- -page-cluster ------------- - -page-cluster controls the number of pages which are written to swap in -a single attempt. The swap I/O size. - -It is a logarithmic value - setting it to zero means "1 page", setting -it to 1 means "2 pages", setting it to 2 means "4 pages", etc. - -The default value is three (eight pages at a time). There may be some -small benefits in tuning this to a different value if your workload is -swap-intensive. - -overcommit_memory ------------------ - -Controls overcommit of system memory, possibly allowing processes -to allocate (but not use) more memory than is actually available. - - -0 - Heuristic overcommit handling. Obvious overcommits of - address space are refused. Used for a typical system. It - ensures a seriously wild allocation fails while allowing - overcommit to reduce swap usage. root is allowed to - allocate slightly more memory in this mode. This is the - default. - -1 - Always overcommit. Appropriate for some scientific - applications. - -2 - Don't overcommit. The total address space commit - for the system is not permitted to exceed swap plus a - configurable percentage (default is 50) of physical RAM. - Depending on the percentage you use, in most situations - this means a process will not be killed while attempting - to use already-allocated memory but will receive errors - on memory allocation as appropriate. - -overcommit_ratio ----------------- - -Percentage of physical memory size to include in overcommit calculations -(see above.) - -Memory allocation limit = swapspace + physmem * (overcommit_ratio / 100) - - swapspace = total size of all swap areas - physmem = size of physical memory in system - -nr_hugepages and hugetlb_shm_group ----------------------------------- - -nr_hugepages configures number of hugetlb page reserved for the system. - -hugetlb_shm_group contains group id that is allowed to create SysV shared -memory segment using hugetlb page. 
- -hugepages_treat_as_movable --------------------------- - -This parameter is only useful when kernelcore= is specified at boot time to -create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages -are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero -value written to hugepages_treat_as_movable allows huge pages to be allocated -from ZONE_MOVABLE. - -Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge -pages pool can easily grow or shrink within. Assuming that applications are -not running that mlock() a lot of memory, it is likely the huge pages pool -can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value -into nr_hugepages and triggering page reclaim. - -laptop_mode ------------ - -laptop_mode is a knob that controls "laptop mode". All the things that are -controlled by this knob are discussed in Documentation/laptops/laptop-mode.txt. - -block_dump ----------- - -block_dump enables block I/O debugging when set to a nonzero value. More -information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. - -swap_token_timeout ------------------- - -This file contains valid hold time of swap out protection token. The Linux -VM has token based thrashing control mechanism and uses the token to prevent -unnecessary page faults in thrashing situation. The unit of the value is -second. The value would be useful to tune thrashing behavior. - -drop_caches ------------ - -Writing to this will cause the kernel to drop clean caches, dentries and -inodes from memory, causing that memory to become free. - -To free pagecache: - echo 1 > /proc/sys/vm/drop_caches -To free dentries and inodes: - echo 2 > /proc/sys/vm/drop_caches -To free pagecache, dentries and inodes: - echo 3 > /proc/sys/vm/drop_caches - -As this is a non-destructive operation and dirty objects are not freeable, the -user should run `sync' first. 
+Please see: Documentation/sysctls/vm.txt for a description of these +entries. 2.5 /proc/sys/dev - Device specific parameters @@ -2311,6 +2027,34 @@ increase the likelihood of this process being killed by the oom-killer. Valid values are in the range -16 to +15, plus the special value -17, which disables oom-killing altogether for this process. +The process to be killed in an out-of-memory situation is selected among all others +based on its badness score. This value equals the original memory size of the process +and is then updated according to its CPU time (utime + stime) and the +run time (uptime - start time). The longer it runs the smaller is the score. +Badness score is divided by the square root of the CPU time and then by +the double square root of the run time. + +Swapped out tasks are killed first. Half of each child's memory size is added to +the parent's score if they do not share the same memory. Thus forking servers +are the prime candidates to be killed. Having only one 'hungry' child will make +parent less preferable than the child. + +/proc/<pid>/oom_score shows process' current badness score. + +The following heuristics are then applied: + * if the task was reniced, its score doubles + * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE + or CAP_SYS_RAWIO) have their score divided by 4 + * if oom condition happened in one cpuset and checked task does not belong + to it, its score is divided by 8 + * the resulting score is multiplied by two to the power of oom_adj, i.e. + points <<= oom_adj when it is positive and + points >>= -(oom_adj) otherwise + +The task with the highest badness score is then selected and its children +are killed, process itself will be killed in an OOM situation when it does +not have children or some of them disabled oom like described above. 
+ 2.13 /proc/<pid>/oom_score - Display current oom-killer score ------------------------------------------------------------- diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 68ef48839c0..9f8740ca3f3 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -9,6 +9,7 @@ that support it. For example, a given bus might look like this: | |-- class | |-- config | |-- device + | |-- enable | |-- irq | |-- local_cpus | |-- resource @@ -32,6 +33,7 @@ files, each with their own function. class PCI class (ascii, ro) config PCI config space (binary, rw) device PCI device (ascii, ro) + enable Whether the device is enabled (ascii, rw) irq IRQ number (ascii, ro) local_cpus nearby CPU mask (cpumask, ro) resource PCI resource host addresses (ascii, ro) @@ -57,10 +59,19 @@ used to do actual device programming from userspace. Note that some platforms don't support mmapping of certain resources, so be sure to check the return value from any attempted mmap. +The 'enable' file provides a counter that indicates how many times the device +has been enabled. If the 'enable' file currently returns '4', and a '1' is +echoed into it, it will then return '5'. Echoing a '0' into it will decrease +the count. Even when it returns to 0, though, some of the initialisation +may not be reversed. + The 'rom' file is special in that it provides read-only access to the device's ROM file, if available. It's disabled by default, however, so applications should write the string "1" to the file to enable it before attempting a read -call, and disable it following the access by writing "0" to the file. +call, and disable it following the access by writing "0" to the file. Note +that the device must be enabled for a rom read to return data successfully. +In the event a driver is not bound to the device, it can be enabled using the +'enable' file, documented above. 
Accessing legacy resources through sysfs ---------------------------------------- diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 9e9c348275a..7e81e37c0b1 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -2,8 +2,10 @@ sysfs - _The_ filesystem for exporting kernel objects. Patrick Mochel <mochel@osdl.org> +Mike Murphy <mamurph@cs.clemson.edu> -10 January 2003 +Revised: 22 February 2009 +Original: 10 January 2003 What it is: @@ -64,12 +66,13 @@ An attribute definition is simply: struct attribute { char * name; + struct module *owner; mode_t mode; }; -int sysfs_create_file(struct kobject * kobj, struct attribute * attr); -void sysfs_remove_file(struct kobject * kobj, struct attribute * attr); +int sysfs_create_file(struct kobject * kobj, const struct attribute * attr); +void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr); A bare attribute contains no means to read or write the value of the @@ -80,9 +83,11 @@ a specific object type. 
For example, the driver model defines struct device_attribute like: struct device_attribute { - struct attribute attr; - ssize_t (*show)(struct device * dev, char * buf); - ssize_t (*store)(struct device * dev, const char * buf); + struct attribute attr; + ssize_t (*show)(struct device *dev, struct device_attribute *attr, + char *buf); + ssize_t (*store)(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); }; int device_create_file(struct device *, struct device_attribute *); @@ -90,12 +95,8 @@ void device_remove_file(struct device *, struct device_attribute *); It also defines this helper for defining device attributes: -#define DEVICE_ATTR(_name, _mode, _show, _store) \ -struct device_attribute dev_attr_##_name = { \ - .attr = {.name = __stringify(_name) , .mode = _mode }, \ - .show = _show, \ - .store = _store, \ -}; +#define DEVICE_ATTR(_name, _mode, _show, _store) \ +struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) For example, declaring @@ -107,9 +108,9 @@ static struct device_attribute dev_attr_foo = { .attr = { .name = "foo", .mode = S_IWUSR | S_IRUGO, + .show = show_foo, + .store = store_foo, }, - .show = show_foo, - .store = store_foo, }; @@ -161,10 +162,12 @@ To read or write attributes, show() or store() methods must be specified when declaring the attribute. The method types should be as simple as those defined for device attributes: - ssize_t (*show)(struct device * dev, char * buf); - ssize_t (*store)(struct device * dev, const char * buf); +ssize_t (*show)(struct device * dev, struct device_attribute * attr, + char * buf); +ssize_t (*store)(struct device * dev, struct device_attribute * attr, + const char * buf); -IOW, they should take only an object and a buffer as parameters. +IOW, they should take only an object, an attribute, and a buffer as parameters. 
sysfs allocates a buffer of size (PAGE_SIZE) and passes it to the @@ -299,14 +302,16 @@ The following interface layers currently exist in sysfs: Structure: struct device_attribute { - struct attribute attr; - ssize_t (*show)(struct device * dev, char * buf); - ssize_t (*store)(struct device * dev, const char * buf); + struct attribute attr; + ssize_t (*show)(struct device *dev, struct device_attribute *attr, + char *buf); + ssize_t (*store)(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); }; Declaring: -DEVICE_ATTR(_name, _str, _mode, _show, _store); +DEVICE_ATTR(_name, _mode, _show, _store); Creation/Removal: @@ -342,7 +347,8 @@ Structure: struct driver_attribute { struct attribute attr; ssize_t (*show)(struct device_driver *, char * buf); - ssize_t (*store)(struct device_driver *, const char * buf); + ssize_t (*store)(struct device_driver *, const char * buf, + size_t count); }; Declaring: diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt index 84da2a4ba25..12fedb7834c 100644 --- a/Documentation/filesystems/ubifs.txt +++ b/Documentation/filesystems/ubifs.txt @@ -79,13 +79,6 @@ Mount options (*) == default. -norm_unmount (*) commit on unmount; the journal is committed - when the file-system is unmounted so that the - next mount does not have to replay the journal - and it becomes very fast; -fast_unmount do not commit on unmount; this option makes - unmount faster, but the next mount slower - because of the need to replay the journal. bulk_read read more in one go to take advantage of flash media that read faster sequentially no_bulk_read (*) do not bulk-read |