diff options
Diffstat (limited to 'Documentation')
36 files changed, 1880 insertions, 460 deletions
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 867608ab3ca..10b5cd6c54a 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -41,7 +41,7 @@ psdocs: $(PS) PDF := $(patsubst %.xml, %.pdf, $(BOOKS)) pdfdocs: $(PDF) -HTML := $(patsubst %.xml, %.html, $(BOOKS)) +HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS))) htmldocs: $(HTML) MAN := $(patsubst %.xml, %.9, $(BOOKS)) @@ -152,6 +152,7 @@ quiet_cmd_db2man = MAN $@ @(which xmlto > /dev/null 2>&1) || \ (echo "*** You need to install xmlto ***"; \ exit 1) + $(Q)mkdir -p $(obj)/man $(call cmd,db2man) @touch $@ @@ -212,11 +213,7 @@ clean-files := $(DOCBOOKS) \ $(patsubst %.xml, %.9, $(DOCBOOKS)) \ $(C-procfs-example) -clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) - -#man put files in man subdir - traverse down -subdir- := man/ - +clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man # Declare the contents of the .PHONY variable as phony. We keep that # information in a variable se we can use it in if_changed and friends. diff --git a/Documentation/DocBook/man/Makefile b/Documentation/DocBook/man/Makefile deleted file mode 100644 index 4fb7ea0f7ac..00000000000 --- a/Documentation/DocBook/man/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -# Rules are put in Documentation/DocBook - -clean-files := *.9.gz *.sgml manpage.links manpage.refs diff --git a/Documentation/blackfin/00-INDEX b/Documentation/blackfin/00-INDEX new file mode 100644 index 00000000000..7cb3b356b24 --- /dev/null +++ b/Documentation/blackfin/00-INDEX @@ -0,0 +1,11 @@ +00-INDEX + - This file + +cache-lock.txt + - HOWTO for blackfin cache locking. + +cachefeatures.txt + - Supported cache features. + +Filesystems + - Requirements for mounting the root file system. 
diff --git a/Documentation/blackfin/Filesystems b/Documentation/blackfin/Filesystems new file mode 100644 index 00000000000..51260a1b803 --- /dev/null +++ b/Documentation/blackfin/Filesystems @@ -0,0 +1,169 @@ +/* + * File: Documentation/blackfin/Filesystems + * Based on: + * Author: + * + * Created: + * Description: This file contains the simple DMA Implementation for Blackfin + * + * Rev: $Id: Filesystems 2384 2006-11-01 04:12:43Z magicyang $ + * + * Modified: + * Copyright 2004-2006 Analog Devices Inc. + * + * Bugs: Enter bugs at http://blackfin.uclinux.org/ + * + */ + + How to mount the root file system in uClinux/Blackfin + ----------------------------------------------------- + +1 Mounting EXT3 File system. + ------------------------ + + Creating an EXT3 File system for uClinux/Blackfin: + + +Please follow the steps to form the EXT3 File system and mount the same as root +file system. + +a Make an ext3 file system as large as you want the final root file + system. + + mkfs.ext3 /dev/ram0 <your-rootfs-size-in-1k-blocks> + +b Mount this Empty file system on a free directory as: + + mount -t ext3 /dev/ram0 ./test + where ./test is the empty directory. + +c Copy your root fs directory that you have so carefully made over. + + cp -af /tmp/my_final_rootfs_files/* ./test + + (For ex: cp -af uClinux-dist/romfs/* ./test) + +d If you have done everything right till now you should be able to see + the required "root" dir's (that's etc, root, bin, lib, sbin...) + +e Now unmount the file system + + umount ./test + +f Create the root file system image. + + dd if=/dev/ram0 bs=1k count=<your-rootfs-size-in-1k-blocks> \ + > ext3fs.img + + +Now you have to tell the kernel that will be mounting this file system as +rootfs. +So do a make menuconfig under kernel and select the Ext3 journaling file system +support under File system --> submenu. + + +2. Mounting EXT2 File system. 
+ ------------------------- + +By default the ext2 file system image will be created if you invoke make from +the top uClinux-dist directory. + + +3. Mounting CRAMFS File System + ---------------------------- + +To create a CRAMFS file system image execute the command + + mkfs.cramfs ./test cramfs.img + + where ./test is the target directory. + + +4. Mounting ROMFS File System + -------------------------- + +To create a ROMFS file system image execute the command + + genromfs -v -V "ROMdisk" -f romfs.img -d ./test + + where ./test is the target directory + + +5. Mounting the JFFS2 Filesystem + ----------------------------- + +To create a compressed JFFS filesystem (JFFS2), please execute the command + + mkfs.jffs2 -d ./test -o jffs2.img + + where ./test is the target directory. + +However, please make sure the following is in your kernel config. + +/* + * RAM/ROM/Flash chip drivers + */ +#define CONFIG_MTD_CFI 1 +#define CONFIG_MTD_ROM 1 +/* + * Mapping drivers for chip access + */ +#define CONFIG_MTD_COMPLEX_MAPPINGS 1 +#define CONFIG_MTD_BF533 1 +#undef CONFIG_MTD_UCLINUX + +Through the u-boot boot loader, use the jffs2.img in the corresponding +partition made in linux-2.6.x/drivers/mtd/maps/bf533_flash.c. + +NOTE - Currently the Flash driver is available only for EZKIT. Watch out for a + STAMP driver soon. + + +6. Mounting the NFS File system + ----------------------------- + + For mounting the NFS please do the following in the kernel config. + + In Networking Support --> Networking options --> TCP/IP networking --> + IP: kernel level autoconfiguration + + Enable BOOTP Support. 
+ + In Kernel hacking --> Compiled-in kernel boot parameter add the following + + root=/dev/nfs rw ip=bootp + + In File system --> Network File system, Enable + + NFS file system support --> NFSv3 client support + Root File system on NFS + + in uClibc menuconfig, do the following + In Networking Support + enable Remote Procedure Call (RPC) support + Full RPC Support + + On the Host side, ensure that /etc/dhcpd.conf looks something like this + + ddns-update-style ad-hoc; + allow bootp; + subnet 10.100.4.0 netmask 255.255.255.0 { + default-lease-time 122209600; + max-lease-time 31557600; + group { + host bf533 { + hardware ethernet 00:CF:52:49:C3:01; + fixed-address 10.100.4.50; + option root-path "/home/nfsmount"; + } + } + + ensure that /etc/exports looks something like this + /home/nfsmount *(rw,no_root_squash,no_all_squash) + + run the following commands as root (may differ depending on your + distribution) : + - service nfs start + - service portmap start + - service dhcpd start + - /usr/sbin/exportfs diff --git a/Documentation/blackfin/cache-lock.txt b/Documentation/blackfin/cache-lock.txt new file mode 100644 index 00000000000..88ba1e6c31c --- /dev/null +++ b/Documentation/blackfin/cache-lock.txt @@ -0,0 +1,48 @@ +/* + * File: Documentation/blackfin/cache-lock.txt + * Based on: + * Author: + * + * Created: + * Description: This file contains the simple DMA Implementation for Blackfin + * + * Rev: $Id: cache-lock.txt 2384 2006-11-01 04:12:43Z magicyang $ + * + * Modified: + * Copyright 2004-2006 Analog Devices Inc. + * + * Bugs: Enter bugs at http://blackfin.uclinux.org/ + * + */ + +How to lock your code in cache in uClinux/blackfin +-------------------------------------------------- + +There are only a few steps required to lock your code into the cache. +Currently you can lock the code by Way. + +Below are the interface provided for locking the cache. + + +1. 
cache_grab_lock(int Ways); + +This function grabs the lock for locking your code into the cache specified +by Ways. + + +2. cache_lock(int Ways); + +This function should be called after your critical code has been executed. +Once the critical code exits, the code is now loaded into the cache. This +function locks the code into the cache. + + +So, the example sequence will be: + + cache_grab_lock(WAY0_L); /* Grab the lock */ + + critical_code(); /* Execute the code of interest */ + + cache_lock(WAY0_L); /* Lock the cache */ + +Where WAY0_L signifies WAY0 locking. diff --git a/Documentation/blackfin/cachefeatures.txt b/Documentation/blackfin/cachefeatures.txt new file mode 100644 index 00000000000..0fbec23becb --- /dev/null +++ b/Documentation/blackfin/cachefeatures.txt @@ -0,0 +1,65 @@ +/* + * File: Documentation/blackfin/cachefeatures.txt + * Based on: + * Author: + * + * Created: + * Description: This file contains the simple DMA Implementation for Blackfin + * + * Rev: $Id: cachefeatures.txt 2384 2006-11-01 04:12:43Z magicyang $ + * + * Modified: + * Copyright 2004-2006 Analog Devices Inc. + * + * Bugs: Enter bugs at http://blackfin.uclinux.org/ + * + */ + + - Instruction and Data cache initialization. + icache_init(); + dcache_init(); + + - Instruction and Data cache Invalidation Routines, when flushing the + same is not required. + _icache_invalidate(); + _dcache_invalidate(); + + Also, for invalidating the entire instruction and data cache, the below + routines are provided (another method for invalidation, refer to pages 267 and 287 of + ADSP-BF533 Hardware Reference manual) + + invalidate_entire_dcache(); + invalidate_entire_icache(); + + -External Flushing of Instruction and data cache routines. + + flush_instruction_cache(); + flush_data_cache(); + + - Internal Flushing of Instruction and Data Cache. + + icplb_flush(); + dcplb_flush(); + + - Locking the cache.
+ + cache_grab_lock(); + cache_lock(); + + Please refer to linux-2.6.x/Documentation/blackfin/cache-lock.txt for how to + lock the cache. + + Locking the cache is an optional feature. + + - Miscellaneous cache functions. + + flush_cache_all(); + flush_cache_mm(); + invalidate_dcache_range(); + flush_dcache_range(); + flush_dcache_page(); + flush_cache_range(); + flush_cache_page(); + invalidate_dcache_range(); + flush_page_to_ram(); + diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 63c2d0c55aa..64e9f6c4826 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -55,8 +55,8 @@ aic7*seq.h* aicasm aicdb.h* asm -asm-offsets.* -asm_offsets.* +asm-offsets.h +asm_offsets.h autoconf.h* bbootsect bin2c diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 5163b85308f..6c8d8f27db3 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -182,7 +182,7 @@ For example, you can do something like the following. ... - devres_close_group(dev, my_midlayer_something); + devres_close_group(dev, my_midlayer_create_something); return 0; } diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 5c88ba1ea26..5f96cb33743 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -117,13 +117,6 @@ Who: Adrian Bunk <bunk@stusta.de> --------------------------- -What: pci_module_init(driver) -When: January 2007 -Why: Is replaced by pci_register_driver(pci_driver). -Who: Richard Knutsson <ricknu-0@student.ltu.se> and Greg Kroah-Hartman <gregkh@suse.de> - ---------------------------- - What: Usage of invalid timevals in setitimer When: March 2007 Why: POSIX requires to validate timevals in the setitimer call.
This @@ -190,18 +183,10 @@ Who: Jean Delvare <khali@linux-fr.org> --------------------------- -What: i2c_adapter.dev - i2c_adapter.list +What: i2c_adapter.list When: July 2007 -Why: Superfluous, given i2c_adapter.class_dev: - * The "dev" was a stand-in for the physical device node that legacy - drivers would not have; but now it's almost always present. Any - remaining legacy drivers must upgrade (they now trigger warnings). - * The "list" duplicates class device children. - The delay in removing this is so upgraded lm_sensors and libsensors - can get deployed. (Removal causes minor changes in the sysfs layout, - notably the location of the adapter type name and parenting the i2c - client hardware directly from their controller.) +Why: Superfluous, this list duplicates the one maintained by the driver + core. Who: Jean Delvare <khali@linux-fr.org>, David Brownell <dbrownell@users.sourceforge.net> @@ -314,3 +299,27 @@ Why: Code was merged, then submitter immediately disappeared leaving Who: David S. Miller <davem@davemloft.net> --------------------------- + +What: read_dev_chars(), read_conf_data{,_lpm}() (s390 common I/O layer) +When: December 2007 +Why: These functions are a leftover from 2.4 times. They have several + problems: + - Duplication of checks that are done in the device driver's + interrupt handler + - common I/O layer can't do device specific error recovery + - device driver can't be notified for conditions happening during + execution of the function + Device drivers should issue the read device characteristics and read + configuration data ccws and do the appropriate error handling + themselves. +Who: Cornelia Huck <cornelia.huck@de.ibm.com> + +--------------------------- + +What: i2c-ixp2000, i2c-ixp4xx and scx200_i2c drivers +When: September 2007 +Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific + I2C-over-GPIO drivers. 
+Who: Jean Delvare <khali@linux-fr.org> + +--------------------------- diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 7aaf09b86a5..3f4b226572e 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -122,21 +122,22 @@ subdirectory has the entries listed in Table 1-1. Table 1-1: Process specific entries in /proc .............................................................................. - File Content - cmdline Command line arguments - cpu Current and last cpu in which it was executed (2.4)(smp) - cwd Link to the current working directory - environ Values of environment variables - exe Link to the executable of this process - fd Directory, which contains all file descriptors - maps Memory maps to executables and library files (2.4) - mem Memory held by this process - root Link to the root directory of this process - stat Process status - statm Process memory status information - status Process status in human readable form - wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan - smaps Extension based on maps, presenting the rss size for each mapped file + File Content + clear_refs Clears page referenced bits shown in smaps output + cmdline Command line arguments + cpu Current and last cpu in which it was executed (2.4)(smp) + cwd Link to the current working directory + environ Values of environment variables + exe Link to the executable of this process + fd Directory, which contains all file descriptors + maps Memory maps to executables and library files (2.4) + mem Memory held by this process + root Link to the root directory of this process + stat Process status + statm Process memory status information + status Process status in human readable form + wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan + smaps Extension based on maps, the rss size for each mapped file .............................................................................. 
For example, to get the status information of a process, all you have to do is diff --git a/Documentation/i2c/busses/i2c-nforce2 b/Documentation/i2c/busses/i2c-nforce2 index 7f61fbc03f7..fae3495bcba 100644 --- a/Documentation/i2c/busses/i2c-nforce2 +++ b/Documentation/i2c/busses/i2c-nforce2 @@ -9,6 +9,8 @@ Supported adapters: * nForce4 MCP-04 10de:0034 * nForce4 MCP51 10de:0264 * nForce4 MCP55 10de:0368 + * nForce4 MCP61 10de:03EB + * nForce4 MCP65 10de:0446 Datasheet: not publicly available, but seems to be similar to the AMD-8111 SMBus 2.0 adapter. diff --git a/Documentation/i2c/porting-clients b/Documentation/i2c/porting-clients index ca272b263a9..7bf82c08f6c 100644 --- a/Documentation/i2c/porting-clients +++ b/Documentation/i2c/porting-clients @@ -1,4 +1,4 @@ -Revision 6, 2005-11-20 +Revision 7, 2007-04-19 Jean Delvare <khali@linux-fr.org> Greg KH <greg@kroah.com> @@ -20,6 +20,10 @@ yours for best results. Technical changes: +* [Driver type] Any driver that was relying on i2c-isa has to be + converted to a proper isa, platform or pci driver. This is not + covered by this guide. + * [Includes] Get rid of "version.h" and <linux/i2c-proc.h>. Includes typically look like that: #include <linux/module.h> @@ -27,12 +31,10 @@ Technical changes: #include <linux/slab.h> #include <linux/jiffies.h> #include <linux/i2c.h> - #include <linux/i2c-isa.h> /* for ISA drivers */ #include <linux/hwmon.h> /* for hardware monitoring drivers */ #include <linux/hwmon-sysfs.h> #include <linux/hwmon-vid.h> /* if you need VRM support */ #include <linux/err.h> /* for class registration */ - #include <asm/io.h> /* if you have I/O operations */ Please respect this inclusion order. Some extra headers may be required for a given driver (e.g. "lm75.h"). @@ -69,20 +71,16 @@ Technical changes: sensors mailing list <lm-sensors@lm-sensors.org> by providing a patch to the Documentation/hwmon/sysfs-interface file. 
-* [Attach] For I2C drivers, the attach function should make sure - that the adapter's class has I2C_CLASS_HWMON (or whatever class is - suitable for your driver), using the following construct: +* [Attach] The attach function should make sure that the adapter's + class has I2C_CLASS_HWMON (or whatever class is suitable for your + driver), using the following construct: if (!(adapter->class & I2C_CLASS_HWMON)) return 0; - ISA-only drivers of course don't need this. Call i2c_probe() instead of i2c_detect(). * [Detect] As mentioned earlier, the flags parameter is gone. The type_name and client_name strings are replaced by a single name string, which will be filled with a lowercase, short string. - In i2c-only drivers, drop the i2c_is_isa_adapter check, it's - useless. Same for isa-only drivers, as the test would always be - true. Only hybrid drivers (which are quite rare) still need it. The labels used for error paths are reduced to the number needed. It is advised that the labels are given descriptive names such as exit and exit_free. Don't forget to properly set err before diff --git a/Documentation/i2c/summary b/Documentation/i2c/summary index 41dde877679..aea60bf7e8f 100644 --- a/Documentation/i2c/summary +++ b/Documentation/i2c/summary @@ -4,17 +4,23 @@ I2C and SMBus ============= I2C (pronounce: I squared C) is a protocol developed by Philips. It is a -slow two-wire protocol (10-400 kHz), but it suffices for many types of -devices. +slow two-wire protocol (variable speed, up to 400 kHz), with a high speed +extension (3.4 MHz). It provides an inexpensive bus for connecting many +types of devices with infrequent or low bandwidth communications needs. +I2C is widely used with embedded systems. Some systems use variants that +don't meet branding requirements, and so are not advertised as being I2C. -SMBus (System Management Bus) is a subset of the I2C protocol. Many -modern mainboards have a System Management Bus. 
There are a lot of -devices which can be connected to a SMBus; the most notable are modern -memory chips with EEPROM memories and chips for hardware monitoring. +SMBus (System Management Bus) is based on the I2C protocol, and is mostly +a subset of I2C protocols and signaling. Many I2C devices will work on an +SMBus, but some SMBus protocols add semantics beyond what is required to +achieve I2C branding. Modern PC mainboards rely on SMBus. The most common +devices connected through SMBus are RAM modules configured using I2C EEPROMs, +and hardware monitoring chips. -Because the SMBus is just a special case of the generalized I2C bus, we -can simulate the SMBus protocol on plain I2C busses. The reverse is -regretfully impossible. +Because the SMBus is mostly a subset of the generalized I2C bus, we can +use its protocols on many I2C systems. However, there are systems that don't +meet both SMBus and I2C electrical constraints; and others which can't +implement all the common SMBus protocol semantics or messages. Terminology @@ -29,6 +35,7 @@ When we talk about I2C, we use the following terms: An Algorithm driver contains general code that can be used for a whole class of I2C adapters. Each specific adapter driver depends on one algorithm driver. + A Driver driver (yes, this sounds ridiculous, sorry) contains the general code to access some type of device. Each detected device gets its own data in the Client structure. Usually, Driver and Client are more closely @@ -40,6 +47,10 @@ a separate Adapter and Algorithm driver), and drivers for your I2C devices in this package. See the lm_sensors project http://www.lm-sensors.nu for device drivers. +At this time, Linux only operates I2C (or SMBus) in master mode; you can't +use these APIs to make a Linux system behave as a slave/device, either to +speak a custom protocol or to emulate some other device. 
+ Included Bus Drivers ==================== diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index fbcff96f4ca..3d8d36b0ad1 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -1,5 +1,5 @@ This is a small guide for those who want to write kernel drivers for I2C -or SMBus devices. +or SMBus devices, using Linux as the protocol host/master (not slave). To set up a driver, you need to do several things. Some are optional, and some things can be done slightly or completely different. Use this as a @@ -29,8 +29,16 @@ static struct i2c_driver foo_driver = { .driver = { .name = "foo", }, + + /* iff driver uses driver model ("new style") binding model: */ + .probe = foo_probe, + .remove = foo_remove, + + /* else, driver uses "legacy" binding model: */ .attach_adapter = foo_attach_adapter, .detach_client = foo_detach_client, + + /* these may be used regardless of the driver binding model */ .shutdown = foo_shutdown, /* optional */ .suspend = foo_suspend, /* optional */ .resume = foo_resume, /* optional */ @@ -40,7 +48,8 @@ static struct i2c_driver foo_driver = { The name field is the driver name, and must not contain spaces. It should match the module name (if the driver can be compiled as a module), although you can use MODULE_ALIAS (passing "foo" in this example) to add -another name for the module. +another name for the module. If the driver name doesn't match the module +name, the module won't be automatically loaded (hotplug/coldplug). All other fields are for call-back functions which will be explained below. @@ -65,16 +74,13 @@ An example structure is below. struct foo_data { struct i2c_client client; - struct semaphore lock; /* For ISA access in `sensors' drivers. */ - int sysctl_id; /* To keep the /proc directory entry for - `sensors' drivers. */ enum chips type; /* To keep the chips type for `sensors' drivers. 
*/ /* Because the i2c bus is slow, it is often useful to cache the read information of a chip for some time (for example, 1 or 2 seconds). It depends of course on the device whether this is really worthwhile or even sensible. */ - struct semaphore update_lock; /* When we are reading lots of information, + struct mutex update_lock; /* When we are reading lots of information, another process should not update the below information */ char valid; /* != 0 if the following fields are valid. */ @@ -95,8 +101,7 @@ some obscure clients). But we need generic reading and writing routines. I have found it useful to define foo_read and foo_write function for this. For some cases, it will be easier to call the i2c functions directly, but many chips have some kind of register-value idea that can easily -be encapsulated. Also, some chips have both ISA and I2C interfaces, and -it useful to abstract from this (only for `sensors' drivers). +be encapsulated. The below functions are simple examples, and should not be copied literally. @@ -119,28 +124,101 @@ literally. return i2c_smbus_write_word_data(client,reg,value); } -For sensors code, you may have to cope with ISA registers too. Something -like the below often works. Note the locking! - - int foo_read_value(struct i2c_client *client, u8 reg) - { - int res; - if (i2c_is_isa_client(client)) { - down(&(((struct foo_data *) (client->data)) -> lock)); - outb_p(reg,client->addr + FOO_ADDR_REG_OFFSET); - res = inb_p(client->addr + FOO_DATA_REG_OFFSET); - up(&(((struct foo_data *) (client->data)) -> lock)); - return res; - } else - return i2c_smbus_read_byte_data(client,reg); - } - -Writing is done the same way. - Probing and attaching ===================== +The Linux I2C stack was originally written to support access to hardware +monitoring chips on PC motherboards, and thus it embeds some assumptions +that are more appropriate to SMBus (and PCs) than to I2C. 
One of these +assumptions is that most adapters and device drivers support the SMBUS_QUICK +protocol to probe device presence. Another is that devices and their drivers +can be sufficiently configured using only such probe primitives. + +As Linux and its I2C stack became more widely used in embedded systems +and complex components such as DVB adapters, those assumptions became more +problematic. Drivers for I2C devices that issue interrupts need more (and +different) configuration information, as do drivers handling chip variants +that can't be distinguished by protocol probing, or which need some board +specific information to operate correctly. + +Accordingly, the I2C stack now has two models for associating I2C devices +with their drivers: the original "legacy" model, and a newer one that's +fully compatible with the Linux 2.6 driver model. These models do not mix, +since the "legacy" model requires drivers to create "i2c_client" device +objects after SMBus style probing, while the Linux driver model expects +drivers to be given such device objects in their probe() routines. + + +Standard Driver Model Binding ("New Style") +------------------------------------------- + +System infrastructure, typically board-specific initialization code or +boot firmware, reports what I2C devices exist. For example, there may be +a table, in the kernel or from the boot loader, identifying I2C devices +and linking them to board-specific configuration information about IRQs +and other wiring artifacts, chip type, and so on. That could be used to +create i2c_client objects for each I2C device. + +I2C device drivers using this binding model work just like any other +kind of driver in Linux: they provide a probe() method to bind to +those devices, and a remove() method to unbind. + + static int foo_probe(struct i2c_client *client); + static int foo_remove(struct i2c_client *client); + +Remember that the i2c_driver does not create those client handles.
The +handle may be used during foo_probe(). If foo_probe() reports success +(zero not a negative status code) it may save the handle and use it until +foo_remove() returns. That binding model is used by most Linux drivers. + +Drivers match devices when i2c_client.driver_name and the driver name are +the same; this approach is used in several other busses that don't have +device typing support in the hardware. The driver and module name should +match, so hotplug/coldplug mechanisms will modprobe the driver. + + +Device Creation (Standard driver model) +--------------------------------------- + +If you know for a fact that an I2C device is connected to a given I2C bus, +you can instantiate that device by simply filling an i2c_board_info +structure with the device address and driver name, and calling +i2c_new_device(). This will create the device, then the driver core will +take care of finding the right driver and will call its probe() method. +If a driver supports different device types, you can specify the type you +want using the type field. You can also specify an IRQ and platform data +if needed. + +Sometimes you know that a device is connected to a given I2C bus, but you +don't know the exact address it uses. This happens on TV adapters for +example, where the same driver supports dozens of slightly different +models, and I2C device addresses change from one model to the next. In +that case, you can use the i2c_new_probed_device() variant, which is +similar to i2c_new_device(), except that it takes an additional list of +possible I2C addresses to probe. A device is created for the first +responsive address in the list. If you expect more than one device to be +present in the address range, simply call i2c_new_probed_device() that +many times. + +The call to i2c_new_device() or i2c_new_probed_device() typically happens +in the I2C bus driver. You may want to save the returned i2c_client +reference for later use. 
+ + +Device Deletion (Standard driver model) +--------------------------------------- + +Each I2C device which has been created using i2c_new_device() or +i2c_new_probed_device() can be unregistered by calling +i2c_unregister_device(). If you don't call it explicitly, it will be +called automatically before the underlying I2C bus itself is removed, as a +device can't survive its parent in the device driver model. + + +Legacy Driver Binding Model +--------------------------- + Most i2c devices can be present on several i2c addresses; for some this is determined in hardware (by soldering some chip pins to Vcc or Ground), for others this can be changed in software (by writing to specific client @@ -157,13 +235,9 @@ detection algorithm. You do not have to use this parameter interface; but don't try to use function i2c_probe() if you don't. -NOTE: If you want to write a `sensors' driver, the interface is slightly - different! See below. - - -Probing classes ---------------- +Probing classes (Legacy model) +------------------------------ All parameters are given as lists of unsigned 16-bit integers. Lists are terminated by I2C_CLIENT_END. @@ -210,8 +284,8 @@ Note that you *have* to call the defined variable `normal_i2c', without any prefix! -Attaching to an adapter ------------------------ +Attaching to an adapter (Legacy model) +-------------------------------------- Whenever a new adapter is inserted, or for all adapters if the driver is being registered, the callback attach_adapter() is called. Now is the @@ -237,17 +311,13 @@ them (unless a `force' parameter was used). In addition, addresses that are already in use (by some other registered client) are skipped. -The detect client function --------------------------- +The detect client function (Legacy model) +----------------------------------------- The detect client function is called by i2c_probe. 
The `kind' parameter contains -1 for a probed detection, 0 for a forced detection, or a positive number for a forced detection with a chip type forced. -Below, some things are only needed if this is a `sensors' driver. Those -parts are between /* SENSORS ONLY START */ and /* SENSORS ONLY END */ -markers. - Returning an error different from -ENODEV in a detect function will cause the detection to stop: other addresses and adapters won't be scanned. This should only be done on fatal or internal errors, such as a memory @@ -256,64 +326,20 @@ shortage or i2c_attach_client failing. For now, you can ignore the `flags' parameter. It is there for future use. int foo_detect_client(struct i2c_adapter *adapter, int address, - unsigned short flags, int kind) + int kind) { int err = 0; int i; - struct i2c_client *new_client; + struct i2c_client *client; struct foo_data *data; - const char *client_name = ""; /* For non-`sensors' drivers, put the real - name here! */ + const char *name = ""; /* Let's see whether this adapter can support what we need. - Please substitute the things you need here! - For `sensors' drivers, add `! is_isa &&' to the if statement */ + Please substitute the things you need here! */ if (!i2c_check_functionality(adapter,I2C_FUNC_SMBUS_WORD_DATA | I2C_FUNC_SMBUS_WRITE_BYTE)) goto ERROR0; - /* SENSORS ONLY START */ - const char *type_name = ""; - int is_isa = i2c_is_isa_adapter(adapter); - - /* Do this only if the chip can additionally be found on the ISA bus - (hybrid chip). */ - - if (is_isa) { - - /* Discard immediately if this ISA range is already used */ - /* FIXME: never use check_region(), only request_region() */ - if (check_region(address,FOO_EXTENT)) - goto ERROR0; - - /* Probe whether there is anything on this address. - Some example code is below, but you will have to adapt this - for your own driver */ - - if (kind < 0) /* Only if no force parameter was used */ { - /* We may need long timeouts at least for some chips. 
*/ - #define REALLY_SLOW_IO - i = inb_p(address + 1); - if (inb_p(address + 2) != i) - goto ERROR0; - if (inb_p(address + 3) != i) - goto ERROR0; - if (inb_p(address + 7) != i) - goto ERROR0; - #undef REALLY_SLOW_IO - - /* Let's just hope nothing breaks here */ - i = inb_p(address + 5) & 0x7f; - outb_p(~i & 0x7f,address+5); - if ((inb_p(address + 5) & 0x7f) != (~i & 0x7f)) { - outb_p(i,address+5); - return 0; - } - } - } - - /* SENSORS ONLY END */ - /* OK. For now, we presume we have a valid client. We now create the client structure, even though we cannot fill it completely yet. But it allows us to access several i2c functions safely */ @@ -323,13 +349,12 @@ For now, you can ignore the `flags' parameter. It is there for future use. goto ERROR0; } - new_client = &data->client; - i2c_set_clientdata(new_client, data); + client = &data->client; + i2c_set_clientdata(client, data); - new_client->addr = address; - new_client->adapter = adapter; - new_client->driver = &foo_driver; - new_client->flags = 0; + client->addr = address; + client->adapter = adapter; + client->driver = &foo_driver; /* Now, we do the remaining detection. If no `force' parameter is used. */ @@ -337,19 +362,17 @@ For now, you can ignore the `flags' parameter. It is there for future use. parameter was used. */ if (kind < 0) { /* The below is of course bogus */ - if (foo_read(new_client,FOO_REG_GENERIC) != FOO_GENERIC_VALUE) + if (foo_read(client, FOO_REG_GENERIC) != FOO_GENERIC_VALUE) goto ERROR1; } - /* SENSORS ONLY START */ - /* Next, specific detection. This is especially important for `sensors' devices. */ /* Determine the chip type. Not needed if a `force_CHIPTYPE' parameter was used. */ if (kind <= 0) { - i = foo_read(new_client,FOO_REG_CHIPTYPE); + i = foo_read(client, FOO_REG_CHIPTYPE); if (i == FOO_TYPE_1) kind = chip1; /* As defined in the enum */ else if (i == FOO_TYPE_2) @@ -363,63 +386,31 @@ For now, you can ignore the `flags' parameter. It is there for future use. 
/* Now set the type and chip names */ if (kind == chip1) { - type_name = "chip1"; /* For /proc entry */ - client_name = "CHIP 1"; + name = "chip1"; } else if (kind == chip2) { - type_name = "chip2"; /* For /proc entry */ - client_name = "CHIP 2"; + name = "chip2"; } - /* Reserve the ISA region */ - if (is_isa) - request_region(address,FOO_EXTENT,type_name); - - /* SENSORS ONLY END */ - /* Fill in the remaining client fields. */ - strcpy(new_client->name,client_name); - - /* SENSORS ONLY BEGIN */ + strlcpy(client->name, name, I2C_NAME_SIZE); data->type = kind; - /* SENSORS ONLY END */ - - data->valid = 0; /* Only if you use this field */ - init_MUTEX(&data->update_lock); /* Only if you use this field */ + mutex_init(&data->update_lock); /* Only if you use this field */ /* Any other initializations in data must be done here too. */ - /* Tell the i2c layer a new client has arrived */ - if ((err = i2c_attach_client(new_client))) - goto ERROR3; - - /* SENSORS ONLY BEGIN */ - /* Register a new directory entry with module sensors. See below for - the `template' structure. */ - if ((i = i2c_register_entry(new_client, type_name, - foo_dir_table_template,THIS_MODULE)) < 0) { - err = i; - goto ERROR4; - } - data->sysctl_id = i; - - /* SENSORS ONLY END */ - /* This function can write default values to the client registers, if needed. */ - foo_init_client(new_client); + foo_init_client(client); + + /* Tell the i2c layer a new client has arrived */ + if ((err = i2c_attach_client(client))) + goto ERROR1; + return 0; /* OK, this is not exactly good programming practice, usually. But it is very code-efficient in this case. */ - ERROR4: - i2c_detach_client(new_client); - ERROR3: - ERROR2: - /* SENSORS ONLY START */ - if (is_isa) - release_region(address,FOO_EXTENT); - /* SENSORS ONLY END */ ERROR1: kfree(data); ERROR0: @@ -427,8 +418,8 @@ For now, you can ignore the `flags' parameter. It is there for future use. 
} -Removing the client -=================== +Removing the client (Legacy model) +================================== The detach_client call back function is called when a client should be removed. It may actually fail, but only when panicking. This code is @@ -436,22 +427,12 @@ much simpler than the attachment code, fortunately! int foo_detach_client(struct i2c_client *client) { - int err,i; - - /* SENSORS ONLY START */ - /* Deregister with the `i2c-proc' module. */ - i2c_deregister_entry(((struct lm78_data *)(client->data))->sysctl_id); - /* SENSORS ONLY END */ + int err; /* Try to detach the client from i2c space */ if ((err = i2c_detach_client(client))) return err; - /* HYBRID SENSORS CHIP ONLY START */ - if i2c_is_isa_client(client) - release_region(client->addr,LM78_EXTENT); - /* HYBRID SENSORS CHIP ONLY END */ - kfree(i2c_get_clientdata(client)); return 0; } @@ -464,45 +445,34 @@ When the kernel is booted, or when your foo driver module is inserted, you have to do some initializing. Fortunately, just attaching (registering) the driver module is usually enough. - /* Keep track of how far we got in the initialization process. If several - things have to initialized, and we fail halfway, only those things - have to be cleaned up! */ - static int __initdata foo_initialized = 0; - static int __init foo_init(void) { int res; - printk("foo version %s (%s)\n",FOO_VERSION,FOO_DATE); if ((res = i2c_add_driver(&foo_driver))) { printk("foo: Driver registration failed, module not inserted.\n"); - foo_cleanup(); return res; } - foo_initialized ++; return 0; } - void foo_cleanup(void) + static void __exit foo_cleanup(void) { - if (foo_initialized == 1) { - if ((res = i2c_del_driver(&foo_driver))) { - printk("foo: Driver registration failed, module not removed.\n"); - return; - } - foo_initialized --; - } + i2c_del_driver(&foo_driver); } /* Substitute your own name and email address */ MODULE_AUTHOR("Frodo Looijaard <frodol@dds.nl>" MODULE_DESCRIPTION("Driver for Barf Inc. 
Foo I2C devices"); + /* a few non-GPL license types are also allowed */ + MODULE_LICENSE("GPL"); + module_init(foo_init); module_exit(foo_cleanup); Note that some functions are marked by `__init', and some data structures -by `__init_data'. Hose functions and structures can be removed after +by `__initdata'. These functions and structures can be removed after kernel booting (or module loading) is completed. @@ -632,110 +602,7 @@ General purpose routines Below all general purpose routines are listed, that were not mentioned before. - /* This call returns a unique low identifier for each registered adapter, - * or -1 if the adapter was not registered. + /* This call returns a unique low identifier for each registered adapter. */ extern int i2c_adapter_id(struct i2c_adapter *adap); - -The sensors sysctl/proc interface -================================= - -This section only applies if you write `sensors' drivers. - -Each sensors driver creates a directory in /proc/sys/dev/sensors for each -registered client. The directory is called something like foo-i2c-4-65. -The sensors module helps you to do this as easily as possible. - -The template ------------- - -You will need to define a ctl_table template. This template will automatically -be copied to a newly allocated structure and filled in where necessary when -you call sensors_register_entry. - -First, I will give an example definition. - static ctl_table foo_dir_table_template[] = { - { FOO_SYSCTL_FUNC1, "func1", NULL, 0, 0644, NULL, &i2c_proc_real, - &i2c_sysctl_real,NULL,&foo_func }, - { FOO_SYSCTL_FUNC2, "func2", NULL, 0, 0644, NULL, &i2c_proc_real, - &i2c_sysctl_real,NULL,&foo_func }, - { FOO_SYSCTL_DATA, "data", NULL, 0, 0644, NULL, &i2c_proc_real, - &i2c_sysctl_real,NULL,&foo_data }, - { 0 } - }; - -In the above example, three entries are defined. 
They can either be -accessed through the /proc interface, in the /proc/sys/dev/sensors/* -directories, as files named func1, func2 and data, or alternatively -through the sysctl interface, in the appropriate table, with identifiers -FOO_SYSCTL_FUNC1, FOO_SYSCTL_FUNC2 and FOO_SYSCTL_DATA. - -The third, sixth and ninth parameters should always be NULL, and the -fourth should always be 0. The fifth is the mode of the /proc file; -0644 is safe, as the file will be owned by root:root. - -The seventh and eighth parameters should be &i2c_proc_real and -&i2c_sysctl_real if you want to export lists of reals (scaled -integers). You can also use your own function for them, as usual. -Finally, the last parameter is the call-back to gather the data -(see below) if you use the *_proc_real functions. - - -Gathering the data ------------------- - -The call back functions (foo_func and foo_data in the above example) -can be called in several ways; the operation parameter determines -what should be done: - - * If operation == SENSORS_PROC_REAL_INFO, you must return the - magnitude (scaling) in nrels_mag; - * If operation == SENSORS_PROC_REAL_READ, you must read information - from the chip and return it in results. The number of integers - to display should be put in nrels_mag; - * If operation == SENSORS_PROC_REAL_WRITE, you must write the - supplied information to the chip. nrels_mag will contain the number - of integers, results the integers themselves. - -The *_proc_real functions will display the elements as reals for the -/proc interface. If you set the magnitude to 2, and supply 345 for -SENSORS_PROC_REAL_READ, it would display 3.45; and if the user would -write 45.6 to the /proc file, it would be returned as 4560 for -SENSORS_PROC_REAL_WRITE. A magnitude may even be negative! - -An example function: - - /* FOO_FROM_REG and FOO_TO_REG translate between scaled values and - register values. Note the use of the read cache. 
*/ - void foo_in(struct i2c_client *client, int operation, int ctl_name, - int *nrels_mag, long *results) - { - struct foo_data *data = client->data; - int nr = ctl_name - FOO_SYSCTL_FUNC1; /* reduce to 0 upwards */ - - if (operation == SENSORS_PROC_REAL_INFO) - *nrels_mag = 2; - else if (operation == SENSORS_PROC_REAL_READ) { - /* Update the readings cache (if necessary) */ - foo_update_client(client); - /* Get the readings from the cache */ - results[0] = FOO_FROM_REG(data->foo_func_base[nr]); - results[1] = FOO_FROM_REG(data->foo_func_more[nr]); - results[2] = FOO_FROM_REG(data->foo_func_readonly[nr]); - *nrels_mag = 2; - } else if (operation == SENSORS_PROC_REAL_WRITE) { - if (*nrels_mag >= 1) { - /* Update the cache */ - data->foo_base[nr] = FOO_TO_REG(results[0]); - /* Update the chip */ - foo_write_value(client,FOO_REG_FUNC_BASE(nr),data->foo_base[nr]); - } - if (*nrels_mag >= 2) { - /* Update the cache */ - data->foo_more[nr] = FOO_TO_REG(results[1]); - /* Update the chip */ - foo_write_value(client,FOO_REG_FUNC_MORE(nr),data->foo_more[nr]); - } - } - } diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt index 38fe1f03fb1..6498666ea33 100644 --- a/Documentation/i386/boot.txt +++ b/Documentation/i386/boot.txt @@ -2,7 +2,7 @@ ---------------------------- H. Peter Anvin <hpa@zytor.com> - Last update 2007-01-26 + Last update 2007-03-06 On the i386 platform, the Linux kernel uses a rather complicated boot convention. This has evolved partially due to historical aspects, as @@ -35,9 +35,13 @@ Protocol 2.03: (Kernel 2.4.18-pre1) Explicitly makes the highest possible initrd address available to the bootloader. Protocol 2.04: (Kernel 2.6.14) Extend the syssize field to four bytes. + Protocol 2.05: (Kernel 2.6.20) Make protected mode kernel relocatable. Introduce relocatable_kernel and kernel_alignment fields. 
+Protocol 2.06: (Kernel 2.6.22) Added a field that contains the size of + the boot command line + **** MEMORY LAYOUT @@ -133,6 +137,8 @@ Offset Proto Name Meaning 022C/4 2.03+ initrd_addr_max Highest legal initrd address 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not +0235/3 N/A pad2 Unused +0238/4 2.06+ cmdline_size Maximum size of the kernel command line (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -233,6 +239,12 @@ filled out, however: if your ramdisk is exactly 131072 bytes long and this field is 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) + cmdline_size: + The maximum size of the command line without the terminating + zero. This means that the command line can contain at most + cmdline_size characters. With protocol version 2.05 and + earlier, the maximum size was 255. + **** THE KERNEL COMMAND LINE @@ -241,11 +253,10 @@ loader to communicate with the kernel. Some of its options are also relevant to the boot loader itself, see "special command line options" below. -The kernel command line is a null-terminated string currently up to -255 characters long, plus the final null. A string that is too long -will be automatically truncated by the kernel, a boot loader may allow -a longer command line to be passed to permit future kernels to extend -this limit. +The kernel command line is a null-terminated string. The maximum +length can be retrieved from the field cmdline_size. Before protocol +version 2.06, the maximum was 255 characters. A string that is too +long will be automatically truncated by the kernel. 
If the boot protocol version is 2.02 or later, the address of the kernel command line is given by the header field cmd_line_ptr (see diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt index 180e0689676..d9d523099bb 100644 --- a/Documentation/input/input-programming.txt +++ b/Documentation/input/input-programming.txt @@ -1,5 +1,3 @@ -$Id: input-programming.txt,v 1.4 2001/05/04 09:47:14 vojtech Exp $ - Programming input drivers ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -20,28 +18,51 @@ pressed or released a BUTTON_IRQ happens. The driver could look like: #include <asm/irq.h> #include <asm/io.h> +static struct input_dev *button_dev; + static void button_interrupt(int irq, void *dummy, struct pt_regs *fp) { - input_report_key(&button_dev, BTN_1, inb(BUTTON_PORT) & 1); - input_sync(&button_dev); + input_report_key(button_dev, BTN_1, inb(BUTTON_PORT) & 1); + input_sync(button_dev); } static int __init button_init(void) { + int error; + if (request_irq(BUTTON_IRQ, button_interrupt, 0, "button", NULL)) { printk(KERN_ERR "button.c: Can't allocate irq %d\n", button_irq); return -EBUSY; } - - button_dev.evbit[0] = BIT(EV_KEY); - button_dev.keybit[LONG(BTN_0)] = BIT(BTN_0); - - input_register_device(&button_dev); + + button_dev = input_allocate_device(); + if (!button_dev) { + printk(KERN_ERR "button.c: Not enough memory\n"); + error = -ENOMEM; + goto err_free_irq; + } + + button_dev->evbit[0] = BIT(EV_KEY); + button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); + + error = input_register_device(button_dev); + if (error) { + printk(KERN_ERR "button.c: Failed to register device\n"); + goto err_free_dev; + } + + return 0; + + err_free_dev: + input_free_device(button_dev); + err_free_irq: + free_irq(BUTTON_IRQ, button_interrupt); + return error; } static void __exit button_exit(void) { - input_unregister_device(&button_dev); + input_unregister_device(button_dev); free_irq(BUTTON_IRQ, button_interrupt); } @@ -58,17 +79,18 @@ In the _init function, which 
is called either upon module load or when booting the kernel, it grabs the required resources (it should also check for the presence of the device). -Then it sets the input bitfields. This way the device driver tells the other +Then it allocates a new input device structure with input_allocate_device() +and sets up input bitfields. This way the device driver tells the other parts of the input systems what it is - what events can be generated or -accepted by this input device. Our example device can only generate EV_KEY type -events, and from those only BTN_0 event code. Thus we only set these two -bits. We could have used +accepted by this input device. Our example device can only generate EV_KEY +type events, and from those only BTN_0 event code. Thus we only set these +two bits. We could have used set_bit(EV_KEY, button_dev.evbit); set_bit(BTN_0, button_dev.keybit); as well, but with more than single bits the first approach tends to be -shorter. +shorter. Then the example driver registers the input device structure by calling @@ -76,16 +98,15 @@ Then the example driver registers the input device structure by calling This adds the button_dev structure to linked lists of the input driver and calls device handler modules _connect functions to tell them a new input -device has appeared. Because the _connect functions may call kmalloc(, -GFP_KERNEL), which can sleep, input_register_device() must not be called -from an interrupt or with a spinlock held. +device has appeared. input_register_device() may sleep and therefore must +not be called from an interrupt or with a spinlock held. While in use, the only used function of the driver is button_interrupt() which upon every interrupt from the button checks its state and reports it -via the +via the input_report_key() @@ -113,16 +134,10 @@ can use the open and close callback to know when it can stop polling or release the interrupt and when it must resume polling or grab the interrupt again. 
To do that, we would add this to our example driver: -int button_used = 0; - static int button_open(struct input_dev *dev) { - if (button_used++) - return 0; - if (request_irq(BUTTON_IRQ, button_interrupt, 0, "button", NULL)) { printk(KERN_ERR "button.c: Can't allocate irq %d\n", button_irq); - button_used--; return -EBUSY; } @@ -131,20 +146,21 @@ static int button_open(struct input_dev *dev) static void button_close(struct input_dev *dev) { - if (!--button_used) - free_irq(IRQ_AMIGA_VERTB, button_interrupt); + free_irq(IRQ_AMIGA_VERTB, button_interrupt); } static int __init button_init(void) { ... - button_dev.open = button_open; - button_dev.close = button_close; + button_dev->open = button_open; + button_dev->close = button_close; ... } -Note the button_used variable - we have to track how many times the open -function was called to know when exactly our device stops being used. +Note that input core keeps track of number of users for the device and +makes sure that dev->open() is called only when the first user connects +to the device and that dev->close() is called when the very last user +disconnects. Calls to both callbacks are serialized. The open() callback should return a 0 in case of success or any nonzero value in case of failure. The close() callback (which is void) must always succeed. @@ -175,7 +191,7 @@ set the corresponding bits and call the input_report_rel(struct input_dev *dev, int code, int value) -function. Events are generated only for nonzero value. +function. Events are generated only for nonzero value. However EV_ABS requires a little special care. 
Before calling input_register_device, you have to fill additional fields in the input_dev @@ -187,6 +203,10 @@ the ABS_X axis: button_dev.absfuzz[ABS_X] = 4; button_dev.absflat[ABS_X] = 8; +Or, you can just say: + + input_set_abs_params(button_dev, ABS_X, 0, 255, 4, 8); + This setting would be appropriate for a joystick X axis, with the minimum of 0, maximum of 255 (which the joystick *must* be able to reach, no problem if it sometimes reports more, but it must be able to always reach the min and @@ -197,14 +217,7 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean that the thing is precise and always returns to exactly the center position (if it has any). -1.4 The void *private field -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This field in the input structure can be used to point to any private data -structures in the input device driver, in case the driver handles more than -one device. You'll need it in the open and close callbacks. - -1.5 NBITS(), LONG(), BIT() +1.4 NBITS(), LONG(), BIT() ~~~~~~~~~~~~~~~~~~~~~~~~~~ These three macros from input.h help some bitfield computations: @@ -213,13 +226,9 @@ These three macros from input.h help some bitfield computations: LONG(x) - returns the index in the array in longs for bit x BIT(x) - returns the index in a long for bit x -1.6 The number, id* and name fields +1.5 The id* and name fields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The dev->number is assigned by the input system to the input device when it -is registered. It has no use except for identifying the device to the user -in system messages. - The dev->name should be set before registering the input device by the input device driver. It's a string like 'Generic button device' containing a user friendly name of the device. @@ -234,15 +243,25 @@ driver. The id and name fields can be passed to userland via the evdev interface. 
-1.7 The keycode, keycodemax, keycodesize fields +1.6 The keycode, keycodemax, keycodesize fields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -These two fields will be used for any input devices that report their data -as scancodes. If not all scancodes can be known by autodetection, they may -need to be set by userland utilities. The keycode array then is an array -used to map from scancodes to input system keycodes. The keycode max will -contain the size of the array and keycodesize the size of each entry in it -(in bytes). +These three fields should be used by input devices that have dense keymaps. +The keycode is an array used to map from scancodes to input system keycodes. +The keycode max should contain the size of the array and keycodesize the +size of each entry in it (in bytes). + +Userspace can query and alter current scancode to keycode mappings using +EVIOCGKEYCODE and EVIOCSKEYCODE ioctls on corresponding evdev interface. +When a device has all 3 aforementioned fields filled in, the driver may +rely on kernel's default implementation of setting and querying keycode +mappings. + +1.7 dev->getkeycode() and dev->setkeycode() +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +getkeycode() and setkeycode() callbacks allow drivers to override default +keycode/keycodesize/keycodemax mapping mechanism provided by input core +and implement sparse keycode maps. 1.8 Key autorepeat ~~~~~~~~~~~~~~~~~~ @@ -266,7 +285,7 @@ direction - from the system to the input device driver. 
If your input device driver can handle these events, it has to set the respective bits in evbit, *and* also the callback routine: - button_dev.event = button_event; + button_dev->event = button_event; int button_event(struct input_dev *dev, unsigned int type, unsigned int code, int value); { diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.txt index 769ee05ee4d..1d247d59ad5 100644 --- a/Documentation/kbuild/modules.txt +++ b/Documentation/kbuild/modules.txt @@ -249,7 +249,7 @@ following files: --> filename: Makefile KERNELDIR := /lib/modules/`uname -r`/build all:: - $(MAKE) -C $KERNELDIR M=`pwd` $@ + $(MAKE) -C $(KERNELDIR) M=`pwd` $@ # Module specific targets genbin: diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 84c3bd05c63..38d7db3262c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -64,6 +64,7 @@ parameter is applicable: GENERIC_TIME The generic timeofday code is enabled. NFS Appropriate NFS support is enabled. OSS OSS sound support is enabled. + PV_OPS A paravirtualized kernel PARIDE The ParIDE subsystem is enabled. PARISC The PA-RISC architecture is enabled. PCI PCI bus support is enabled. @@ -695,8 +696,15 @@ and is between 256 and 4096 characters. It is defined in the file idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed See Documentation/ide.txt. - idle= [HW] - Format: idle=poll or idle=halt + idle= [X86] + Format: idle=poll or idle=mwait + Poll forces a polling idle loop that can slightly improve the performance + of waking up an idle CPU, but will use a lot of power and make the system + run hot. Not recommended. + idle=mwait. On systems which support MONITOR/MWAIT but the kernel chose + to not use it because it doesn't save as much power as a normal idle + loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same + as idle=poll. 
ignore_loglevel [KNL] Ignore loglevel setting - this will print /all/ @@ -1157,6 +1165,11 @@ and is between 256 and 4096 characters. It is defined in the file nomce [IA-32] Machine Check Exception + noreplace-paravirt [IA-32,PV_OPS] Don't patch paravirt_ops + + noreplace-smp [IA-32,SMP] Don't replace SMP instructions + with UP alternatives + noresidual [PPC] Don't use residual data on PReP machines. noresume [SWSUSP] Disables resume and restores original swap @@ -1562,6 +1575,9 @@ and is between 256 and 4096 characters. It is defined in the file smart2= [HW] Format: <io1>[,<io2>[,...,<io8>]] + smp-alt-once [IA-32,SMP] On a hotplug CPU system, only + attempt to substitute SMP alternatives once at boot. + snd-ad1816a= [HW,ALSA] snd-ad1848= [HW,ALSA] @@ -1820,6 +1836,7 @@ and is between 256 and 4096 characters. It is defined in the file [USBHID] The interval which mice are to be polled at. vdso= [IA-32,SH] + vdso=2: enable compat VDSO (default with COMPAT_VDSO) vdso=1: enable VDSO (default) vdso=0: disable VDSO mapping diff --git a/Documentation/pci.txt b/Documentation/pci.txt index cdf2f3c0ab1..e2c9d0a0c43 100644 --- a/Documentation/pci.txt +++ b/Documentation/pci.txt @@ -124,10 +124,6 @@ initialization with a pointer to a structure describing the driver err_handler See Documentation/pci-error-recovery.txt - multithread_probe Enable multi-threaded probe/scan. Driver must - provide its own locking/syncronization for init - operations if this is enabled. - The ID table is an array of struct pci_device_id entries ending with an all-zero entry. Each entry consists of: @@ -163,9 +159,9 @@ echo "vendor device subvendor subdevice class class_mask driver_data" > \ /sys/bus/pci/drivers/{driver}/new_id All fields are passed in as hexadecimal values (no leading 0x). -Users need pass only as many fields as necessary: - o vendor, device, subvendor, and subdevice fields default - to PCI_ANY_ID (FFFFFFFF), +The vendor and device fields are mandatory, the others are optional. 
Users +need pass only as many optional fields as necessary: + o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF) o class and classmask fields default to 0 o driver_data defaults to 0UL. @@ -549,8 +545,6 @@ pci_find_slot() Find pci_dev corresponding to given bus and pci_set_power_state() Set PCI Power Management state (0=D0 ... 3=D3) pci_find_capability() Find specified capability in device's capability list. -pci_module_init() Inline helper function for ensuring correct - pci_driver initialization and error handling. pci_resource_start() Returns bus start address for a given PCI region pci_resource_end() Returns bus end address for a given PCI region pci_resource_len() Returns the byte length of a PCI region diff --git a/Documentation/pcmcia/driver.txt b/Documentation/pcmcia/driver.txt new file mode 100644 index 00000000000..0ac16792077 --- /dev/null +++ b/Documentation/pcmcia/driver.txt @@ -0,0 +1,30 @@ +PCMCIA Driver +------------- + + +sysfs +----- + +New PCMCIA IDs may be added to a device driver pcmcia_device_id table at +runtime as shown below: + +echo "match_flags manf_id card_id func_id function device_no \ +prod_id_hash[0] prod_id_hash[1] prod_id_hash[2] prod_id_hash[3]" > \ +/sys/bus/pcmcia/drivers/{driver}/new_id + +All fields are passed in as hexadecimal values (no leading 0x). +The meaning is described in the PCMCIA specification, the match_flags is +a bitwise or-ed combination from PCMCIA_DEV_ID_MATCH_* constants +defined in include/linux/mod_devicetable.h. + +Once added, the driver probe routine will be invoked for any unclaimed +PCMCIA device listed in its (newly updated) pcmcia_device_id list. + +A common use-case is to add a new device according to the manufacturer ID +and the card ID (from the manf_id and card_id file in the device tree). +For this, just use: + +echo "0x3 manf_id card_id 0 0 0 0 0 0 0" > \ + /sys/bus/pcmcia/drivers/{driver}/new_id + +after loading the driver. 
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt index 74311d7e0f3..fd5192a8fa8 100644 --- a/Documentation/power/interface.txt +++ b/Documentation/power/interface.txt @@ -18,17 +18,10 @@ states. /sys/power/disk controls the operating mode of the suspend-to-disk -mechanism. Suspend-to-disk can be handled in several ways. The -greatest distinction is who writes memory to disk - the firmware or -the kernel. If the firmware does it, we assume that it also handles -suspending the system. - -If the kernel does it, then we have three options for putting the system -to sleep - using the platform driver (e.g. ACPI or other PM -registers), powering off the system or rebooting the system (for -testing). The system will support either 'firmware' or 'platform', and -that is known a priori. But, the user may choose 'shutdown' or -'reboot' as alternatives. +mechanism. Suspend-to-disk can be handled in several ways. We have a +few options for putting the system to sleep - using the platform driver +(e.g. ACPI or other pm_ops), powering off the system or rebooting the +system (for testing). Additionally, /sys/power/disk can be used to turn on one of the two testing modes of the suspend-to-disk mechanism: 'testproc' or 'test'. If the @@ -41,19 +34,19 @@ for 5 seconds, resume devices, unfreeze tasks and enable nonboot CPUs. Then, we are able to look in the log messages and work out, for example, which code is being slow and which device drivers are misbehaving. -Reading from this file will display what the mode is currently set -to. Writing to this file will accept one of +Reading from this file will display all supported modes and the currently +selected one in brackets, for example - 'firmware' - 'platform' + [shutdown] reboot test testproc + +Writing to this file will accept one of + + 'platform' (only if the platform supports it) 'shutdown' 'reboot' 'testproc' 'test' -It will only change to 'firmware' or 'platform' if the system supports -it. 
+ /sys/power/image_size controls the size of the image created by the suspend-to-disk mechanism. It can be written a string representing a non-negative integer that will be used as an upper diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt index b6a3cbf7e84..e00b099a4b8 100644 --- a/Documentation/power/pci.txt +++ b/Documentation/power/pci.txt @@ -203,7 +203,7 @@ resume Usage: -if (dev->driver && dev->driver->suspend) +if (dev->driver && dev->driver->resume) dev->driver->resume(dev) The resume callback may be called from any power state, and is always meant to diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 0931a330d36..34800cc521b 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -62,17 +62,18 @@ setup via another operating system for it to use. Despite the inconvenience, this method requires minimal work by the kernel, since the firmware will also handle restoring memory contents on resume. -If the kernel is responsible for persistently saving state, a mechanism -called 'swsusp' (Swap Suspend) is used to write memory contents to -free swap space. swsusp has some restrictive requirements, but should -work in most cases. Some, albeit outdated, documentation can be found -in Documentation/power/swsusp.txt. +For suspend-to-disk, a mechanism called 'swsusp' (Swap +Suspend) is used to write memory contents to free swap space. +swsusp has some restrictive requirements, but should work in most +cases. Some, albeit outdated, documentation can be found in +Documentation/power/swsusp.txt. Alternatively, userspace can do most +of the actual suspend to disk work, see userland-swsusp.txt. Once memory state is written to disk, the system may either enter a low-power state (like ACPI S4), or it may simply power down. Powering down offers greater savings, and allows this mechanism to work on any system. 
However, entering a real low-power state allows the user to -trigger wake up events (e.g. pressing a key or opening a laptop lid). +trigger wake up events (e.g. pressing a key or opening a laptop lid). A transition from Suspend-to-Disk to the On state should take about 30 seconds, though it's typically a bit more with the current diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt index 0761ff6c57e..c55bd5079b9 100644 --- a/Documentation/power/swsusp.txt +++ b/Documentation/power/swsusp.txt @@ -156,8 +156,7 @@ instead set the PF_NOFREEZE process flag when creating the thread (and be very careful). -Q: What is the difference between "platform", "shutdown" and -"firmware" in /sys/power/disk? +Q: What is the difference between "platform" and "shutdown"? A: @@ -166,11 +165,8 @@ shutdown: save state in linux, then tell bios to powerdown platform: save state in linux, then tell bios to powerdown and blink "suspended led" -firmware: tell bios to save state itself [needs BIOS-specific suspend - partition, and has very little to do with swsusp] - -"platform" is actually right thing to do, but "shutdown" is most -reliable. +"platform" is actually right thing to do where supported, but +"shutdown" is most reliable (except on ACPI systems). Q: I do not understand why you have such strong objections to idea of selective suspend. @@ -388,8 +384,8 @@ while the system is asleep, maintaining the connection, using true sleep modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the /sys/power/state file; write "standby" or "mem".) We've not seen any hardware that can use these modes through software suspend, although in -theory some systems might support "platform" or "firmware" modes that -won't break the USB connections. +theory some systems might support "platform" modes that won't break the +USB connections. Remember that it's always a bad idea to unplug a disk drive containing a mounted filesystem. That's true even when your system is asleep! 
The diff --git a/Documentation/scsi/aacraid.txt b/Documentation/scsi/aacraid.txt index dc8e44fc650..2368e7e4a8c 100644 --- a/Documentation/scsi/aacraid.txt +++ b/Documentation/scsi/aacraid.txt @@ -37,7 +37,11 @@ Supported Cards/Chipsets 9005:0286:9005:029d Adaptec 2420SA (Intruder HP release) 9005:0286:9005:02ac Adaptec 1800 (Typhoon44) 9005:0285:9005:02b5 Adaptec 5445 (Voodoo44) + 9005:0285:15d9:02b5 SMC AOC-USAS-S4i + 9005:0285:15d9:02c9 SMC AOC-USAS-S4iR 9005:0285:9005:02b6 Adaptec 5805 (Voodoo80) + 9005:0285:15d9:02b6 SMC AOC-USAS-S8i + 9005:0285:15d9:02ca SMC AOC-USAS-S8iR 9005:0285:9005:02b7 Adaptec 5085 (Voodoo08) 9005:0285:9005:02bb Adaptec 3405 (Marauder40LP) 9005:0285:9005:02bc Adaptec 3805 (Marauder80LP) @@ -93,6 +97,9 @@ Supported Cards/Chipsets 9005:0286:9005:02ae (Aurora Lite ARK) 9005:0285:9005:02b0 (Sunrise Lake ARK) 9005:0285:9005:02b1 Adaptec (Voodoo 8 internal 8 external) + 9005:0285:108e:7aac SUN STK RAID REM (Voodoo44 Coyote) + 9005:0285:108e:0286 SUN SG-XPCIESAS-R-IN (Cougar) + 9005:0285:108e:0287 SUN SG-XPCIESAS-R-EX (Prometheus) People ------------------------- diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt index caf10b15518..88ef88b949f 100644 --- a/Documentation/scsi/ncr53c8xx.txt +++ b/Documentation/scsi/ncr53c8xx.txt @@ -562,11 +562,6 @@ if only one has a flaw for some SCSI feature, you can disable the support by the driver of this feature at linux start-up and enable this feature after boot-up only for devices that support it safely. -CONFIG_SCSI_NCR53C8XX_PROFILE_SUPPORT (default answer: n) - This option must be set for profiling information to be gathered - and printed out through the proc file system. This features may - impact performances. - CONFIG_SCSI_NCR53C8XX_IOMAPPED (default answer: n) Answer "y" if you suspect your mother board to not allow memory mapped I/O. May slow down performance a little. 
This option is required by diff --git a/Documentation/sh/clk.txt b/Documentation/sh/clk.txt new file mode 100644 index 00000000000..9aef710e9a4 --- /dev/null +++ b/Documentation/sh/clk.txt @@ -0,0 +1,32 @@ +Clock framework on SuperH architecture + +The framework on SH extends existing API by the function clk_set_rate_ex, +whose prototype is as follows: + + clk_set_rate_ex (struct clk *clk, unsigned long rate, int algo_id) + +The algo_id parameter is used to specify the algorithm used to recalculate clocks +adjacent to the clock specified as the first argument. It is assumed that algo_id==0 +means no changes to adjacent clocks. + +Internally, the clk_set_rate_ex forwards request to clk->ops->set_rate method, +if it is present in ops structure. The method should set the clock rate and adjust +all needed clocks according to the passed algo_id. +Exact values for algo_id are machine-dependent. For the sh7722, the following +values are defined: + + NO_CHANGE = 0, + IUS_N1_N1, /* I:U = N:1, U:Sh = N:1 */ + IUS_322, /* I:U:Sh = 3:2:2 */ + IUS_522, /* I:U:Sh = 5:2:2 */ + IUS_N11, /* I:U:Sh = N:1:1 */ + SB_N1, /* Sh:B = N:1 */ + SB3_N1, /* Sh:B3 = N:1 */ + SB3_32, /* Sh:B3 = 3:2 */ + SB3_43, /* Sh:B3 = 4:3 */ + SB3_54, /* Sh:B3 = 5:4 */ + BP_N1, /* B:P = N:1 */ + IP_N1 /* I:P = N:1 */ + +Each of these constants means relation between clocks that can be set via the FRQCR +register diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx index f9717fe9bd8..215e3b8e726 100644 --- a/Documentation/spi/pxa2xx +++ b/Documentation/spi/pxa2xx @@ -62,7 +62,7 @@ static struct resource pxa_spi_nssp_resources[] = { static struct pxa2xx_spi_master pxa_nssp_master_info = { .ssp_type = PXA25x_NSSP, /* Type of SSP */ - .clock_enable = CKEN9_NSSP, /* NSSP Peripheral clock */ + .clock_enable = CKEN_NSSP, /* NSSP Peripheral clock */ .num_chipselect = 1, /* Matches the number of chips attached to NSSP */ .enable_dma = 1, /* Enables NSSP DMA */ }; diff --git a/Documentation/sysctl/vm.txt
b/Documentation/sysctl/vm.txt index e96a341eb7e..1d192565e18 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -197,11 +197,22 @@ and may not be fast. panic_on_oom -This enables or disables panic on out-of-memory feature. If this is set to 1, -the kernel panics when out-of-memory happens. If this is set to 0, the kernel -will kill some rogue process, called oom_killer. Usually, oom_killer can kill -rogue processes and system will survive. If you want to panic the system -rather than killing rogue processes, set this to 1. +This enables or disables panic on out-of-memory feature. -The default value is 0. +If this is set to 0, the kernel will kill some rogue process, +called oom_killer. Usually, oom_killer can kill rogue processes and +system will survive. + +If this is set to 1, the kernel panics when out-of-memory happens. +However, if a process limits its allowed nodes with mempolicy/cpusets, +and those nodes run out of memory, one process +may be killed by oom-killer. No panic occurs in this case, +because other nodes' memory may be free. This means the system as a whole +may not be in a fatal state yet. +If this is set to 2, the kernel panics compulsorily even in the +above-mentioned case. + +The default value is 0. +1 and 2 are for failover of clustering. Please select either +according to your policy of failover. diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index d43aa9d3c10..ba328f25541 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -1,6 +1,6 @@ Linux Magic System Request Key Hacks Documentation for sysrq.c -Last update: 2007-JAN-06 +Last update: 2007-MAR-14 * What is the magic SysRq key? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -75,7 +75,7 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'f' - Will call oom_kill to kill a memory hog process. -'g' - Used by kgdb on ppc platforms. +'g' - Used by kgdb on ppc and sh platforms.
'h' - Will display help (actually any other key than those listed above will display help. but 'h' is easy to remember :-) diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt index d61f6e7865d..b18e86a2250 100644 --- a/Documentation/usb/usb-serial.txt +++ b/Documentation/usb/usb-serial.txt @@ -42,7 +42,7 @@ ConnectTech WhiteHEAT 4 port converter http://www.connecttech.com For any questions or problems with this driver, please contact - Stuart MacDonald at stuartm@connecttech.com + Connect Tech's Support Department at support@connecttech.com HandSpring Visor, Palm USB, and Clié USB driver diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c new file mode 100644 index 00000000000..41710ccf3a2 --- /dev/null +++ b/Documentation/vm/slabinfo.c @@ -0,0 +1,943 @@ +/* + * Slabinfo: Tool to get reports about slabs + * + * (C) 2007 sgi, Christoph Lameter <clameter@sgi.com> + * + * Compile by: + * + * gcc -o slabinfo slabinfo.c + */ +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <dirent.h> +#include <string.h> +#include <unistd.h> +#include <stdarg.h> +#include <getopt.h> +#include <regex.h> + +#define MAX_SLABS 500 +#define MAX_ALIASES 500 +#define MAX_NODES 1024 + +struct slabinfo { + char *name; + int alias; + int refs; + int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; + int hwcache_align, object_size, objs_per_slab; + int sanity_checks, slab_size, store_user, trace; + int order, poison, reclaim_account, red_zone; + unsigned long partial, objects, slabs; + int numa[MAX_NODES]; + int numa_partial[MAX_NODES]; +} slabinfo[MAX_SLABS]; + +struct aliasinfo { + char *name; + char *ref; + struct slabinfo *slab; +} aliasinfo[MAX_ALIASES]; + +int slabs = 0; +int aliases = 0; +int alias_targets = 0; +int highest_node = 0; + +char buffer[4096]; + +int show_alias = 0; +int show_slab = 0; +int skip_zero = 1; +int show_numa = 0; +int show_track = 0; +int show_first_alias = 0; +int validate = 0; +int 
shrink = 0; +int show_inverted = 0; +int show_single_ref = 0; +int show_totals = 0; +int sort_size = 0; + +int page_size; + +regex_t pattern; + +void fatal(const char *x, ...) +{ + va_list ap; + + va_start(ap, x); + vfprintf(stderr, x, ap); + va_end(ap); + exit(1); +} + +void usage(void) +{ + printf("slabinfo [-ahnpvtsz] [slab-regexp]\n" + "-a|--aliases Show aliases\n" + "-h|--help Show usage information\n" + "-n|--numa Show NUMA information\n" + "-s|--shrink Shrink slabs\n" + "-v|--validate Validate slabs\n" + "-t|--tracking Show alloc/free information\n" + "-T|--Totals Show summary information\n" + "-l|--slabs Show slabs\n" + "-S|--Size Sort by size\n" + "-z|--zero Include empty slabs\n" + "-f|--first-alias Show first alias\n" + "-i|--inverted Inverted list\n" + "-1|--1ref Single reference\n" + ); +} + +unsigned long read_obj(char *name) +{ + FILE *f = fopen(name, "r"); + + if (!f) + buffer[0] = 0; + else { + if (!fgets(buffer,sizeof(buffer), f)) + buffer[0] = 0; + fclose(f); + if (buffer[strlen(buffer)] == '\n') + buffer[strlen(buffer)] = 0; + } + return strlen(buffer); +} + + +/* + * Get the contents of an attribute + */ +unsigned long get_obj(char *name) +{ + if (!read_obj(name)) + return 0; + + return atol(buffer); +} + +unsigned long get_obj_and_str(char *name, char **x) +{ + unsigned long result = 0; + char *p; + + *x = NULL; + + if (!read_obj(name)) { + x = NULL; + return 0; + } + result = strtoul(buffer, &p, 10); + while (*p == ' ') + p++; + if (*p) + *x = strdup(p); + return result; +} + +void set_obj(struct slabinfo *s, char *name, int n) +{ + char x[100]; + + sprintf(x, "%s/%s", s->name, name); + + FILE *f = fopen(x, "w"); + + if (!f) + fatal("Cannot write to %s\n", x); + + fprintf(f, "%d\n", n); + fclose(f); +} + +/* + * Put a size string together + */ +int store_size(char *buffer, unsigned long value) +{ + unsigned long divisor = 1; + char trailer = 0; + int n; + + if (value > 1000000000UL) { + divisor = 100000000UL; + trailer = 'G'; + } else if 
(value > 1000000UL) { + divisor = 100000UL; + trailer = 'M'; + } else if (value > 1000UL) { + divisor = 100; + trailer = 'K'; + } + + value /= divisor; + n = sprintf(buffer, "%ld",value); + if (trailer) { + buffer[n] = trailer; + n++; + buffer[n] = 0; + } + if (divisor != 1) { + memmove(buffer + n - 2, buffer + n - 3, 4); + buffer[n-2] = '.'; + n++; + } + return n; +} + +void decode_numa_list(int *numa, char *t) +{ + int node; + int nr; + + memset(numa, 0, MAX_NODES * sizeof(int)); + + while (*t == 'N') { + t++; + node = strtoul(t, &t, 10); + if (*t == '=') { + t++; + nr = strtoul(t, &t, 10); + numa[node] = nr; + if (node > highest_node) + highest_node = node; + } + while (*t == ' ') + t++; + } +} + +void slab_validate(struct slabinfo *s) +{ + set_obj(s, "validate", 1); +} + +void slab_shrink(struct slabinfo *s) +{ + set_obj(s, "shrink", 1); +} + +int line = 0; + +void first_line(void) +{ + printf("Name Objects Objsize Space " + "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); +} + +/* + * Find the shortest alias of a slab + */ +struct aliasinfo *find_one_alias(struct slabinfo *find) +{ + struct aliasinfo *a; + struct aliasinfo *best = NULL; + + for(a = aliasinfo;a < aliasinfo + aliases; a++) { + if (a->slab == find && + (!best || strlen(best->name) < strlen(a->name))) { + best = a; + if (strncmp(a->name,"kmall", 5) == 0) + return best; + } + } + if (best) + return best; + fatal("Cannot find alias for %s\n", find->name); + return NULL; +} + +unsigned long slab_size(struct slabinfo *s) +{ + return s->slabs * (page_size << s->order); +} + + +void slabcache(struct slabinfo *s) +{ + char size_str[20]; + char dist_str[40]; + char flags[20]; + char *p = flags; + + if (skip_zero && !s->slabs) + return; + + store_size(size_str, slab_size(s)); + sprintf(dist_str,"%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs); + + if (!line++) + first_line(); + + if (s->aliases) + *p++ = '*'; + if (s->cache_dma) + *p++ = 'd'; + if (s->hwcache_align) + *p++ = 'A'; + if (s->poison) + *p++ = 'P'; + 
if (s->reclaim_account) + *p++ = 'a'; + if (s->red_zone) + *p++ = 'Z'; + if (s->sanity_checks) + *p++ = 'F'; + if (s->store_user) + *p++ = 'U'; + if (s->trace) + *p++ = 'T'; + + *p = 0; + printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n", + s->name, s->objects, s->object_size, size_str, dist_str, + s->objs_per_slab, s->order, + s->slabs ? (s->partial * 100) / s->slabs : 100, + s->slabs ? (s->objects * s->object_size * 100) / + (s->slabs * (page_size << s->order)) : 100, + flags); +} + +void slab_numa(struct slabinfo *s) +{ + int node; + + if (!highest_node) + fatal("No NUMA information available.\n"); + + if (skip_zero && !s->slabs) + return; + + if (!line) { + printf("\nSlab Node "); + for(node = 0; node <= highest_node; node++) + printf(" %4d", node); + printf("\n----------------------"); + for(node = 0; node <= highest_node; node++) + printf("-----"); + printf("\n"); + } + printf("%-21s ", s->name); + for(node = 0; node <= highest_node; node++) { + char b[20]; + + store_size(b, s->numa[node]); + printf(" %4s", b); + } + printf("\n"); + line++; +} + +void show_tracking(struct slabinfo *s) +{ + printf("\n%s: Calls to allocate a slab object\n", s->name); + printf("---------------------------------------------------\n"); + if (read_obj("alloc_calls")) + printf(buffer); + + printf("%s: Calls to free a slab object\n", s->name); + printf("-----------------------------------------------\n"); + if (read_obj("free_calls")) + printf(buffer); + +} + +void totals(void) +{ + struct slabinfo *s; + + int used_slabs = 0; + char b1[20], b2[20], b3[20], b4[20]; + unsigned long long max = 1ULL << 63; + + /* Object size */ + unsigned long long min_objsize = max, max_objsize = 0, avg_objsize; + + /* Number of partial slabs in a slabcache */ + unsigned long long min_partial = max, max_partial = 0, + avg_partial, total_partial = 0; + + /* Number of slabs in a slab cache */ + unsigned long long min_slabs = max, max_slabs = 0, + avg_slabs, total_slabs = 0; + + /* Size of the whole 
slab */ + unsigned long long min_size = max, max_size = 0, + avg_size, total_size = 0; + + /* Bytes used for object storage in a slab */ + unsigned long long min_used = max, max_used = 0, + avg_used, total_used = 0; + + /* Waste: Bytes used for alignment and padding */ + unsigned long long min_waste = max, max_waste = 0, + avg_waste, total_waste = 0; + /* Number of objects in a slab */ + unsigned long long min_objects = max, max_objects = 0, + avg_objects, total_objects = 0; + /* Waste per object */ + unsigned long long min_objwaste = max, + max_objwaste = 0, avg_objwaste, + total_objwaste = 0; + + /* Memory per object */ + unsigned long long min_memobj = max, + max_memobj = 0, avg_memobj, + total_objsize = 0; + + /* Percentage of partial slabs per slab */ + unsigned long min_ppart = 100, max_ppart = 0, + avg_ppart, total_ppart = 0; + + /* Number of objects in partial slabs */ + unsigned long min_partobj = max, max_partobj = 0, + avg_partobj, total_partobj = 0; + + /* Percentage of partial objects of all objects in a slab */ + unsigned long min_ppartobj = 100, max_ppartobj = 0, + avg_ppartobj, total_ppartobj = 0; + + + for (s = slabinfo; s < slabinfo + slabs; s++) { + unsigned long long size; + unsigned long used; + unsigned long long wasted; + unsigned long long objwaste; + long long objects_in_partial_slabs; + unsigned long percentage_partial_slabs; + unsigned long percentage_partial_objs; + + if (!s->slabs || !s->objects) + continue; + + used_slabs++; + + size = slab_size(s); + used = s->objects * s->object_size; + wasted = size - used; + objwaste = s->slab_size - s->object_size; + + objects_in_partial_slabs = s->objects - + (s->slabs - s->partial - s ->cpu_slabs) * + s->objs_per_slab; + + if (objects_in_partial_slabs < 0) + objects_in_partial_slabs = 0; + + percentage_partial_slabs = s->partial * 100 / s->slabs; + if (percentage_partial_slabs > 100) + percentage_partial_slabs = 100; + + percentage_partial_objs = objects_in_partial_slabs * 100 + / s->objects; + 
+ if (percentage_partial_objs > 100) + percentage_partial_objs = 100; + + if (s->object_size < min_objsize) + min_objsize = s->object_size; + if (s->partial < min_partial) + min_partial = s->partial; + if (s->slabs < min_slabs) + min_slabs = s->slabs; + if (size < min_size) + min_size = size; + if (wasted < min_waste) + min_waste = wasted; + if (objwaste < min_objwaste) + min_objwaste = objwaste; + if (s->objects < min_objects) + min_objects = s->objects; + if (used < min_used) + min_used = used; + if (objects_in_partial_slabs < min_partobj) + min_partobj = objects_in_partial_slabs; + if (percentage_partial_slabs < min_ppart) + min_ppart = percentage_partial_slabs; + if (percentage_partial_objs < min_ppartobj) + min_ppartobj = percentage_partial_objs; + if (s->slab_size < min_memobj) + min_memobj = s->slab_size; + + if (s->object_size > max_objsize) + max_objsize = s->object_size; + if (s->partial > max_partial) + max_partial = s->partial; + if (s->slabs > max_slabs) + max_slabs = s->slabs; + if (size > max_size) + max_size = size; + if (wasted > max_waste) + max_waste = wasted; + if (objwaste > max_objwaste) + max_objwaste = objwaste; + if (s->objects > max_objects) + max_objects = s->objects; + if (used > max_used) + max_used = used; + if (objects_in_partial_slabs > max_partobj) + max_partobj = objects_in_partial_slabs; + if (percentage_partial_slabs > max_ppart) + max_ppart = percentage_partial_slabs; + if (percentage_partial_objs > max_ppartobj) + max_ppartobj = percentage_partial_objs; + if (s->slab_size > max_memobj) + max_memobj = s->slab_size; + + total_partial += s->partial; + total_slabs += s->slabs; + total_size += size; + total_waste += wasted; + + total_objects += s->objects; + total_used += used; + total_partobj += objects_in_partial_slabs; + total_ppart += percentage_partial_slabs; + total_ppartobj += percentage_partial_objs; + + total_objwaste += s->objects * objwaste; + total_objsize += s->objects * s->slab_size; + } + + if (!total_objects) { + 
printf("No objects\n"); + return; + } + if (!used_slabs) { + printf("No slabs\n"); + return; + } + + /* Per slab averages */ + avg_partial = total_partial / used_slabs; + avg_slabs = total_slabs / used_slabs; + avg_size = total_size / used_slabs; + avg_waste = total_waste / used_slabs; + + avg_objects = total_objects / used_slabs; + avg_used = total_used / used_slabs; + avg_partobj = total_partobj / used_slabs; + avg_ppart = total_ppart / used_slabs; + avg_ppartobj = total_ppartobj / used_slabs; + + /* Per object object sizes */ + avg_objsize = total_used / total_objects; + avg_objwaste = total_objwaste / total_objects; + avg_partobj = total_partobj * 100 / total_objects; + avg_memobj = total_objsize / total_objects; + + printf("Slabcache Totals\n"); + printf("----------------\n"); + printf("Slabcaches : %3d Aliases : %3d->%-3d Active: %3d\n", + slabs, aliases, alias_targets, used_slabs); + + store_size(b1, total_size);store_size(b2, total_waste); + store_size(b3, total_waste * 100 / total_used); + printf("Memory used: %6s # Loss : %6s MRatio: %6s%%\n", b1, b2, b3); + + store_size(b1, total_objects);store_size(b2, total_partobj); + store_size(b3, total_partobj * 100 / total_objects); + printf("# Objects : %6s # PartObj: %6s ORatio: %6s%%\n", b1, b2, b3); + + printf("\n"); + printf("Per Cache Average Min Max Total\n"); + printf("---------------------------------------------------------\n"); + + store_size(b1, avg_objects);store_size(b2, min_objects); + store_size(b3, max_objects);store_size(b4, total_objects); + printf("#Objects %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_slabs);store_size(b2, min_slabs); + store_size(b3, max_slabs);store_size(b4, total_slabs); + printf("#Slabs %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_partial);store_size(b2, min_partial); + store_size(b3, max_partial);store_size(b4, total_partial); + printf("#PartSlab %10s %10s %10s %10s\n", + b1, b2, b3, b4); + store_size(b1, avg_ppart);store_size(b2, 
min_ppart); + store_size(b3, max_ppart); + store_size(b4, total_partial * 100 / total_slabs); + printf("%%PartSlab %10s%% %10s%% %10s%% %10s%%\n", + b1, b2, b3, b4); + + store_size(b1, avg_partobj);store_size(b2, min_partobj); + store_size(b3, max_partobj); + store_size(b4, total_partobj); + printf("PartObjs %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); + store_size(b3, max_ppartobj); + store_size(b4, total_partobj * 100 / total_objects); + printf("%% PartObj %10s%% %10s%% %10s%% %10s%%\n", + b1, b2, b3, b4); + + store_size(b1, avg_size);store_size(b2, min_size); + store_size(b3, max_size);store_size(b4, total_size); + printf("Memory %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_used);store_size(b2, min_used); + store_size(b3, max_used);store_size(b4, total_used); + printf("Used %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_waste);store_size(b2, min_waste); + store_size(b3, max_waste);store_size(b4, total_waste); + printf("Loss %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + printf("\n"); + printf("Per Object Average Min Max\n"); + printf("---------------------------------------------\n"); + + store_size(b1, avg_memobj);store_size(b2, min_memobj); + store_size(b3, max_memobj); + printf("Memory %10s %10s %10s\n", + b1, b2, b3); + store_size(b1, avg_objsize);store_size(b2, min_objsize); + store_size(b3, max_objsize); + printf("User %10s %10s %10s\n", + b1, b2, b3); + + store_size(b1, avg_objwaste);store_size(b2, min_objwaste); + store_size(b3, max_objwaste); + printf("Loss %10s %10s %10s\n", + b1, b2, b3); +} + +void sort_slabs(void) +{ + struct slabinfo *s1,*s2; + + for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) { + for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) { + int result; + + if (sort_size) + result = slab_size(s1) < slab_size(s2); + else + result = strcasecmp(s1->name, s2->name); + + if (show_inverted) + result = -result; + + if (result > 0) { + struct 
slabinfo t; + + memcpy(&t, s1, sizeof(struct slabinfo)); + memcpy(s1, s2, sizeof(struct slabinfo)); + memcpy(s2, &t, sizeof(struct slabinfo)); + } + } + } +} + +void sort_aliases(void) +{ + struct aliasinfo *a1,*a2; + + for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) { + for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) { + char *n1, *n2; + + n1 = a1->name; + n2 = a2->name; + if (show_alias && !show_inverted) { + n1 = a1->ref; + n2 = a2->ref; + } + if (strcasecmp(n1, n2) > 0) { + struct aliasinfo t; + + memcpy(&t, a1, sizeof(struct aliasinfo)); + memcpy(a1, a2, sizeof(struct aliasinfo)); + memcpy(a2, &t, sizeof(struct aliasinfo)); + } + } + } +} + +void link_slabs(void) +{ + struct aliasinfo *a; + struct slabinfo *s; + + for (a = aliasinfo; a < aliasinfo + aliases; a++) { + + for(s = slabinfo; s < slabinfo + slabs; s++) + if (strcmp(a->ref, s->name) == 0) { + a->slab = s; + s->refs++; + break; + } + if (s == slabinfo + slabs) + fatal("Unresolved alias %s\n", a->ref); + } +} + +void alias(void) +{ + struct aliasinfo *a; + char *active = NULL; + + sort_aliases(); + link_slabs(); + + for(a = aliasinfo; a < aliasinfo + aliases; a++) { + + if (!show_single_ref && a->slab->refs == 1) + continue; + + if (!show_inverted) { + if (active) { + if (strcmp(a->slab->name, active) == 0) { + printf(" %s", a->name); + continue; + } + } + printf("\n%-20s <- %s", a->slab->name, a->name); + active = a->slab->name; + } + else + printf("%-20s -> %s\n", a->name, a->slab->name); + } + if (active) + printf("\n"); +} + + +void rename_slabs(void) +{ + struct slabinfo *s; + struct aliasinfo *a; + + for (s = slabinfo; s < slabinfo + slabs; s++) { + if (*s->name != ':') + continue; + + if (s->refs > 1 && !show_first_alias) + continue; + + a = find_one_alias(s); + + s->name = a->name; + } +} + +int slab_mismatch(char *slab) +{ + return regexec(&pattern, slab, 0, NULL, 0); +} + +void read_slab_dir(void) +{ + DIR *dir; + struct dirent *de; + struct slabinfo *slab = slabinfo; + struct 
aliasinfo *alias = aliasinfo; + char *p; + char *t; + int count; + + dir = opendir("."); + while ((de = readdir(dir))) { + if (de->d_name[0] == '.' || + slab_mismatch(de->d_name)) + continue; + switch (de->d_type) { + case DT_LNK: + alias->name = strdup(de->d_name); + count = readlink(de->d_name, buffer, sizeof(buffer)); + + if (count < 0) + fatal("Cannot read symlink %s\n", de->d_name); + + buffer[count] = 0; + p = buffer + count; + while (p > buffer && p[-1] != '/') + p--; + alias->ref = strdup(p); + alias++; + break; + case DT_DIR: + if (chdir(de->d_name)) + fatal("Unable to access slab %s\n", slab->name); + slab->name = strdup(de->d_name); + slab->alias = 0; + slab->refs = 0; + slab->aliases = get_obj("aliases"); + slab->align = get_obj("align"); + slab->cache_dma = get_obj("cache_dma"); + slab->cpu_slabs = get_obj("cpu_slabs"); + slab->destroy_by_rcu = get_obj("destroy_by_rcu"); + slab->hwcache_align = get_obj("hwcache_align"); + slab->object_size = get_obj("object_size"); + slab->objects = get_obj("objects"); + slab->objs_per_slab = get_obj("objs_per_slab"); + slab->order = get_obj("order"); + slab->partial = get_obj("partial"); + slab->partial = get_obj_and_str("partial", &t); + decode_numa_list(slab->numa_partial, t); + slab->poison = get_obj("poison"); + slab->reclaim_account = get_obj("reclaim_account"); + slab->red_zone = get_obj("red_zone"); + slab->sanity_checks = get_obj("sanity_checks"); + slab->slab_size = get_obj("slab_size"); + slab->slabs = get_obj_and_str("slabs", &t); + decode_numa_list(slab->numa, t); + slab->store_user = get_obj("store_user"); + slab->trace = get_obj("trace"); + chdir(".."); + if (slab->name[0] == ':') + alias_targets++; + slab++; + break; + default : + fatal("Unknown file type %lx\n", de->d_type); + } + } + closedir(dir); + slabs = slab - slabinfo; + aliases = alias - aliasinfo; + if (slabs > MAX_SLABS) + fatal("Too many slabs\n"); + if (aliases > MAX_ALIASES) + fatal("Too many aliases\n"); +} + +void output_slabs(void) +{ + 
struct slabinfo *slab; + + for (slab = slabinfo; slab < slabinfo + slabs; slab++) { + + if (slab->alias) + continue; + + + if (show_numa) + slab_numa(slab); + else + if (show_track) + show_tracking(slab); + else + if (validate) + slab_validate(slab); + else + if (shrink) + slab_shrink(slab); + else { + if (show_slab) + slabcache(slab); + } + } +} + +struct option opts[] = { + { "aliases", 0, NULL, 'a' }, + { "slabs", 0, NULL, 'l' }, + { "numa", 0, NULL, 'n' }, + { "zero", 0, NULL, 'z' }, + { "help", 0, NULL, 'h' }, + { "validate", 0, NULL, 'v' }, + { "first-alias", 0, NULL, 'f' }, + { "shrink", 0, NULL, 's' }, + { "track", 0, NULL, 't'}, + { "inverted", 0, NULL, 'i'}, + { "1ref", 0, NULL, '1'}, + { NULL, 0, NULL, 0 } +}; + +int main(int argc, char *argv[]) +{ + int c; + int err; + char *pattern_source; + + page_size = getpagesize(); + if (chdir("/sys/slab")) + fatal("This kernel does not have SLUB support.\n"); + + while ((c = getopt_long(argc, argv, "afhil1npstvzTS", opts, NULL)) != -1) + switch(c) { + case '1': + show_single_ref = 1; + break; + case 'a': + show_alias = 1; + break; + case 'f': + show_first_alias = 1; + break; + case 'h': + usage(); + return 0; + case 'i': + show_inverted = 1; + break; + case 'n': + show_numa = 1; + break; + case 's': + shrink = 1; + break; + case 'l': + show_slab = 1; + break; + case 't': + show_track = 1; + break; + case 'v': + validate = 1; + break; + case 'z': + skip_zero = 0; + break; + case 'T': + show_totals = 1; + break; + case 'S': + sort_size = 1; + break; + + default: + fatal("%s: Invalid option '%c'\n", argv[0], optopt); + + } + + if (!show_slab && !show_alias && !show_track + && !validate && !shrink) + show_slab = 1; + + if (argc > optind) + pattern_source = argv[optind]; + else + pattern_source = ".*"; + + err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB); + if (err) + fatal("%s: Invalid pattern '%s' code %d\n", + argv[0], pattern_source, err); + read_slab_dir(); + if (show_alias) + alias(); + else + if 
(show_totals) + totals(); + else { + link_slabs(); + rename_slabs(); + sort_slabs(); + output_slabs(); + } + return 0; +} diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt new file mode 100644 index 00000000000..727c8d81aea --- /dev/null +++ b/Documentation/vm/slub.txt @@ -0,0 +1,113 @@ +Short users guide for SLUB +-------------------------- + +First of all slub should transparently replace SLAB. If you enable +SLUB then everything should work the same (Note the word "should". +There is likely not much value in that word at this point). + +The basic philosophy of SLUB is very different from SLAB. SLAB +requires rebuilding the kernel to activate debug options for all +SLABS. SLUB always includes full debugging but its off by default. +SLUB can enable debugging only for selected slabs in order to avoid +an impact on overall system performance which may make a bug more +difficult to find. + +In order to switch debugging on one can add a option "slub_debug" +to the kernel command line. That will enable full debugging for +all slabs. + +Typically one would then use the "slabinfo" command to get statistical +data and perform operation on the slabs. By default slabinfo only lists +slabs that have data in them. See "slabinfo -h" for more options when +running the command. slabinfo can be compiled with + +gcc -o slabinfo Documentation/vm/slabinfo.c + +Some of the modes of operation of slabinfo require that slub debugging +be enabled on the command line. F.e. no tracking information will be +available without debugging on and validation can only partially +be performed if debugging was not switched on. + +Some more sophisticated uses of slub_debug: +------------------------------------------- + +Parameters may be given to slub_debug. If none is specified then full +debugging is enabled. 
Format: + +slub_debug=<Debug-Options> Enable options for all slabs +slub_debug=<Debug-Options>,<slab name> + Enable options only for select slabs + +Possible debug options are + F Sanity checks on (enables SLAB_DEBUG_FREE. Sorry + SLAB legacy issues) + Z Red zoning + P Poisoning (object and padding) + U User tracking (free and alloc) + T Trace (please only use on single slabs) + +F.e. in order to boot just with sanity checks and red zoning one would specify: + + slub_debug=FZ + +Trying to find an issue in the dentry cache? Try + + slub_debug=,dentry_cache + +to only enable debugging on the dentry cache. + +Red zoning and tracking may realign the slab. We can just apply sanity checks +to the dentry cache with + + slub_debug=F,dentry_cache + +In case you forgot to enable debugging on the kernel command line: It is +possible to enable debugging manually when the kernel is up. Look at the +contents of: + +/sys/slab/<slab name>/ + +Look at the writable files. Writing 1 to them will enable the +corresponding debug option. All options can be set on a slab that does +not contain objects. If the slab already contains objects then sanity checks +and tracing may only be enabled. The other options may cause the realignment +of objects. + +Careful with tracing: It may spew out lots of information and never stop if +used on the wrong slab. + +SLAB Merging +------------ + +If no debugging is specified then SLUB may merge similar slabs together +in order to reduce overhead and increase cache hotness of objects. +slabinfo -a displays which slabs were merged together. + +Getting more performance +------------------------ + +To some degree SLUB's performance is limited by the need to take the +list_lock once in a while to deal with partial slabs. That overhead is +governed by the order of the allocation for each slab. 
The allocations +can be influenced by kernel parameters: + +slub_min_objects=x (default 8) +slub_min_order=x (default 0) +slub_max_order=x (default 4) + +slub_min_objects specifies how many objects must at least fit +into one slab in order for the allocation order to be acceptable. +In general slub will be able to perform this number of allocations +on a slab without consulting centralized resources (list_lock) where +contention may occur. + +slub_min_order specifies a minimum order of slabs. It has a similar effect to +slub_min_objects. + +slub_max_order specifies the order at which slub_min_objects should no +longer be checked. This is useful to avoid SLUB trying to generate +super large order pages to fit slub_min_objects of a slab cache with +large object sizes into one high order page. + + +Christoph Lameter, <clameter@sgi.com>, April 10, 2007 diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 85f51e5a749..6177d881983 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -149,7 +149,19 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup - numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. + numa=fake=CMDLINE + If a number, fakes CMDLINE nodes and ignores NUMA setup of the + actual machine. Otherwise, system memory is configured + depending on the sizes and coefficients listed. For example: + numa=fake=2*512,1024,4*256,*128 + gives two 512M nodes, a 1024M node, four 256M nodes, and the + rest split into 128M chunks. If the last character of CMDLINE + is a *, the remaining memory is divided up equally among its + coefficient: + numa=fake=2*512,2* + gives two 512M nodes and the rest split into two nodes. + Otherwise, the remaining system RAM is allocated to an + additional node.
numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/Documentation/x86_64/fake-numa-for-cpusets b/Documentation/x86_64/fake-numa-for-cpusets new file mode 100644 index 00000000000..d1a985c5b00 --- /dev/null +++ b/Documentation/x86_64/fake-numa-for-cpusets @@ -0,0 +1,66 @@ +Using numa=fake and CPUSets for Resource Management +Written by David Rientjes <rientjes@cs.washington.edu> + +This document describes how the numa=fake x86_64 command-line option can be used +in conjunction with cpusets for coarse memory management. Using this feature, +you can create fake NUMA nodes that represent contiguous chunks of memory and +assign them to cpusets and their attached tasks. This is a way of limiting the +amount of system memory that is available to a certain class of tasks. + +For more information on the features of cpusets, see Documentation/cpusets.txt. +There are a number of different configurations you can use for your needs. For +more information on the numa=fake command line option and its various ways of +configuring fake nodes, see Documentation/x86_64/boot-options.txt. + +For the purposes of this introduction, we'll assume a very primitive NUMA +emulation setup of "numa=fake=4*512,". This will split our system memory into +four equal chunks of 512M each that we can now use to assign to cpusets. As +you become more familiar with using this combination for resource control, +you'll determine a better setup to minimize the number of nodes you have to deal +with. + +A machine may be split as follows with "numa=fake=4*512," as reported by dmesg: + + Faking node 0 at 0000000000000000-0000000020000000 (512MB) + Faking node 1 at 0000000020000000-0000000040000000 (512MB) + Faking node 2 at 0000000040000000-0000000060000000 (512MB) + Faking node 3 at 0000000060000000-0000000080000000 (512MB) + ... 
+ On node 0 totalpages: 130975 + On node 1 totalpages: 131072 + On node 2 totalpages: 131072 + On node 3 totalpages: 131072 + +Now following the instructions for mounting the cpusets filesystem from +Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory +address spaces) to individual cpusets: + + [root@xroads /]# mkdir exampleset + [root@xroads /]# mount -t cpuset none exampleset + [root@xroads /]# mkdir exampleset/ddset + [root@xroads /]# cd exampleset/ddset + [root@xroads /exampleset/ddset]# echo 0-1 > cpus + [root@xroads /exampleset/ddset]# echo 0-1 > mems + +Now this cpuset, 'ddset', will only be allowed access to fake nodes 0 and 1 for +memory allocations (1G). + +You can now assign tasks to these cpusets to limit the memory resources +available to them according to the fake nodes assigned as mems: + + [root@xroads /exampleset/ddset]# echo $$ > tasks + [root@xroads /exampleset/ddset]# dd if=/dev/zero of=tmp bs=1024 count=1G + [1] 13425 + +Notice the difference between the system memory usage as reported by +/proc/meminfo between the restricted cpuset case above and the unrestricted +case (i.e. running the same 'dd' command without assigning it to a fake NUMA +cpuset): + Unrestricted Restricted + MemTotal: 3091900 kB 3091900 kB + MemFree: 42113 kB 1513236 kB + +This allows for coarse memory management for the tasks you assign to particular +cpusets. Since cpusets can form a hierarchy, you can create some pretty +interesting combinations of use-cases for various classes of tasks for your +memory management needs. diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck index 068a6d9904b..feaeaf6f6e4 100644 --- a/Documentation/x86_64/machinecheck +++ b/Documentation/x86_64/machinecheck @@ -36,7 +36,12 @@ between all CPUs. check_interval How often to poll for corrected machine check errors, in seconds - (Note output is hexademical). Default 5 minutes. + (Note output is hexadecimal). Default 5 minutes. 
When the poller + finds MCEs it triggers an exponential speedup (poll more often) on + the polling interval. When the poller stops finding MCEs, it + triggers an exponential backoff (poll less often) on the polling + interval. The check_interval variable is both the initial and + maximum polling interval. tolerant Tolerance level. When a machine check exception occurs for a non |