diff options
508 files changed, 9801 insertions, 3351 deletions
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 8e145857fc9..df0d089d0fb 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -568,7 +568,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip) <para> The blocks in which the tables are stored are procteted against accidental access by marking them bad in the memory bad block - table. The bad block table managment functions are allowed + table. The bad block table management functions are allowed to circumvernt this protection. </para> <para> diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl index 10a150ae2a7..d87f4569e76 100644 --- a/Documentation/DocBook/scsi.tmpl +++ b/Documentation/DocBook/scsi.tmpl @@ -317,7 +317,7 @@ <para> The SAS transport class contains common code to deal with SAS HBAs, an aproximated representation of SAS topologies in the driver model, - and various sysfs attributes to expose these topologies and managment + and various sysfs attributes to expose these topologies and management interfaces to userspace. </para> <para> diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index 5c555a8b39e..b7f9d3b4bbf 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -183,7 +183,7 @@ the MAN-PAGES maintainer (as listed in the MAINTAINERS file) a man-pages patch, or at least a notification of the change, so that some information makes its way into the manual pages. -Even if the maintainer did not respond in step #4, make sure to ALWAYS +Even if the maintainer did not respond in step #5, make sure to ALWAYS copy the maintainer when you change their code. For small patches you may want to CC the Trivial Patch Monkey diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt index 05d81cbcb2e..5920fe26e6f 100644 --- a/Documentation/filesystems/nfs41-server.txt +++ b/Documentation/filesystems/nfs41-server.txt @@ -11,6 +11,11 @@ the /proc/fs/nfsd/versions control file. Note that to write this control file, the nfsd service must be taken down. Use your user-mode nfs-utils to set this up; see rpc.nfsd(8) +(Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and +"-4", respectively. Therefore, code meant to work on both new and old +kernels must turn 4.1 on or off *before* turning support for version 4 +on or off; rpc.nfsd does this correctly.) + The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based on the latest NFSv4.1 Internet Draft: http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 @@ -25,6 +30,49 @@ are still under development out of tree. See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design for more information. +The current implementation is intended for developers only: while it +does support ordinary file operations on clients we have tested against +(including the linux client), it is incomplete in ways which may limit +features unexpectedly, cause known bugs in rare cases, or cause +interoperability problems with future clients. Known issues: + + - gss support is questionable: currently mounts with kerberos + from a linux client are possible, but we aren't really + conformant with the spec (for example, we don't use kerberos + on the backchannel correctly). + - no trunking support: no clients currently take advantage of + trunking, but this is a mandatory failure, and its use is + recommended to clients in a number of places. (E.g. to ensure + timely renewal in case an existing connection's retry timeouts + have gotten too long; see section 8.3 of the draft.) + Therefore, lack of this feature may cause future clients to + fail. + - Incomplete backchannel support: incomplete backchannel gss + support and no support for BACKCHANNEL_CTL mean that + callbacks (hence delegations and layouts) may not be + available and clients confused by the incomplete + implementation may fail. + - Server reboot recovery is unsupported; if the server reboots, + clients may fail. + - We do not support SSV, which provides security for shared + client-server state (thus preventing unauthorized tampering + with locks and opens, for example). It is mandatory for + servers to support this, though no clients use it yet. + - Mandatory operations which we do not support, such as + DESTROY_CLIENTID, FREE_STATEID, SECINFO_NO_NAME, and + TEST_STATEID, are not currently used by clients, but will be + (and the spec recommends their uses in common cases), and + clients should not be expected to know how to recover from the + case where they are not supported. This will eventually cause + interoperability failures. + +In addition, some limitations are inherited from the current NFSv4 +implementation: + + - Incomplete delegation enforcement: if a file is renamed or + unlinked, a client holding a delegation may continue to + indefinitely allow opens of the file under the old name. + The table below, taken from the NFSv4.1 document, lists the operations that are mandatory to implement (REQ), optional (OPT), and NFSv4.0 operations that are required not to implement (MNI) @@ -142,6 +190,12 @@ NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 | Implementation notes: +DELEGPURGE: +* mandatory only for servers that support CLAIM_DELEGATE_PREV and/or + CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that + persist across client reboots). Thus we need not implement this for + now. + EXCHANGE_ID: * only SP4_NONE state protection supported * implementation ids are ignored diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfsroot.txt index 68baddf3c3e..3ba0b945aaf 100644 --- a/Documentation/filesystems/nfsroot.txt +++ b/Documentation/filesystems/nfsroot.txt @@ -105,7 +105,7 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf> the client address and this parameter is NOT empty only replies from the specified server are accepted. - Only required for for NFS root. That is autoconfiguration + Only required for NFS root. That is autoconfiguration will not be triggered if it is missing and NFS root is not in operation. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ffead13f944..75988ba26a5 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -375,6 +375,19 @@ of memory currently marked as referenced or accessed. This file is only present if the CONFIG_MMU kernel configuration option is enabled. +The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG +bits on both physical and virtual pages associated with a process. +To clear the bits for all the pages associated with the process + > echo 1 > /proc/PID/clear_refs + +To clear the bits for the anonymous pages associated with the process + > echo 2 > /proc/PID/clear_refs + +To clear the bits for the file mapped pages associated with the process + > echo 3 > /proc/PID/clear_refs +Any other value written to /proc/PID/clear_refs will have no effect. + + 1.2 Kernel data --------------- @@ -1032,9 +1045,9 @@ Various pieces of information about kernel activity are available in the since the system first booted. For a quick look, simply cat the file: > cat /proc/stat - cpu 2255 34 2290 22625563 6290 127 456 0 - cpu0 1132 34 1441 11311718 3675 127 438 0 - cpu1 1123 0 849 11313845 2614 0 18 0 + cpu 2255 34 2290 22625563 6290 127 456 0 0 + cpu0 1132 34 1441 11311718 3675 127 438 0 0 + cpu1 1123 0 849 11313845 2614 0 18 0 0 intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] ctxt 1990473 btime 1062191376 @@ -1056,6 +1069,7 @@ second). The meanings of the columns are as follows, from left to right: - irq: servicing interrupts - softirq: servicing softirqs - steal: involuntary wait +- guest: running a guest The "intr" line gives counts of interrupts serviced since boot time, for each of the possible system interrupts. The first column is the total of all @@ -1191,7 +1205,7 @@ The following heuristics are then applied: * if the task was reniced, its score doubles * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE or CAP_SYS_RAWIO) have their score divided by 4 - * if oom condition happened in one cpuset and checked task does not belong + * if oom condition happened in one cpuset and checked process does not belong to it, its score is divided by 8 * the resulting score is multiplied by two to the power of oom_adj, i.e. points <<= oom_adj when it is positive and diff --git a/Documentation/gcov.txt b/Documentation/gcov.txt index 40ec6335276..e7ca6478cd9 100644 --- a/Documentation/gcov.txt +++ b/Documentation/gcov.txt @@ -47,7 +47,7 @@ Possible uses: Configure the kernel with: - CONFIG_DEBUGFS=y + CONFIG_DEBUG_FS=y CONFIG_GCOV_KERNEL=y and to get coverage data for the entire kernel: diff --git a/Documentation/hwmon/hpfall.c b/Documentation/hwmon/hpfall.c index bbea1ccfd46..681ec22b9d0 100644 --- a/Documentation/hwmon/hpfall.c +++ b/Documentation/hwmon/hpfall.c @@ -16,6 +16,34 @@ #include <stdint.h> #include <errno.h> #include <signal.h> +#include <sys/mman.h> +#include <sched.h> + +char unload_heads_path[64]; + +int set_unload_heads_path(char *device) +{ + char devname[64]; + + if (strlen(device) <= 5 || strncmp(device, "/dev/", 5) != 0) + return -EINVAL; + strncpy(devname, device + 5, sizeof(devname)); + + snprintf(unload_heads_path, sizeof(unload_heads_path), + "/sys/block/%s/device/unload_heads", devname); + return 0; +} +int valid_disk(void) +{ + int fd = open(unload_heads_path, O_RDONLY); + if (fd < 0) { + perror(unload_heads_path); + return 0; + } + + close(fd); + return 1; +} void write_int(char *path, int i) { @@ -40,7 +68,7 @@ void set_led(int on) void protect(int seconds) { - write_int("/sys/block/sda/device/unload_heads", seconds*1000); + write_int(unload_heads_path, seconds*1000); } int on_ac(void) @@ -57,45 +85,62 @@ void ignore_me(void) { protect(0); set_led(0); - } -int main(int argc, char* argv[]) +int main(int argc, char **argv) { - int fd, ret; + int fd, ret; + struct sched_param param; + + if (argc == 1) + ret = set_unload_heads_path("/dev/sda"); + else if (argc == 2) + ret = set_unload_heads_path(argv[1]); + else + ret = -EINVAL; + + if (ret || !valid_disk()) { + fprintf(stderr, "usage: %s <device> (default: /dev/sda)\n", + argv[0]); + exit(1); + } + + fd = open("/dev/freefall", O_RDONLY); + if (fd < 0) { + perror("/dev/freefall"); + return EXIT_FAILURE; + } - fd = open("/dev/freefall", O_RDONLY); - if (fd < 0) { - perror("open"); - return EXIT_FAILURE; - } + daemon(0, 0); + param.sched_priority = sched_get_priority_max(SCHED_FIFO); + sched_setscheduler(0, SCHED_FIFO, ¶m); + mlockall(MCL_CURRENT|MCL_FUTURE); signal(SIGALRM, ignore_me); - for (;;) { - unsigned char count; - - ret = read(fd, &count, sizeof(count)); - alarm(0); - if ((ret == -1) && (errno == EINTR)) { - /* Alarm expired, time to unpark the heads */ - continue; - } - - if (ret != sizeof(count)) { - perror("read"); - break; - } - - protect(21); - set_led(1); - if (1 || on_ac() || lid_open()) { - alarm(2); - } else { - alarm(20); - } - } - - close(fd); - return EXIT_SUCCESS; + for (;;) { + unsigned char count; + + ret = read(fd, &count, sizeof(count)); + alarm(0); + if ((ret == -1) && (errno == EINTR)) { + /* Alarm expired, time to unpark the heads */ + continue; + } + + if (ret != sizeof(count)) { + perror("read"); + break; + } + + protect(21); + set_led(1); + if (1 || on_ac() || lid_open()) + alarm(2); + else + alarm(20); + } + + close(fd); + return EXIT_SUCCESS; } diff --git a/Documentation/hwmon/pc87427 b/Documentation/hwmon/pc87427 index d1ebbe510f3..db5cc1227a8 100644 --- a/Documentation/hwmon/pc87427 +++ b/Documentation/hwmon/pc87427 @@ -34,5 +34,5 @@ Fan rotation speeds are reported as 14-bit values from a gated clock signal. Speeds down to 83 RPM can be measured. An alarm is triggered if the rotation speed drops below a programmable -limit. Another alarm is triggered if the speed is too low to to be measured +limit. Another alarm is triggered if the speed is too low to be measured (including stalled or missing fan). diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0f17d16dc10..c363840cdce 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -933,7 +933,7 @@ and is between 256 and 4096 characters. It is defined in the file 1 -- enable informational integrity auditing messages. ima_hash= [IMA] - Formt: { "sha1" | "md5" } + Format: { "sha1" | "md5" } default: "sha1" ima_tcb [IMA] diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt index 363044609da..c28f82895d6 100644 --- a/Documentation/kmemcheck.txt +++ b/Documentation/kmemcheck.txt @@ -43,26 +43,7 @@ feature. 1. Downloading ============== -kmemcheck can only be downloaded using git. If you want to write patches -against the current code, you should use the kmemcheck development branch of -the tip tree. It is also possible to use the linux-next tree, which also -includes the latest version of kmemcheck. - -Assuming that you've already cloned the linux-2.6.git repository, all you -have to do is add the -tip tree as a remote, like this: - - $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git - -To actually download the tree, fetch the remote: - - $ git fetch tip - -And to check out a new local branch with the kmemcheck code: - - $ git checkout -b kmemcheck tip/kmemcheck - -General instructions for the -tip tree can be found here: -http://people.redhat.com/mingo/tip.git/readme.txt +As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel. 2. Configuring and compiling diff --git a/Documentation/memory.txt b/Documentation/memory.txt index 2b3dedd3953..802efe58647 100644 --- a/Documentation/memory.txt +++ b/Documentation/memory.txt @@ -1,18 +1,7 @@ There are several classic problems related to memory on Linux systems. - 1) There are some buggy motherboards which cannot properly - deal with the memory above 16MB. Consider exchanging - your motherboard. - - 2) You cannot do DMA on the ISA bus to addresses above - 16M. Most device drivers under Linux allow the use - of bounce buffers which work around this problem. Drivers - that don't use bounce buffers will be unstable with - more than 16M installed. Drivers that use bounce buffers - will be OK, but may have slightly higher overhead. - - 3) There are some motherboards that will not cache above + 1) There are some motherboards that will not cache above a certain quantity of memory. If you have one of these motherboards, your system will be SLOWER, not faster as you add more memory. Consider exchanging your @@ -24,7 +13,7 @@ It can also tell Linux to use less memory than is actually installed. If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid physical address space collisions. -See the documentation of your boot loader (LILO, loadlin, etc.) about +See the documentation of your boot loader (LILO, grub, loadlin, etc.) about how to pass options to the kernel. There are other memory problems which Linux cannot deal with. Random @@ -42,19 +31,3 @@ Try: with the vendor. Consider testing it with memtest86 yourself. * Exchanging your CPU, cache, or motherboard for one that works. - - * Disabling the cache from the BIOS. - - * Try passing the "mem=4M" option to the kernel to limit - Linux to using a very small amount of memory. Use "memmap="-option - together with "mem=" on systems with PCI to avoid physical address - space collisions. - - -Other tricks: - - * Try passing the "no-387" option to the kernel to ignore - a buggy FPU. - - * Try passing the "no-hlt" option to disable the potentially - buggy HLT instruction in your CPU. diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt index eaa1a25946c..ee31369e9e5 100644 --- a/Documentation/networking/regulatory.txt +++ b/Documentation/networking/regulatory.txt @@ -96,7 +96,7 @@ Example code - drivers hinting an alpha2: This example comes from the zd1211rw device driver. You can start by having a mapping of your device's EEPROM country/regulatory -domain value to to a specific alpha2 as follows: +domain value to a specific alpha2 as follows: static struct zd_reg_alpha2_map reg_alpha2_map[] = { { ZD_REGDOMAIN_FCC, "US" }, diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt index 80133ace1eb..9fcc9a608dc 100644 --- a/Documentation/numastat.txt +++ b/Documentation/numastat.txt @@ -7,10 +7,10 @@ All units are pages. Hugepages have separate counters. numa_hit A process wanted to allocate memory from this node, and succeeded. -numa_miss A process wanted to allocate memory from this node, - but ended up with memory from another. -numa_foreign A process wanted to allocate on another node, - but ended up with memory from this one. +numa_miss A process wanted to allocate memory from another node, + but ended up with memory from this node. +numa_foreign A process wanted to allocate on this node, + but ended up with memory from another one. local_node A process ran on this node and got memory from it. other_node A process ran on this node and got memory from another node. interleave_hit Interleaving wanted to allocate from this node diff --git a/Documentation/powerpc/dts-bindings/marvell.txt b/Documentation/powerpc/dts-bindings/marvell.txt index 3708a2fd474..f1533d91953 100644 --- a/Documentation/powerpc/dts-bindings/marvell.txt +++ b/Documentation/powerpc/dts-bindings/marvell.txt @@ -32,7 +32,7 @@ prefixed with the string "marvell,", for Marvell Technology Group Ltd. devices. This field represents the number of cells needed to represent the address of the memory-mapped registers of devices within the system controller chip. - - #size-cells : Size representation for for the memory-mapped + - #size-cells : Size representation for the memory-mapped registers within the system controller chip. - #interrupt-cells : Defines the width of cells used to represent interrupts. diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid index eaa4801f2ce..38e9e7cadc9 100644 --- a/Documentation/scsi/ChangeLog.megaraid +++ b/Documentation/scsi/ChangeLog.megaraid @@ -514,7 +514,7 @@ iv. Remove yield() while mailbox handshake in synchronous commands v. Remove redundant __megaraid_busywait_mbox routine -vi. Fix bug in the managment module, which causes a system lockup when the +vi. Fix bug in the management module, which causes a system lockup when the IO module is loaded and then unloaded, followed by executing any management utility. The current version of management module does not handle the adapter unregister properly. diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt index d7f181701dc..aec6549ab09 100644 --- a/Documentation/scsi/scsi_fc_transport.txt +++ b/Documentation/scsi/scsi_fc_transport.txt @@ -378,7 +378,7 @@ Vport Disable/Enable: int vport_disable(struct fc_vport *vport, bool disable) where: - vport: Is vport to to be enabled or disabled + vport: Is vport to be enabled or disabled disable: If "true", the vport is to be disabled. If "false", the vport is to be enabled. diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index 97eebd63bed..f1708b79f96 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt @@ -387,7 +387,7 @@ STAC92HD73* STAC92HD83* =========== ref Reference board - mic-ref Reference board with power managment for ports + mic-ref Reference board with power management for ports dell-s14 Dell laptop auto BIOS setup (default) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 2dbff53369d..3e5b63ebb82 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -319,25 +319,29 @@ This option can be used to select the type of process address space randomization that is used in the system, for architectures that support this feature. -0 - Turn the process address space randomization off by default. +0 - Turn the process address space randomization off. This is the + default for architectures that do not support this feature anyways, + and kernels that are booted with the "norandmaps" parameter. 1 - Make the addresses of mmap base, stack and VDSO page randomized. This, among other things, implies that shared libraries will be - loaded to random addresses. Also for PIE-linked binaries, the location - of code start is randomized. + loaded to random addresses. Also for PIE-linked binaries, the + location of code start is randomized. This is the default if the + CONFIG_COMPAT_BRK option is enabled. - With heap randomization, the situation is a little bit more - complicated. - There a few legacy applications out there (such as some ancient +2 - Additionally enable heap randomization. This is the default if + CONFIG_COMPAT_BRK is disabled. + + There are a few legacy applications out there (such as some ancient versions of libc.so.5 from 1996) that assume that brk area starts - just after the end of the code+bss. These applications break when - start of the brk area is randomized. There are however no known + just after the end of the code+bss. These applications break when + start of the brk area is randomized. There are however no known non-legacy applications that would be broken this way, so for most - systems it is safe to choose full randomization. However there is - a CONFIG_COMPAT_BRK option for systems with ancient and/or broken - binaries, that makes heap non-randomized, but keeps all other - parts of process address space randomized if randomize_va_space - sysctl is turned on. + systems it is safe to choose full randomization. + + Systems with ancient and/or broken binaries should be configured + with CONFIG_COMPAT_BRK enabled, which excludes the heap from process + address space randomization. ============================================================== diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index c4de6359d44..e6fb1ec2744 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -585,7 +585,9 @@ caching of directory and inode objects. At the default value of vfs_cache_pressure=100 the kernel will attempt to reclaim dentries and inodes at a "fair" rate with respect to pagecache and swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer -to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 +to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will +never reclaim dentries and inodes due to memory pressure and this can easily +lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100 causes the kernel to prefer to reclaim dentries and inodes. ============================================================== diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt new file mode 100644 index 00000000000..6ef2a8652e1 --- /dev/null +++ b/Documentation/trace/events-kmem.txt @@ -0,0 +1,107 @@ + Subsystem Trace Points: kmem + +The tracing system kmem captures events related to object and page allocation +within the kernel. Broadly speaking there are four major subheadings. + + o Slab allocation of small objects of unknown type (kmalloc) + o Slab allocation of small objects of known type + o Page allocation + o Per-CPU Allocator Activity + o External Fragmentation + +This document will describe what each of the tracepoints are and why they +might be useful. + +1. Slab allocation of small objects of unknown type +=================================================== +kmalloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s +kmalloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d +kfree call_site=%lx ptr=%p + +Heavy activity for these events may indicate that a specific cache is +justified, particularly if kmalloc slab pages are getting significantly +internal fragmented as a result of the allocation pattern. By correlating +kmalloc with kfree, it may be possible to identify memory leaks and where +the allocation sites were. + + +2. Slab allocation of small objects of known type +================================================= +kmem_cache_alloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s +kmem_cache_alloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d +kmem_cache_free call_site=%lx ptr=%p + +These events are similar in usage to the kmalloc-related events except that +it is likely easier to pin the event down to a specific cache. At the time +of writing, no information is available on what slab is being allocated from, +but the call_site can usually be used to extrapolate that information + +3. Page allocation +================== +mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s +mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d +mm_page_free_direct page=%p pfn=%lu order=%d +mm_pagevec_free page=%p pfn=%lu order=%d cold=%d + +These four events deal with page allocation and freeing. mm_page_alloc is +a simple indicator of page allocator activity. Pages may be allocated from +the per-CPU allocator (high performance) or the buddy allocator. + +If pages are allocated directly from the buddy allocator, the +mm_page_alloc_zone_locked event is triggered. This event is important as high +amounts of activity imply high activity on the zone->lock. Taking this lock +impairs performance by disabling interrupts, dirtying cache lines between +CPUs and serialising many CPUs. + +When a page is freed directly by the caller, the mm_page_free_direct event +is triggered. Significant amounts of activity here could indicate that the +callers should be batching their activities. + +When pages are freed using a pagevec, the mm_pagevec_free is +triggered. Broadly speaking, pages are taken off the LRU lock in bulk and +freed in batch with a pagevec. Significant amounts of activity here could +indicate that the system is under memory pressure and can also indicate +contention on the zone->lru_lock. + +4. Per-CPU Allocator Activity +============================= +mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d +mm_page_pcpu_drain page=%p pfn=%lu order=%d cpu=%d migratetype=%d + +In front of the page allocator is a per-cpu page allocator. It exists only +for order-0 pages, reduces contention on the zone->lock and reduces the +amount of writing on struct page. + +When a per-CPU list is empty or pages of the wrong type are allocated, +the zone->lock will be taken once and the per-CPU list refilled. The event +triggered is mm_page_alloc_zone_locked for each page allocated with the +event indicating whether it is for a percpu_refill or not. + +When the per-CPU list is too full, a number of pages are freed, each one +which triggers a mm_page_pcpu_drain event. + +The individual nature of the events are so that pages can be tracked +between allocation and freeing. A number of drain or refill pages that occur +consecutively imply the zone->lock being taken once. Large amounts of PCP +refills and drains could imply an imbalance between CPUs where too much work +is being concentrated in one place. It could also indicate that the per-CPU +lists should be a larger size. Finally, large amounts of refills on one CPU +and drains on another could be a factor in causing large amounts of cache +line bounces due to writes between CPUs and worth investigating if pages +can be allocated and freed on the same CPU through some algorithm change. + +5. External Fragmentation +========================= +mm_page_alloc_extfrag page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d + +External fragmentation affects whether a high-order allocation will be +successful or not. For some types of hardware, this is important although +it is avoided where possible. If the system is using huge pages and needs +to be able to resize the pool over the lifetime of the system, this value +is important. + +Large numbers of this event implies that memory is fragmenting and +high-order allocations will start failing at some time in the future. One +means of reducing the occurange of this event is to increase the size of +min_free_kbytes in increments of 3*pageblock_size*nr_online_nodes where +pageblock_size is usually the size of the default hugepage size. diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index 78c45a87be5..02ac6ed38b2 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -72,7 +72,7 @@ To enable all events in sched subsystem: # echo 1 > /sys/kernel/debug/tracing/events/sched/enable -To eanble all events: +To enable all events: # echo 1 > /sys/kernel/debug/tracing/events/enable diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 1b6292bbdd6..957b22fde2d 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -133,7 +133,7 @@ of ftrace. Here is a list of some of the key files: than requested, the rest of the page will be used, making the actual allocation bigger than requested. ( Note, the size may not be a multiple of the page size - due to buffer managment overhead. ) + due to buffer management overhead. ) This can only be updated when the current_tracer is set to "nop". diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl new file mode 100644 index 00000000000..7df50e8cf4d --- /dev/null +++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl @@ -0,0 +1,418 @@ +#!/usr/bin/perl +# This is a POC (proof of concept or piece of crap, take your pick) for reading the +# text representation of trace output related to page allocation. It makes an attempt +# to extract some high-level information on what is going on. The accuracy of the parser +# may vary considerably +# +# Example usage: trace-pagealloc-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe +# other options +# --prepend-parent Report on the parent proc and PID +# --read-procstat If the trace lacks process info, get it from /proc +# --ignore-pid Aggregate processes of the same name together +# +# Copyright (c) IBM Corporation 2009 +# Author: Mel Gorman <mel@csn.ul.ie> +use strict; +use Getopt::Long; + +# Tracepoint events +use constant MM_PAGE_ALLOC => 1; +use constant MM_PAGE_FREE_DIRECT => 2; +use constant MM_PAGEVEC_FREE => 3; +use constant MM_PAGE_PCPU_DRAIN => 4; +use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5; +use constant MM_PAGE_ALLOC_EXTFRAG => 6; +use constant EVENT_UNKNOWN => 7; + +# Constants used to track state +use constant STATE_PCPU_PAGES_DRAINED => 8; +use constant STATE_PCPU_PAGES_REFILLED => 9; + +# High-level events extrapolated from tracepoints +use constant HIGH_PCPU_DRAINS => 10; +use constant HIGH_PCPU_REFILLS => 11; +use constant HIGH_EXT_FRAGMENT => 12; +use constant HIGH_EXT_FRAGMENT_SEVERE => 13; +use constant HIGH_EXT_FRAGMENT_MODERATE => 14; +use constant HIGH_EXT_FRAGMENT_CHANGED => 15; + +my %perprocesspid; +my %perprocess; +my $opt_ignorepid; +my $opt_read_procstat; +my $opt_prepend_parent; + +# Catch sigint and exit on request +my $sigint_report = 0; +my $sigint_exit = 0; +my $sigint_pending = 0; +my $sigint_received = 0; +sub sigint_handler { + my $current_time = time; + if ($current_time - 2 > $sigint_received) { + print "SIGINT received, report pending. Hit ctrl-c again to exit\n"; + $sigint_report = 1; + } else { + if (!$sigint_exit) { + print "Second SIGINT received quickly, exiting\n"; + } + $sigint_exit++; + } + + if ($sigint_exit > 3) { + print "Many SIGINTs received, exiting now without report\n"; + exit; + } + + $sigint_received = $current_time; + $sigint_pending = 1; +} +$SIG{INT} = "sigint_handler"; + +# Parse command line options +GetOptions( + 'ignore-pid' => \$opt_ignorepid, + 'read-procstat' => \$opt_read_procstat, + 'prepend-parent' => \$opt_prepend_parent, +); + +# Defaults for dynamically discovered regex's +my $regex_fragdetails_default = 'page=([0-9a-f]*) pfn=([0-9]*) alloc_order=([-0-9]*) fallback_order=([-0-9]*) pageblock_order=([-0-9]*) alloc_migratetype=([-0-9]*) fallback_migratetype=([-0-9]*) fragmenting=([-0-9]) change_ownership=([-0-9])'; + +# Dyanically discovered regex +my $regex_fragdetails; + +# Static regex used. Specified like this for readability and for use with /o +# (process_pid) (cpus ) ( time ) (tpoint ) (details) +my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)'; +my $regex_statname = '[-0-9]*\s\((.*)\).*'; +my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*'; + +sub generate_traceevent_regex { + my $event = shift; + my $default = shift; + my $regex; + + # Read the event format or use the default + if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) { + $regex = $default; + } else { + my $line; + while (!eof(FORMAT)) { + $line = <FORMAT>; + if ($line =~ /^print fmt:\s"(.*)",.*/) { + $regex = $1; + $regex =~ s/%p/\([0-9a-f]*\)/g; + $regex =~ s/%d/\([-0-9]*\)/g; + $regex =~ s/%lu/\([0-9]*\)/g; + } + } + } + + # Verify fields are in the right order + my $tuple; + foreach $tuple (split /\s/, $regex) { + my ($key, $value) = split(/=/, $tuple); + my $expected = shift; + if ($key ne $expected) { + print("WARNING: Format not as expected '$key' != '$expected'"); + $regex =~ s/$key=\((.*)\)/$key=$1/; + } + } + + if (defined shift) { + die("Fewer fields than expected in format"); + } + + return $regex; +} +$regex_fragdetails = generate_traceevent_regex("kmem/mm_page_alloc_extfrag", + $regex_fragdetails_default, + "page", "pfn", + "alloc_order", "fallback_order", "pageblock_order", + "alloc_migratetype", "fallback_migratetype", + "fragmenting", "change_ownership"); + +sub read_statline($) { + my $pid = $_[0]; + my $statline; + + if (open(STAT, "/proc/$pid/stat")) { + $statline = <STAT>; + close(STAT); + } + + if ($statline eq '') { + $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0"; + } + + return $statline; +} + +sub guess_process_pid($$) { + my $pid = $_[0]; + my $statline = $_[1]; + + if ($pid == 0) { + return "swapper-0"; + } + + if ($statline !~ /$regex_statname/o) { + die("Failed to math stat line for process name :: $statline"); + } + return "$1-$pid"; +} + +sub parent_info($$) { + my $pid = $_[0]; + my $statline = $_[1]; + my $ppid; + + if ($pid == 0) { + return "NOPARENT-0"; + } + + if ($statline !~ /$regex_statppid/o) { + die("Failed to match stat line process ppid:: $statline"); + } + + # Read the ppid stat line + $ppid = $1; + return guess_process_pid($ppid, read_statline($ppid)); +} + +sub process_events { + my $traceevent; + my $process_pid; + my $cpus; + my $timestamp; + my $tracepoint; + my $details; + my $statline; + + # Read each line of the event log +EVENT_PROCESS: + while ($traceevent = <STDIN>) { + if ($traceevent =~ /$regex_traceevent/o) { + $process_pid = $1; + $tracepoint = $4; + + if ($opt_read_procstat || $opt_prepend_parent) { + $process_pid =~ /(.*)-([0-9]*)$/; + my $process = $1; + my $pid = $2; + + $statline = read_statline($pid); + + if ($opt_read_procstat && $process eq '') { + $process_pid = guess_process_pid($pid, $statline); + } + + if ($opt_prepend_parent) { + $process_pid = parent_info($pid, $statline) . " :: $process_pid"; + } + } + + # Unnecessary in this script. Uncomment if required + # $cpus = $2; + # $timestamp = $3; + } else { + next; + } + + # Perl Switch() sucks majorly + if ($tracepoint eq "mm_page_alloc") { + $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++; + } elsif ($tracepoint eq "mm_page_free_direct") { + $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++; + } elsif ($tracepoint eq "mm_pagevec_free") { + $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++; + } elsif ($tracepoint eq "mm_page_pcpu_drain") { + $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++; + $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++; + } elsif ($tracepoint eq "mm_page_alloc_zone_locked") { + $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}++; + $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED}++; + } elsif ($tracepoint eq "mm_page_alloc_extfrag") { + + # Extract the details of the event now + $details = $5; + + my ($page, $pfn); + my ($alloc_order, $fallback_order, $pageblock_order); + my ($alloc_migratetype, $fallback_migratetype); + my ($fragmenting, $change_ownership); + + if ($details !~ /$regex_fragdetails/o) { + print "WARNING: Failed to parse mm_page_alloc_extfrag as expected\n"; + next; + } + + $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}++; + $page = $1; + $pfn = $2; + $alloc_order = $3; + $fallback_order = $4; + $pageblock_order = $5; + $alloc_migratetype = $6; + $fallback_migratetype = $7; + $fragmenting = $8; + $change_ownership = $9; + + if ($fragmenting) { + $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}++; + if ($fallback_order <= 3) { + $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}++; + } else { + $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}++; + } + } + if ($change_ownership) { + $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}++; + } + } else { + $perprocesspid{$process_pid}->{EVENT_UNKNOWN}++; + } + + # Catch a full pcpu drain event + if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} && + $tracepoint ne "mm_page_pcpu_drain") { + + $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}++; + $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0; + } + + # Catch a full pcpu refill event + if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} && + $tracepoint ne "mm_page_alloc_zone_locked") { + $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}++; + $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0; + } + + if ($sigint_pending) { + last EVENT_PROCESS; + } + } +} + +sub dump_stats { + my $hashref = shift; + my %stats = %$hashref; + + # Dump per-process stats + my $process_pid; + my $max_strlen = 0; + + # Get the maximum process name + foreach $process_pid (keys %perprocesspid) { + my $len = length($process_pid); + if ($len > $max_strlen) { + $max_strlen = $len; + } + } + $max_strlen += 2; + + printf("\n"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", + "Process", "Pages", "Pages", "Pages", "Pages", "PCPU", "PCPU", "PCPU", "Fragment", "Fragment", "MigType", "Fragment", "Fragment", "Unknown"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", + "details", "allocd", "allocd", "freed", "freed", "pages", "drains", "refills", "Fallback", "Causing", "Changed", "Severe", "Moderate", ""); + + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", + "", "", "under lock", "direct", "pagevec", "drain", "", "", "", "", "", "", "", ""); + + foreach $process_pid (keys %stats) { + # Dump final aggregates + if ($stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED}) { + $stats{$process_pid}->{HIGH_PCPU_DRAINS}++; + $stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0; + } + if ($stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED}) { + $stats{$process_pid}->{HIGH_PCPU_REFILLS}++; + $stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0; + } + + printf("%-" . $max_strlen . "s %8d %10d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d\n", + $process_pid, + $stats{$process_pid}->{MM_PAGE_ALLOC}, + $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}, + $stats{$process_pid}->{MM_PAGE_FREE_DIRECT}, + $stats{$process_pid}->{MM_PAGEVEC_FREE}, + $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN}, + $stats{$process_pid}->{HIGH_PCPU_DRAINS}, + $stats{$process_pid}->{HIGH_PCPU_REFILLS}, + $stats{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}, + $stats{$process_pid}->{HIGH_EXT_FRAG}, + $stats{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}, + $stats{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}, + $stats{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}, + $stats{$process_pid}->{EVENT_UNKNOWN}); + } +} + +sub aggregate_perprocesspid() { + my $process_pid; + my $process; + undef %perprocess; + + foreach $process_pid (keys %perprocesspid) { + $process = $process_pid; + $process =~ s/-([0-9])*$//; + if ($process eq '') { + $process = "NO_PROCESS_NAME"; + } + + $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}; + $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}; + $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}; + $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}; + $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}; + $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}; + $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}; + $perprocess{$process}->{MM_PAGE_ALLOC_EXTFRAG} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}; + $perprocess{$process}->{HIGH_EXT_FRAG} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}; + $perprocess{$process}->{HIGH_EXT_FRAGMENT_CHANGED} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}; + $perprocess{$process}->{HIGH_EXT_FRAGMENT_SEVERE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}; + $perprocess{$process}->{HIGH_EXT_FRAGMENT_MODERATE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}; + $perprocess{$process}->{EVENT_UNKNOWN} += $perprocesspid{$process_pid}->{EVENT_UNKNOWN}; + } +} + +sub report() { + if (!$opt_ignorepid) { + dump_stats(\%perprocesspid); + } else { + aggregate_perprocesspid(); + dump_stats(\%perprocess); + } +} + +# Process events or signals until neither is available +sub signal_loop() { + my $sigint_processed; + do { + $sigint_processed = 0; + process_events(); + + # Handle pending signals if any + if ($sigint_pending) { + my $current_time = time; + + if ($sigint_exit) { + print "Received exit signal\n"; + $sigint_pending = 0; + } + if ($sigint_report) { + if ($current_time >= $sigint_received + 2) { + report(); + $sigint_report = 0; + $sigint_pending = 0; + $sigint_processed = 1; + } + } + } + } while ($sigint_pending || $sigint_processed); +} + +signal_loop(); +report(); diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt new file mode 100644 index 00000000000..5eb4e487e66 --- /dev/null +++ b/Documentation/trace/tracepoint-analysis.txt @@ -0,0 +1,327 @@ + Notes on Analysing Behaviour Using Events and Tracepoints + + Documentation written by Mel Gorman + PCL information heavily based on email from Ingo Molnar + +1. Introduction +=============== + +Tracepoints (see Documentation/trace/tracepoints.txt) can be used without +creating custom kernel modules to register probe functions using the event +tracing infrastructure. + +Simplistically, tracepoints will represent an important event that when can +be taken in conjunction with other tracepoints to build a "Big Picture" of +what is going on within the system. There are a large number of methods for +gathering and interpreting these events. Lacking any current Best Practises, +this document describes some of the methods that can be used. + +This document assumes that debugfs is mounted on /sys/kernel/debug and that +the appropriate tracing options have been configured into the kernel. It is +assumed that the PCL tool tools/perf has been installed and is in your path. + +2. Listing Available Events +=========================== + +2.1 Standard Utilities +---------------------- + +All possible events are visible from /sys/kernel/debug/tracing/events. Simply +calling + + $ find /sys/kernel/debug/tracing/events -type d + +will give a fair indication of the number of events available. + +2.2 PCL +------- + +Discovery and enumeration of all counters and events, including tracepoints +are available with the perf tool. Getting a list of available events is a +simple case of + + $ perf list 2>&1 | grep Tracepoint + ext4:ext4_free_inode [Tracepoint event] + ext4:ext4_request_inode [Tracepoint event] + ext4:ext4_allocate_inode [Tracepoint event] + ext4:ext4_write_begin [Tracepoint event] + ext4:ext4_ordered_write_end [Tracepoint event] + [ .... remaining output snipped .... ] + + +2. Enabling Events +================== + +2.1 System-Wide Event Enabling +------------------------------ + +See Documentation/trace/events.txt for a proper description on how events +can be enabled system-wide. A short example of enabling all events related +to page allocation would look something like + + $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done + +2.2 System-Wide Event Enabling with SystemTap +--------------------------------------------- + +In SystemTap, tracepoints are accessible using the kernel.trace() function +call. The following is an example that reports every 5 seconds what processes +were allocating the pages. + + global page_allocs + + probe kernel.trace("mm_page_alloc") { + page_allocs[execname()]++ + } + + function print_count() { + printf ("%-25s %-s\n", "#Pages Allocated", "Process Name") + foreach (proc in page_allocs-) + printf("%-25d %s\n", page_allocs[proc], proc) + printf ("\n") + delete page_allocs + } + + probe timer.s(5) { + print_count() + } + +2.3 System-Wide Event Enabling with PCL +--------------------------------------- + +By specifying the -a switch and analysing sleep, the system-wide events +for a duration of time can be examined. + + $ perf stat -a \ + -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free \ + sleep 10 + Performance counter stats for 'sleep 10': + + 9630 kmem:mm_page_alloc + 2143 kmem:mm_page_free_direct + 7424 kmem:mm_pagevec_free + + 10.002577764 seconds time elapsed + +Similarly, one could execute a shell and exit it as desired to get a report +at that point. + +2.4 Local Event Enabling +------------------------ + +Documentation/trace/ftrace.txt describes how to enable events on a per-thread +basis using set_ftrace_pid. + +2.5 Local Event Enablement with PCL +----------------------------------- + +Events can be activate and tracked for the duration of a process on a local +basis using PCL such as follows. + + $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free ./hackbench 10 + Time: 0.909 + + Performance counter stats for './hackbench 10': + + 17803 kmem:mm_page_alloc + 12398 kmem:mm_page_free_direct + 4827 kmem:mm_pagevec_free + + 0.973913387 seconds time elapsed + +3. Event Filtering +================== + +Documentation/trace/ftrace.txt covers in-depth how to filter events in +ftrace. Obviously using grep and awk of trace_pipe is an option as well +as any script reading trace_pipe. + +4. Analysing Event Variances with PCL +===================================== + +Any workload can exhibit variances between runs and it can be important +to know what the standard deviation in. By and large, this is left to the +performance analyst to do it by hand. In the event that the discrete event +occurrences are useful to the performance analyst, then perf can be used. + + $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct + -e kmem:mm_pagevec_free ./hackbench 10 + Time: 0.890 + Time: 0.895 + Time: 0.915 + Time: 1.001 + Time: 0.899 + + Performance counter stats for './hackbench 10' (5 runs): + + 16630 kmem:mm_page_alloc ( +- 3.542% ) + 11486 kmem:mm_page_free_direct ( +- 4.771% ) + 4730 kmem:mm_pagevec_free ( +- 2.325% ) + + 0.982653002 seconds time elapsed ( +- 1.448% ) + +In the event that some higher-level event is required that depends on some +aggregation of discrete events, then a script would need to be developed. + +Using --repeat, it is also possible to view how events are fluctuating over +time on a system wide basis using -a and sleep. + + $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free \ + -a --repeat 10 \ + sleep 1 + Performance counter stats for 'sleep 1' (10 runs): + + 1066 kmem:mm_page_alloc ( +- 26.148% ) + 182 kmem:mm_page_free_direct ( +- 5.464% ) + 890 kmem:mm_pagevec_free ( +- 30.079% ) + + 1.002251757 seconds time elapsed ( +- 0.005% ) + +5. Higher-Level Analysis with Helper Scripts +============================================ + +When events are enabled the events that are triggering can be read from +/sys/kernel/debug/tracing/trace_pipe in human-readable format although binary +options exist as well. By post-processing the output, further information can +be gathered on-line as appropriate. Examples of post-processing might include + + o Reading information from /proc for the PID that triggered the event + o Deriving a higher-level event from a series of lower-level events. + o Calculate latencies between two events + +Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example +script that can read trace_pipe from STDIN or a copy of a trace. When used +on-line, it can be interrupted once to generate a report without existing +and twice to exit. + +Simplistically, the script just reads STDIN and counts up events but it +also can do more such as + + o Derive high-level events from many low-level events. If a number of pages + are freed to the main allocator from the per-CPU lists, it recognises + that as one per-CPU drain even though there is no specific tracepoint + for that event + o It can aggregate based on PID or individual process number + o In the event memory is getting externally fragmented, it reports + on whether the fragmentation event was severe or moderate. + o When receiving an event about a PID, it can record who the parent was so + that if large numbers of events are coming from very short-lived + processes, the parent process responsible for creating all the helpers + can be identified + +6. Lower-Level Analysis with PCL +================================ + +There may also be a requirement to identify what functions with a program +were generating events within the kernel. To begin this sort of analysis, the +data must be recorded. At the time of writing, this required root + + $ perf record -c 1 \ + -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free \ + ./hackbench 10 + Time: 0.894 + [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ] + +Note the use of '-c 1' to set the event period to sample. The default sample +period is quite high to minimise overhead but the information collected can be +very coarse as a result. + +This record outputted a file called perf.data which can be analysed using +perf report. + + $ perf report + # Samples: 30922 + # + # Overhead Command Shared Object + # ........ ......... ................................ + # + 87.27% hackbench [vdso] + 6.85% hackbench /lib/i686/cmov/libc-2.9.so + 2.62% hackbench /lib/ld-2.9.so + 1.52% perf [vdso] + 1.22% hackbench ./hackbench + 0.48% hackbench [kernel] + 0.02% perf /lib/i686/cmov/libc-2.9.so + 0.01% perf /usr/bin/perf + 0.01% perf /lib/ld-2.9.so + 0.00% hackbench /lib/i686/cmov/libpthread-2.9.so + # + # (For more details, try: perf report --sort comm,dso,symbol) + # + +According to this, the vast majority of events occured triggered on events +within the VDSO. With simple binaries, this will often be the case so lets +take a slightly different example. In the course of writing this, it was +noticed that X was generating an insane amount of page allocations so lets look +at it + + $ perf record -c 1 -f \ + -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free \ + -p `pidof X` + +This was interrupted after a few seconds and + + $ perf report + # Samples: 27666 + # + # Overhead Command Shared Object + # ........ ....... ....................................... + # + 51.95% Xorg [vdso] + 47.95% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 + 0.09% Xorg /lib/i686/cmov/libc-2.9.so + 0.01% Xorg [kernel] + # + # (For more details, try: perf report --sort comm,dso,symbol) + # + +So, almost half of the events are occuring in a library. To get an idea which +symbol. + + $ perf report --sort comm,dso,symbol + # Samples: 27666 + # + # Overhead Command Shared Object Symbol + # ........ ....... ....................................... ...... + # + 51.95% Xorg [vdso] [.] 0x000000ffffe424 + 47.93% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixmanFillsse2 + 0.09% Xorg /lib/i686/cmov/libc-2.9.so [.] _int_malloc + 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixman_region32_copy_f + 0.01% Xorg [kernel] [k] read_hpet + 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] get_fast_path + 0.00% Xorg [kernel] [k] ftrace_trace_userstack + +To see where within the function pixmanFillsse2 things are going wrong + + $ perf annotate pixmanFillsse2 + [ ... ] + 0.00 : 34eeb: 0f 18 08 prefetcht0 (%eax) + : } + : + : extern __inline void __attribute__((__gnu_inline__, __always_inline__, _ + : _mm_store_si128 (__m128i *__P, __m128i __B) : { + : *__P = __B; + 12.40 : 34eee: 66 0f 7f 80 40 ff ff movdqa %xmm0,-0xc0(%eax) + 0.00 : 34ef5: ff + 12.40 : 34ef6: 66 0f 7f 80 50 ff ff movdqa %xmm0,-0xb0(%eax) + 0.00 : 34efd: ff + 12.39 : 34efe: 66 0f 7f 80 60 ff ff movdqa %xmm0,-0xa0(%eax) + 0.00 : 34f05: ff + 12.67 : 34f06: 66 0f 7f 80 70 ff ff movdqa %xmm0,-0x90(%eax) + 0.00 : 34f0d: ff + 12.58 : 34f0e: 66 0f 7f 40 80 movdqa %xmm0,-0x80(%eax) + 12.31 : 34f13: 66 0f 7f 40 90 movdqa %xmm0,-0x70(%eax) + 12.40 : 34f18: 66 0f 7f 40 a0 movdqa %xmm0,-0x60(%eax) + 12.31 : 34f1d: 66 0f 7f 40 b0 movdqa %xmm0,-0x50(%eax) + +At a glance, it looks like the time is being spent copying pixmaps to +the card. Further investigation would be needed to determine why pixmaps +are being copied around so much but a starting point would be to take an +ancient build of libpixmap out of the library path where it was totally +forgotten about from months ago! diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 2f77ced35df..e57d6a9dd32 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -6,6 +6,8 @@ balance - various information on memory balancing. hugetlbpage.txt - a brief summary of hugetlbpage support in the Linux kernel. +ksm.txt + - how to use the Kernel Samepage Merging feature. locking - info on how locking and synchronization is done in the Linux vm code. numa @@ -20,3 +22,5 @@ slabinfo.c - source code for a tool to get reports about slabs. slub.txt - a short users guide for SLUB. +map_hugetlb.c + - an example program that uses the MAP_HUGETLB mmap flag. diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index ea8714fcc3a..82a7bd1800b 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -18,13 +18,13 @@ First the Linux kernel needs to be built with the CONFIG_HUGETLBFS automatically when CONFIG_HUGETLBFS is selected) configuration options. -The kernel built with hugepage support should show the number of configured -hugepages in the system by running the "cat /proc/meminfo" command. +The kernel built with huge page support should show the number of configured +huge pages in the system by running the "cat /proc/meminfo" command. /proc/meminfo also provides information about the total number of hugetlb pages configured in the kernel. It also displays information about the number of free hugetlb pages at any time. It also displays information about -the configured hugepage size - this is needed for generating the proper +the configured huge page size - this is needed for generating the proper alignment and size of the arguments to the above system calls. The output of "cat /proc/meminfo" will have lines like: @@ -37,25 +37,27 @@ HugePages_Surp: yyy Hugepagesize: zzz kB where: -HugePages_Total is the size of the pool of hugepages. -HugePages_Free is the number of hugepages in the pool that are not yet -allocated. -HugePages_Rsvd is short for "reserved," and is the number of hugepages -for which a commitment to allocate from the pool has been made, but no -allocation has yet been made. It's vaguely analogous to overcommit. -HugePages_Surp is short for "surplus," and is the number of hugepages in -the pool above the value in /proc/sys/vm/nr_hugepages. The maximum -number of surplus hugepages is controlled by -/proc/sys/vm/nr_overcommit_hugepages. +HugePages_Total is the size of the pool of huge pages. +HugePages_Free is the number of huge pages in the pool that are not yet + allocated. +HugePages_Rsvd is short for "reserved," and is the number of huge pages for + which a commitment to allocate from the pool has been made, + but no allocation has yet been made. Reserved huge pages + guarantee that an application will be able to allocate a + huge page from the pool of huge pages at fault time. +HugePages_Surp is short for "surplus," and is the number of huge pages in + the pool above the value in /proc/sys/vm/nr_hugepages. The + maximum number of surplus huge pages is controlled by + /proc/sys/vm/nr_overcommit_hugepages. /proc/filesystems should also show a filesystem of type "hugetlbfs" configured in the kernel. /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb pages in the kernel. Super user can dynamically request more (or free some -pre-configured) hugepages. +pre-configured) huge pages. The allocation (or deallocation) of hugetlb pages is possible only if there are -enough physically contiguous free pages in system (freeing of hugepages is +enough physically contiguous free pages in system (freeing of huge pages is possible only if there are enough hugetlb pages free that can be transferred back to regular memory pool). @@ -67,43 +69,82 @@ use either the mmap system call or shared memory system calls to start using the huge pages. It is required that the system administrator preallocate enough memory for huge page purposes. -Use the following command to dynamically allocate/deallocate hugepages: +The administrator can preallocate huge pages on the kernel boot command line by +specifying the "hugepages=N" parameter, where 'N' = the number of huge pages +requested. This is the most reliable method for preallocating huge pages as +memory has not yet become fragmented. + +Some platforms support multiple huge page sizes. To preallocate huge pages +of a specific size, one must preceed the huge pages boot command parameters +with a huge page size selection parameter "hugepagesz=<size>". <size> must +be specified in bytes with optional scale suffix [kKmMgG]. The default huge +page size may be selected with the "default_hugepagesz=<size>" boot parameter. + +/proc/sys/vm/nr_hugepages indicates the current number of configured [default +size] hugetlb pages in the kernel. Super user can dynamically request more +(or free some pre-configured) huge pages. + +Use the following command to dynamically allocate/deallocate default sized +huge pages: echo 20 > /proc/sys/vm/nr_hugepages -This command will try to configure 20 hugepages in the system. The success -or failure of allocation depends on the amount of physically contiguous -memory that is preset in system at this time. System administrators may want -to put this command in one of the local rc init files. This will enable the -kernel to request huge pages early in the boot process (when the possibility -of getting physical contiguous pages is still very high). In either -case, administrators will want to verify the number of hugepages actually -allocated by checking the sysctl or meminfo. - -/proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of -hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are -requested by applications. echo'ing any non-zero value into this file -indicates that the hugetlb subsystem is allowed to try to obtain -hugepages from the buddy allocator, if the normal pool is exhausted. As -these surplus hugepages go out of use, they are freed back to the buddy +This command will try to configure 20 default sized huge pages in the system. +On a NUMA platform, the kernel will attempt to distribute the huge page pool +over the all on-line nodes. These huge pages, allocated when nr_hugepages +is increased, are called "persistent huge pages". + +The success or failure of huge page allocation depends on the amount of +physically contiguous memory that is preset in system at the time of the +allocation attempt. If the kernel is unable to allocate huge pages from +some nodes in a NUMA system, it will attempt to make up the difference by +allocating extra pages on other nodes with sufficient available contiguous +memory, if any. + +System administrators may want to put this command in one of the local rc init +files. This will enable the kernel to request huge pages early in the boot +process when the possibility of getting physical contiguous pages is still +very high. Administrators can verify the number of huge pages actually +allocated by checking the sysctl or meminfo. To check the per node +distribution of huge pages in a NUMA system, use: + + cat /sys/devices/system/node/node*/meminfo | fgrep Huge + +/proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of +huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are +requested by applications. Writing any non-zero value into this file +indicates that the hugetlb subsystem is allowed to try to obtain "surplus" +huge pages from the buddy allocator, when the normal pool is exhausted. As +these surplus huge pages go out of use, they are freed back to the buddy allocator. +When increasing the huge page pool size via nr_hugepages, any surplus +pages will first be promoted to persistent huge pages. Then, additional +huge pages will be allocated, if necessary and if possible, to fulfill +the new huge page pool size. + +The administrator may shrink the pool of preallocated huge pages for +the default huge page size by setting the nr_hugepages sysctl to a +smaller value. The kernel will attempt to balance the freeing of huge pages +across all on-line nodes. Any free huge pages on the selected nodes will +be freed back to the buddy allocator. + Caveat: Shrinking the pool via nr_hugepages such that it becomes less -than the number of hugepages in use will convert the balance to surplus +than the number of huge pages in use will convert the balance to surplus huge pages even if it would exceed the overcommit value. As long as this condition holds, however, no more surplus huge pages will be allowed on the system until one of the two sysctls are increased sufficiently, or the surplus huge pages go out of use and are freed. -With support for multiple hugepage pools at run-time available, much of -the hugepage userspace interface has been duplicated in sysfs. The above -information applies to the default hugepage size (which will be -controlled by the proc interfaces for backwards compatibility). The root -hugepage control directory is +With support for multiple huge page pools at run-time available, much of +the huge page userspace interface has been duplicated in sysfs. The above +information applies to the default huge page size which will be +controlled by the /proc interfaces for backwards compatibility. The root +huge page control directory in sysfs is: /sys/kernel/mm/hugepages -For each hugepage size supported by the running kernel, a subdirectory +For each huge page size supported by the running kernel, a subdirectory will exist, of the form hugepages-${size}kB @@ -116,9 +157,9 @@ Inside each of these directories, the same set of files will exist: resv_hugepages surplus_hugepages -which function as described above for the default hugepage-sized case. +which function as described above for the default huge page-sized case. -If the user applications are going to request hugepages using mmap system +If the user applications are going to request huge pages using mmap system call, then it is required that system administrator mount a file system of type hugetlbfs: @@ -127,7 +168,7 @@ type hugetlbfs: none /mnt/huge This command mounts a (pseudo) filesystem of type hugetlbfs on the directory -/mnt/huge. Any files created on /mnt/huge uses hugepages. The uid and gid +/mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid options sets the owner and group of the root of the file system. By default the uid and gid of the current process are taken. The mode option sets the mode of root of file system to value & 0777. This value is given in octal. @@ -146,24 +187,26 @@ Regular chown, chgrp, and chmod commands (with right permissions) could be used to change the file attributes on hugetlbfs. Also, it is important to note that no such mount command is required if the -applications are going to use only shmat/shmget system calls. Users who -wish to use hugetlb page via shared memory segment should be a member of -a supplementary group and system admin needs to configure that gid into -/proc/sys/vm/hugetlb_shm_group. It is possible for same or different -applications to use any combination of mmaps and shm* calls, though the -mount of filesystem will be required for using mmap calls. +applications are going to use only shmat/shmget system calls or mmap with +MAP_HUGETLB. Users who wish to use hugetlb page via shared memory segment +should be a member of a supplementary group and system admin needs to +configure that gid into /proc/sys/vm/hugetlb_shm_group. It is possible for +same or different applications to use any combination of mmaps and shm* +calls, though the mount of filesystem will be required for using mmap calls +without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see +map_hugetlb.c. ******************************************************************* /* - * Example of using hugepage memory in a user application using Sys V shared + * Example of using huge page memory in a user application using Sys V shared * memory system calls. In this example the app is requesting 256MB of * memory that is backed by huge pages. The application uses the flag * SHM_HUGETLB in the shmget system call to inform the kernel that it is - * requesting hugepages. + * requesting huge pages. * * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * hugepages. That means the addresses starting with 0x800000... will need + * huge pages. That means the addresses starting with 0x800000... will need * to be specified. Specifying a fixed address is not required on ppc64, * i386 or x86_64. * @@ -252,14 +295,14 @@ int main(void) ******************************************************************* /* - * Example of using hugepage memory in a user application using the mmap + * Example of using huge page memory in a user application using the mmap * system call. Before running this application, make sure that the * administrator has mounted the hugetlbfs filesystem (on some directory * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this * example, the app is requesting memory of size 256MB that is backed by * huge pages. * - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages. * That means the addresses starting with 0x800000... will need to be * specified. Specifying a fixed address is not required on ppc64, i386 * or x86_64. diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt new file mode 100644 index 00000000000..72a22f65960 --- /dev/null +++ b/Documentation/vm/ksm.txt @@ -0,0 +1,89 @@ +How to use the Kernel Samepage Merging feature +---------------------------------------------- + +KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, +added to the Linux kernel in 2.6.32. See mm/ksm.c for its implementation, +and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/ + +The KSM daemon ksmd periodically scans those areas of user memory which +have been registered with it, looking for pages of identical content which +can be replaced by a single write-protected page (which is automatically +copied if a process later wants to update its content). + +KSM was originally developed for use with KVM (where it was known as +Kernel Shared Memory), to fit more virtual machines into physical memory, +by sharing the data common between them. But it can be useful to any +application which generates many instances of the same data. + +KSM only merges anonymous (private) pages, never pagecache (file) pages. +KSM's merged pages are at present locked into kernel memory for as long +as they are shared: so cannot be swapped out like the user pages they +replace (but swapping KSM pages should follow soon in a later release). + +KSM only operates on those areas of address space which an application +has advised to be likely candidates for merging, by using the madvise(2) +system call: int madvise(addr, length, MADV_MERGEABLE). + +The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel +that advice and restore unshared pages: whereupon KSM unmerges whatever +it merged in that range. Note: this unmerging call may suddenly require +more memory than is available - possibly failing with EAGAIN, but more +probably arousing the Out-Of-Memory killer. + +If KSM is not configured into the running kernel, madvise MADV_MERGEABLE +and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was +built with CONFIG_KSM=y, those calls will normally succeed: even if the +the KSM daemon is not currently running, MADV_MERGEABLE still registers +the range for whenever the KSM daemon is started; even if the range +cannot contain any pages which KSM could actually merge; even if +MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. + +Like other madvise calls, they are intended for use on mapped areas of +the user address space: they will report ENOMEM if the specified range +includes unmapped gaps (though working on the intervening mapped areas), +and might fail with EAGAIN if not enough memory for internal structures. + +Applications should be considerate in their use of MADV_MERGEABLE, +restricting its use to areas likely to benefit. KSM's scans may use +a lot of processing power, and its kernel-resident pages are a limited +resource. Some installations will disable KSM for these reasons. + +The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, +readable by all but writable only by root: + +max_kernel_pages - set to maximum number of kernel pages that KSM may use + e.g. "echo 2000 > /sys/kernel/mm/ksm/max_kernel_pages" + Value 0 imposes no limit on the kernel pages KSM may use; + but note that any process using MADV_MERGEABLE can cause + KSM to allocate these pages, unswappable until it exits. + Default: 2000 (chosen for demonstration purposes) + +pages_to_scan - how many present pages to scan before ksmd goes to sleep + e.g. "echo 200 > /sys/kernel/mm/ksm/pages_to_scan" + Default: 200 (chosen for demonstration purposes) + +sleep_millisecs - how many milliseconds ksmd should sleep before next scan + e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" + Default: 20 (chosen for demonstration purposes) + +run - set 0 to stop ksmd from running but keep merged pages, + set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", + set 2 to stop ksmd and unmerge all pages currently merged, + but leave mergeable areas registered for next run + Default: 1 (for immediate use by apps which register) + +The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: + +pages_shared - how many shared unswappable kernel pages KSM is using +pages_sharing - how many more sites are sharing them i.e. how much saved +pages_unshared - how many pages unique but repeatedly checked for merging +pages_volatile - how many pages changing too fast to be placed in a tree +full_scans - how many times all mergeable areas have been scanned + +A high ratio of pages_sharing to pages_shared indicates good sharing, but +a high ratio of pages_unshared to pages_sharing indicates wasted effort. +pages_volatile embraces several different kinds of activity, but a high +proportion there would also indicate poor use of madvise MADV_MERGEABLE. + +Izik Eidus, +Hugh Dickins, 30 July 2009 diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c new file mode 100644 index 00000000000..e2bdae37f49 --- /dev/null +++ b/Documentation/vm/map_hugetlb.c @@ -0,0 +1,77 @@ +/* + * Example of using hugepage memory in a user application using the mmap + * system call with MAP_HUGETLB flag. Before running this program make + * sure the administrator has allocated enough default sized huge pages + * to cover the 256 MB allocation. + * + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <fcntl.h> + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40 +#endif + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +void write_bytes(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +void read_bytes(char *addr) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < LENGTH; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + break; + } +} + +int main(void) +{ + void *addr; + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr); + read_bytes(addr); + + munmap(addr, LENGTH); + + return 0; +} diff --git a/MAINTAINERS b/MAINTAINERS index 751a307dc44..d24c8823a8c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -233,6 +233,7 @@ S: Supported F: drivers/acpi/ F: drivers/pnp/pnpacpi/ F: include/linux/acpi.h +F: include/acpi/ ACPI BATTERY DRIVERS M: Alexey Starikovskiy <astarikovskiy@suse.de> @@ -497,7 +498,7 @@ F: arch/arm/include/asm/floppy.h ARM PORT M: Russell King <linux@arm.linux.org.uk> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained F: arch/arm/ @@ -508,36 +509,36 @@ F: drivers/mmc/host/mmci.* ARM/ADI ROADRUNNER MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-ixp23xx/ F: arch/arm/mach-ixp23xx/include/mach/ ARM/ADS SPHERE MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/AFEB9260 MACHINE SUPPORT M: Sergey Lapin <slapin@ossfans.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/AJECO 1ARM MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/ATMEL AT91RM9200 ARM ARCHITECTURE M: Andrew Victor <linux@maxim.org.za> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://maxim.org.za/at91_26.html S: Maintained ARM/BCMRING ARM ARCHITECTURE M: Leo Chen <leochen@broadcom.com> M: Scott Branden <sbranden@broadcom.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-bcmring @@ -554,25 +555,25 @@ F: drivers/mtd/nand/nand_bcm_umi.h ARM/CIRRUS LOGIC EP93XX ARM ARCHITECTURE M: Hartley Sweeten <hsweeten@visionengravers.com> M: Ryan Mallon <ryan@bluewatersys.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-ep93xx/ F: arch/arm/mach-ep93xx/include/mach/ ARM/CIRRUS LOGIC EDB9315A MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/CLKDEV SUPPORT M: Russell King <linux@arm.linux.org.uk> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) F: arch/arm/common/clkdev.c F: arch/arm/include/asm/clkdev.h ARM/COMPULAB CM-X270/EM-X270 and CM-X300 MACHINE SUPPORT M: Mike Rapoport <mike@compulab.co.il> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/CORGI MACHINE SUPPORT @@ -581,14 +582,14 @@ S: Maintained ARM/CORTINA SYSTEMS GEMINI ARM ARCHITECTURE M: Paulius Zaleckas <paulius.zaleckas@teltonika.lt> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://gitorious.org/linux-gemini/mainline.git S: Maintained F: arch/arm/mach-gemini/ ARM/EBSA110 MACHINE SUPPORT M: Russell King <linux@arm.linux.org.uk> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained F: arch/arm/mach-ebsa110/ @@ -606,13 +607,13 @@ F: arch/arm/mach-pxa/ezx.c ARM/FARADAY FA526 PORT M: Paulius Zaleckas <paulius.zaleckas@teltonika.lt> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mm/*-fa* ARM/FOOTBRIDGE ARCHITECTURE M: Russell King <linux@arm.linux.org.uk> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained F: arch/arm/include/asm/hardware/dec21285.h @@ -620,17 +621,17 @@ F: arch/arm/mach-footbridge/ ARM/FREESCALE IMX / MXC ARM ARCHITECTURE M: Sascha Hauer <kernel@pengutronix.de> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/GLOMATION GESBC9312SX MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/GUMSTIX MACHINE SUPPORT M: Steve Sakoman <sakoman@gmail.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/H4700 (HP IPAQ HX4700) MACHINE SUPPORT @@ -650,55 +651,55 @@ F: arch/arm/mach-sa1100/include/mach/jornada720.h ARM/INTEL IOP32X ARM ARCHITECTURE M: Lennert Buytenhek <kernel@wantstofly.org> M: Dan Williams <dan.j.williams@intel.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported ARM/INTEL IOP33X ARM ARCHITECTURE M: Dan Williams <dan.j.williams@intel.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported ARM/INTEL IOP13XX ARM ARCHITECTURE M: Lennert Buytenhek <kernel@wantstofly.org> M: Dan Williams <dan.j.williams@intel.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported ARM/INTEL IQ81342EX MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> M: Dan Williams <dan.j.williams@intel.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported ARM/INTEL IXP2000 ARM ARCHITECTURE M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/INTEL IXDP2850 MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/INTEL IXP23XX ARM ARCHITECTURE M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/INTEL XSC3 (MANZANO) ARM CORE M: Lennert Buytenhek <kernel@wantstofly.org> M: Dan Williams <dan.j.williams@intel.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported ARM/IP FABRICS DOUBLE ESPRESSO MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/LOGICPD PXA270 MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/MAGICIAN MACHINE SUPPORT @@ -708,7 +709,7 @@ S: Maintained ARM/Marvell Loki/Kirkwood/MV78xx0/Orion SOC support M: Lennert Buytenhek <buytenh@marvell.com> M: Nicolas Pitre <nico@marvell.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://git.marvell.com/orion S: Maintained F: arch/arm/mach-loki/ @@ -719,7 +720,7 @@ F: arch/arm/plat-orion/ ARM/MIOA701 MACHINE SUPPORT M: Robert Jarzmik <robert.jarzmik@free.fr> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) F: arch/arm/mach-pxa/mioa701.c S: Maintained @@ -760,18 +761,18 @@ S: Maintained ARM/PT DIGITAL BOARD PORT M: Stefan Eletzhofer <stefan.eletzhofer@eletztrick.de> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained ARM/RADISYS ENP2611 MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/RISCPC ARCHITECTURE M: Russell King <linux@arm.linux.org.uk> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained F: arch/arm/common/time-acorn.c @@ -790,7 +791,7 @@ S: Maintained ARM/SAMSUNG ARM ARCHITECTURES M: Ben Dooks <ben-linux@fluff.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/plat-s3c/ @@ -798,65 +799,65 @@ F: arch/arm/plat-s3c24xx/ ARM/S3C2410 ARM ARCHITECTURE M: Ben Dooks <ben-linux@fluff.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c2410/ ARM/S3C2440 ARM ARCHITECTURE M: Ben Dooks <ben-linux@fluff.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c2440/ ARM/S3C2442 ARM ARCHITECTURE M: Ben Dooks <ben-linux@fluff.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c2442/ ARM/S3C2443 ARM ARCHITECTURE M: Ben Dooks <ben-linux@fluff.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c2443/ ARM/S3C6400 ARM ARCHITECTURE M: Ben Dooks <ben-linux@fluff.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c6400/ ARM/S3C6410 ARM ARCHITECTURE M: Ben Dooks <ben-linux@fluff.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/mach-s3c6410/ ARM/TECHNOLOGIC SYSTEMS TS7250 MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/THECUS N2100 MACHINE SUPPORT M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained ARM/NUVOTON W90X900 ARM ARCHITECTURE M: Wan ZongShun <mcuos.com@gmail.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.mcuos.com S: Maintained ARM/VFP SUPPORT M: Russell King <linux@arm.linux.org.uk> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained F: arch/arm/vfp/ @@ -963,7 +964,7 @@ F: include/linux/atm* ATMEL AT91 MCI DRIVER M: Nicolas Ferre <nicolas.ferre@atmel.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.atmel.com/products/AT91/ W: http://www.at91.com/ S: Maintained @@ -1541,7 +1542,7 @@ F: drivers/infiniband/hw/cxgb3/ CYBERPRO FB DRIVER M: Russell King <linux@arm.linux.org.uk> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.arm.linux.org.uk/ S: Maintained F: drivers/video/cyber2000fb.* @@ -2085,7 +2086,7 @@ F: drivers/i2c/busses/i2c-cpm.c FREESCALE IMX / MXC FRAMEBUFFER DRIVER M: Sascha Hauer <kernel@pengutronix.de> L: linux-fbdev-devel@lists.sourceforge.net (moderated for non-subscribers) -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/plat-mxc/include/mach/imxfb.h F: drivers/video/imxfb.c @@ -2803,6 +2804,8 @@ L: netdev@vger.kernel.org L: lvs-devel@vger.kernel.org S: Maintained F: Documentation/networking/ipvs-sysctl.txt +F: include/net/ip_vs.h +F: include/linux/ip_vs.h F: net/netfilter/ipvs/ IPWIRELESS DRIVER @@ -2955,7 +2958,7 @@ F: scripts/Makefile.* KERNEL JANITORS L: kernel-janitors@vger.kernel.org W: http://www.kerneljanitors.org/ -S: Odd fixes +S: Maintained KERNEL NFSD, SUNRPC, AND LOCKD SERVERS M: "J. Bruce Fields" <bfields@fieldses.org> @@ -3449,7 +3452,7 @@ F: include/linux/meye.h MOTOROLA IMX MMC/SD HOST CONTROLLER INTERFACE DRIVER M: Pavel Pisa <ppisa@pikron.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: drivers/mmc/host/imxmmc.* @@ -3734,7 +3737,7 @@ W: http://www.muru.com/linux/omap/ W: http://linux.omap.com/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap-2.6.git S: Maintained -F: arch/arm/*omap* +F: arch/arm/*omap*/ OMAP CLOCK FRAMEWORK SUPPORT M: Paul Walmsley <paul@pwsan.com> @@ -4025,8 +4028,7 @@ F: drivers/block/pktcdvd.c F: include/linux/pktcdvd.h PMC SIERRA MaxRAID DRIVER -P: Anil Ravindranath -M: anil_ravindranath@pmc-sierra.com +M: Anil Ravindranath <anil_ravindranath@pmc-sierra.com> L: linux-scsi@vger.kernel.org W: http://www.pmc-sierra.com/ S: Supported @@ -4168,7 +4170,7 @@ F: drivers/media/video/pvrusb2/ PXA2xx/PXA3xx SUPPORT M: Eric Miao <eric.y.miao@gmail.com> M: Russell King <linux@arm.linux.org.uk> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-pxa/ F: drivers/pcmcia/pxa2xx* @@ -4181,13 +4183,13 @@ F: sound/soc/pxa PXA168 SUPPORT M: Eric Miao <eric.y.miao@gmail.com> M: Jason Chagas <jason.chagas@marvell.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://git.kernel.org/pub/scm/linux/kernel/git/ycmiao/pxa-linux-2.6.git S: Maintained PXA910 SUPPORT M: Eric Miao <eric.y.miao@gmail.com> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://git.kernel.org/pub/scm/linux/kernel/git/ycmiao/pxa-linux-2.6.git S: Maintained @@ -4428,7 +4430,7 @@ F: net/iucv/ S3C24XX SD/MMC Driver M: Ben Dooks <ben-linux@fluff.org> -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported F: drivers/mmc/host/s3cmci.* @@ -4533,20 +4535,20 @@ S: Maintained F: drivers/mmc/host/sdricoh_cs.c SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) DRIVER -M: Pierre Ossman <pierre@ossman.eu> -L: sdhci-devel@lists.ossman.eu -S: Maintained +S: Orphan +L: linux-mmc@vger.kernel.org +F: drivers/mmc/host/sdhci.* SECURE DIGITAL HOST CONTROLLER INTERFACE, OPEN FIRMWARE BINDINGS (SDHCI-OF) M: Anton Vorontsov <avorontsov@ru.mvista.com> L: linuxppc-dev@ozlabs.org -L: sdhci-devel@lists.ossman.eu +L: linux-mmc@vger.kernel.org S: Maintained -F: drivers/mmc/host/sdhci.* +F: drivers/mmc/host/sdhci-of.* SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) SAMSUNG DRIVER M: Ben Dooks <ben-linux@fluff.org> -L: sdhci-devel@lists.ossman.eu +L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/sdhci-s3c.c @@ -4632,7 +4634,7 @@ F: drivers/misc/sgi-xp/ SHARP LH SUPPORT (LH7952X & LH7A40X) M: Marc Singer <elf@buici.com> W: http://projects.buici.com/arm -L: linux-arm-kernel@lists.arm.linux.org.uk (subscribers-only) +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen F: arch/arm/mach-lh7a40x/ @@ -5638,6 +5640,12 @@ L: linux-scsi@vger.kernel.org S: Maintained F: drivers/scsi/wd7000.c +WINBOND CIR DRIVER +P: David Härdeman +M: david@hardeman.nu +S: Maintained +F: drivers/input/misc/winbond-cir.c + WIMAX STACK M: Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> M: linux-wimax@intel.com @@ -5657,8 +5665,7 @@ S: Maintained F: drivers/input/misc/wistron_btns.c WL1251 WIRELESS DRIVER -P: Kalle Valo -M: kalle.valo@nokia.com +M: Kalle Valo <kalle.valo@nokia.com> L: linux-wireless@vger.kernel.org W: http://wireless.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9fb8aae5c39..443448154f3 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -45,6 +45,14 @@ config GENERIC_CALIBRATE_DELAY bool default y +config GENERIC_TIME + bool + default y + +config ARCH_USES_GETTIMEOFFSET + bool + default y + config ZONE_DMA bool default y diff --git a/arch/alpha/boot/tools/objstrip.c b/arch/alpha/boot/tools/objstrip.c index ef183823029..9d0727d18ae 100644 --- a/arch/alpha/boot/tools/objstrip.c +++ b/arch/alpha/boot/tools/objstrip.c @@ -93,7 +93,7 @@ main (int argc, char *argv[]) ofd = 1; if (i < argc) { ofd = open(argv[i++], O_WRONLY | O_CREAT | O_TRUNC, 0666); - if (fd == -1) { + if (ofd == -1) { perror("open"); exit(1); } diff --git a/arch/alpha/include/asm/hardirq.h b/arch/alpha/include/asm/hardirq.h index 88971460fa6..242c09ba98c 100644 --- a/arch/alpha/include/asm/hardirq.h +++ b/arch/alpha/include/asm/hardirq.h @@ -1,17 +1,9 @@ #ifndef _ALPHA_HARDIRQ_H #define _ALPHA_HARDIRQ_H -#include <linux/threads.h> -#include <linux/cache.h> - - -/* entry.S is sensitive to the offsets of these fields */ -typedef struct { - unsigned long __softirq_pending; -} ____cacheline_aligned irq_cpustat_t; - -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ - void ack_bad_irq(unsigned int irq); +#define ack_bad_irq ack_bad_irq + +#include <asm-generic/hardirq.h> #endif /* _ALPHA_HARDIRQ_H */ diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h index 90d7c35d286..99c56d47879 100644 --- a/arch/alpha/include/asm/mman.h +++ b/arch/alpha/include/asm/mman.h @@ -28,6 +28,8 @@ #define MAP_NORESERVE 0x10000 /* don't check for reservations */ #define MAP_POPULATE 0x20000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x40000 /* do not block on IO */ +#define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x100000 /* create a huge page mapping */ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ @@ -48,6 +50,9 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ +#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index bfb880af959..d15aedfe606 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -268,11 +268,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, assume it doesn't support sg mapping, and, since we tried to use direct_map above, it now must be considered an error. */ if (! alpha_mv.mv_pci_tbi) { - static int been_here = 0; /* Only print the message once. */ - if (!been_here) { - printk(KERN_WARNING "pci_map_single: no HW sg\n"); - been_here = 1; - } + printk_once(KERN_WARNING "pci_map_single: no HW sg\n"); return 0; } diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index b04e2cbf23a..5d0826654c6 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -408,28 +408,17 @@ time_init(void) * part. So we can't do the "find absolute time in terms of cycles" thing * that the other ports do. */ -void -do_gettimeofday(struct timeval *tv) +u32 arch_gettimeoffset(void) { - unsigned long flags; - unsigned long sec, usec, seq; - unsigned long delta_cycles, delta_usec, partial_tick; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - - delta_cycles = rpcc() - state.last_time; - sec = xtime.tv_sec; - usec = (xtime.tv_nsec / 1000); - partial_tick = state.partial_tick; - - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - #ifdef CONFIG_SMP /* Until and unless we figure out how to get cpu cycle counters in sync and keep them there, we can't use the rpcc tricks. */ - delta_usec = 0; + return 0; #else + unsigned long delta_cycles, delta_usec, partial_tick; + + delta_cycles = rpcc() - state.last_time; + partial_tick = state.partial_tick; /* * usec = cycles * ticks_per_cycle * 2**48 * 1e6 / (2**48 * ticks) * = cycles * (s_t_p_c) * 1e6 / (2**48 * ticks) @@ -446,64 +435,10 @@ do_gettimeofday(struct timeval *tv) delta_usec = (delta_cycles * state.scaled_ticks_per_cycle + partial_tick) * 15625; delta_usec = ((delta_usec / ((1UL << (FIX_SHIFT-6-1)) * HZ)) + 1) / 2; + return delta_usec * 1000; #endif - - usec += delta_usec; - if (usec >= 1000000) { - sec += 1; - usec -= 1000000; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; } -EXPORT_SYMBOL(do_gettimeofday); - -int -do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - unsigned long delta_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - /* The offset that is added into time in do_gettimeofday above - must be subtracted out here to keep a coherent view of the - time. Without this, a full-tick error is possible. */ - -#ifdef CONFIG_SMP - delta_nsec = 0; -#else - delta_nsec = rpcc() - state.last_time; - delta_nsec = (delta_nsec * state.scaled_ticks_per_cycle - + state.partial_tick) * 15625; - delta_nsec = ((delta_nsec / ((1UL << (FIX_SHIFT-6-1)) * HZ)) + 1) / 2; - delta_nsec *= 1000; -#endif - - nsec -= delta_nsec; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - - /* * In order to set the CMOS clock precisely, set_rtc_mmss has to be * called 500 ms after the second nowtime has started, because when diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index af71d38c8e4..a0902c20d67 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -299,7 +299,7 @@ printk_memory_info(void) initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; printk("Memory: %luk/%luk available (%luk kernel code, %luk reserved, %luk data, %luk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), max_mapnr << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 0eab5574942..10b403554b6 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -349,7 +349,7 @@ void __init mem_init(void) printk("Memory: %luk/%luk available (%luk kernel code, %luk reserved, " "%luk data, %luk init)\n", - (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 7350557a81e..54661125a8b 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -25,7 +25,7 @@ KBUILD_CFLAGS +=$(call cc-option,-marm,) # Select a platform tht is kept up-to-date KBUILD_DEFCONFIG := versatile_defconfig -# defines filename extension depending memory manement type. +# defines filename extension depending memory management type. ifeq ($(CONFIG_MMU),) MMUEXT := -nommu endif diff --git a/arch/arm/include/asm/mman.h b/arch/arm/include/asm/mman.h index fc26976d8e3..8eebf89f5ab 100644 --- a/arch/arm/include/asm/mman.h +++ b/arch/arm/include/asm/mman.h @@ -1,17 +1 @@ -#ifndef __ARM_MMAN_H__ -#define __ARM_MMAN_H__ - -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) page tables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __ARM_MMAN_H__ */ +#include <asm-generic/mman.h> diff --git a/arch/arm/mach-at91/board-afeb-9260v1.c b/arch/arm/mach-at91/board-afeb-9260v1.c index 61e52b66bc7..50667bed7cc 100644 --- a/arch/arm/mach-at91/board-afeb-9260v1.c +++ b/arch/arm/mach-at91/board-afeb-9260v1.c @@ -53,7 +53,7 @@ static void __init afeb9260_map_io(void) /* Initialize processor: 18.432 MHz crystal */ at91sam9260_initialize(18432000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* USART0 on ttyS1. (Rx, Tx, CTS, RTS, DTR, DSR, DCD, RI) */ diff --git a/arch/arm/mach-at91/board-cam60.c b/arch/arm/mach-at91/board-cam60.c index d3ba29c5d8c..02138af631e 100644 --- a/arch/arm/mach-at91/board-cam60.c +++ b/arch/arm/mach-at91/board-cam60.c @@ -50,7 +50,7 @@ static void __init cam60_map_io(void) /* Initialize processor: 10 MHz crystal */ at91sam9260_initialize(10000000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* set serial console to ttyS0 (ie, DBGU) */ diff --git a/arch/arm/mach-at91/board-neocore926.c b/arch/arm/mach-at91/board-neocore926.c index 9ba7ba2cc3b..8c0b71c95be 100644 --- a/arch/arm/mach-at91/board-neocore926.c +++ b/arch/arm/mach-at91/board-neocore926.c @@ -56,7 +56,7 @@ static void __init neocore926_map_io(void) /* Initialize processor: 20 MHz crystal */ at91sam9263_initialize(20000000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* USART0 on ttyS1. (Rx, Tx, RTS, CTS) */ diff --git a/arch/arm/mach-at91/board-qil-a9260.c b/arch/arm/mach-at91/board-qil-a9260.c index 4cff9a7e61d..664938e8f66 100644 --- a/arch/arm/mach-at91/board-qil-a9260.c +++ b/arch/arm/mach-at91/board-qil-a9260.c @@ -53,7 +53,7 @@ static void __init ek_map_io(void) /* Initialize processor: 12.000 MHz crystal */ at91sam9260_initialize(12000000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* USART0 on ttyS1. (Rx, Tx, CTS, RTS, DTR, DSR, DCD, RI) */ diff --git a/arch/arm/mach-at91/board-sam9260ek.c b/arch/arm/mach-at91/board-sam9260ek.c index 93a0f8b100e..ba9d501b5c5 100644 --- a/arch/arm/mach-at91/board-sam9260ek.c +++ b/arch/arm/mach-at91/board-sam9260ek.c @@ -54,7 +54,7 @@ static void __init ek_map_io(void) /* Initialize processor: 18.432 MHz crystal */ at91sam9260_initialize(18432000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* USART0 on ttyS1. (Rx, Tx, CTS, RTS, DTR, DSR, DCD, RI) */ diff --git a/arch/arm/mach-at91/board-sam9261ek.c b/arch/arm/mach-at91/board-sam9261ek.c index f9b19993a7a..c4c8865d52d 100644 --- a/arch/arm/mach-at91/board-sam9261ek.c +++ b/arch/arm/mach-at91/board-sam9261ek.c @@ -61,7 +61,7 @@ static void __init ek_map_io(void) /* Setup the LEDs */ at91_init_leds(AT91_PIN_PA13, AT91_PIN_PA14); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* set serial console to ttyS0 (ie, DBGU) */ diff --git a/arch/arm/mach-at91/board-sam9263ek.c b/arch/arm/mach-at91/board-sam9263ek.c index 1bf7bd4cbe1..26f1aa6049a 100644 --- a/arch/arm/mach-at91/board-sam9263ek.c +++ b/arch/arm/mach-at91/board-sam9263ek.c @@ -57,7 +57,7 @@ static void __init ek_map_io(void) /* Initialize processor: 16.367 MHz crystal */ at91sam9263_initialize(16367660); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* USART0 on ttyS1. (Rx, Tx, RTS, CTS) */ diff --git a/arch/arm/mach-at91/board-sam9g20ek.c b/arch/arm/mach-at91/board-sam9g20ek.c index ca470d504ea..29cf8317748 100644 --- a/arch/arm/mach-at91/board-sam9g20ek.c +++ b/arch/arm/mach-at91/board-sam9g20ek.c @@ -50,7 +50,7 @@ static void __init ek_map_io(void) /* Initialize processor: 18.432 MHz crystal */ at91sam9260_initialize(18432000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* USART0 on ttyS1. (Rx, Tx, CTS, RTS, DTR, DSR, DCD, RI) */ diff --git a/arch/arm/mach-at91/board-sam9rlek.c b/arch/arm/mach-at91/board-sam9rlek.c index 9d07679efce..94ffb5c103b 100644 --- a/arch/arm/mach-at91/board-sam9rlek.c +++ b/arch/arm/mach-at91/board-sam9rlek.c @@ -43,7 +43,7 @@ static void __init ek_map_io(void) /* Initialize processor: 12.000 MHz crystal */ at91sam9rl_initialize(12000000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* USART0 on ttyS1. (Rx, Tx, CTS, RTS) */ diff --git a/arch/arm/mach-at91/board-usb-a9260.c b/arch/arm/mach-at91/board-usb-a9260.c index d13304c0bc4..905d6ef7680 100644 --- a/arch/arm/mach-at91/board-usb-a9260.c +++ b/arch/arm/mach-at91/board-usb-a9260.c @@ -53,7 +53,7 @@ static void __init ek_map_io(void) /* Initialize processor: 12.000 MHz crystal */ at91sam9260_initialize(12000000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* set serial console to ttyS0 (ie, DBGU) */ diff --git a/arch/arm/mach-at91/board-usb-a9263.c b/arch/arm/mach-at91/board-usb-a9263.c index d96405b7d57..b6a3480383e 100644 --- a/arch/arm/mach-at91/board-usb-a9263.c +++ b/arch/arm/mach-at91/board-usb-a9263.c @@ -52,7 +52,7 @@ static void __init ek_map_io(void) /* Initialize processor: 12.00 MHz crystal */ at91sam9263_initialize(12000000); - /* DGBU on ttyS0. (Rx & Tx only) */ + /* DBGU on ttyS0. (Rx & Tx only) */ at91_register_uart(0, 0, 0); /* set serial console to ttyS0 (ie, DBGU) */ diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index ea36186f32c..f982606d7bf 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -596,8 +596,8 @@ void __init mem_init(void) printk(KERN_NOTICE "Memory: %luKB available (%dK code, " "%dK data, %dK init, %luK highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - codesize >> 10, datasize >> 10, initsize >> 10, + nr_free_pages() << (PAGE_SHIFT-10), codesize >> 10, + datasize >> 10, initsize >> 10, (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))); if (PAGE_SIZE >= 16384 && num_physpages <= 128) { diff --git a/arch/avr32/include/asm/mman.h b/arch/avr32/include/asm/mman.h index 9a92b15f6a6..8eebf89f5ab 100644 --- a/arch/avr32/include/asm/mman.h +++ b/arch/avr32/include/asm/mman.h @@ -1,17 +1 @@ -#ifndef __ASM_AVR32_MMAN_H__ -#define __ASM_AVR32_MMAN_H__ - -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) page tables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __ASM_AVR32_MMAN_H__ */ +#include <asm-generic/mman.h> diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c index e819fa69a90..376f18c4a6c 100644 --- a/arch/avr32/mm/init.c +++ b/arch/avr32/mm/init.c @@ -141,7 +141,7 @@ void __init mem_init(void) printk ("Memory: %luk/%luk available (%dk kernel code, " "%dk reserved, %dk data, %dk init)\n", - (unsigned long)nr_free_pages() << (PAGE_SHIFT - 10), + nr_free_pages() << (PAGE_SHIFT - 10), totalram_pages << (PAGE_SHIFT - 10), codesize >> 10, reservedpages << (PAGE_SHIFT - 10), diff --git a/arch/blackfin/mach-bf538/include/mach/defBF539.h b/arch/blackfin/mach-bf538/include/mach/defBF539.h index bdc330cd0e1..1c58914a874 100644 --- a/arch/blackfin/mach-bf538/include/mach/defBF539.h +++ b/arch/blackfin/mach-bf538/include/mach/defBF539.h @@ -2325,7 +2325,7 @@ #define AMBEN_B0_B1 0x0004 /* Enable Asynchronous Memory Banks 0 & 1 only */ #define AMBEN_B0_B1_B2 0x0006 /* Enable Asynchronous Memory Banks 0, 1, and 2 */ #define AMBEN_ALL 0x0008 /* Enable Asynchronous Memory Banks (all) 0, 1, 2, and 3 */ -#define CDPRIO 0x0100 /* DMA has priority over core for for external accesses */ +#define CDPRIO 0x0100 /* DMA has priority over core for external accesses */ /* EBIU_AMGCTL Bit Positions */ #define AMCKEN_P 0x0000 /* Enable CLKOUT */ diff --git a/arch/cris/include/asm/mman.h b/arch/cris/include/asm/mman.h index b7f0afba3ce..8eebf89f5ab 100644 --- a/arch/cris/include/asm/mman.h +++ b/arch/cris/include/asm/mman.h @@ -1,19 +1 @@ -#ifndef __CRIS_MMAN_H__ -#define __CRIS_MMAN_H__ - -/* verbatim copy of asm-i386/ version */ - -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __CRIS_MMAN_H__ */ +#include <asm-generic/mman.h> diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 514f46a4b23..ff68b9f516a 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -54,7 +54,7 @@ mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, " "%dk init)\n" , - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), max_mapnr << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/frv/include/asm/mman.h b/arch/frv/include/asm/mman.h index 58c1d11e2ac..8eebf89f5ab 100644 --- a/arch/frv/include/asm/mman.h +++ b/arch/frv/include/asm/mman.h @@ -1,18 +1 @@ -#ifndef __ASM_MMAN_H__ -#define __ASM_MMAN_H__ - -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __ASM_MMAN_H__ */ - +#include <asm-generic/mman.h> diff --git a/arch/frv/lib/cache.S b/arch/frv/lib/cache.S index 0e10ad8dc46..0c4fb204911 100644 --- a/arch/frv/lib/cache.S +++ b/arch/frv/lib/cache.S @@ -1,4 +1,4 @@ -/* cache.S: cache managment routines +/* cache.S: cache management routines * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) diff --git a/arch/h8300/include/asm/hardirq.h b/arch/h8300/include/asm/hardirq.h index 9d7f7a7462b..c2e1aa0f0d1 100644 --- a/arch/h8300/include/asm/hardirq.h +++ b/arch/h8300/include/asm/hardirq.h @@ -1,18 +1,7 @@ #ifndef __H8300_HARDIRQ_H #define __H8300_HARDIRQ_H -#include <linux/kernel.h> -#include <linux/threads.h> -#include <linux/interrupt.h> -#include <linux/irq.h> - -typedef struct { - unsigned int __softirq_pending; -} ____cacheline_aligned irq_cpustat_t; - -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ - -extern void ack_bad_irq(unsigned int irq); +#include <asm/irq.h> #define HARDIRQ_BITS 8 @@ -25,4 +14,6 @@ extern void ack_bad_irq(unsigned int irq); # error HARDIRQ_BITS is too low! #endif +#include <asm-generic/hardirq.h> + #endif diff --git a/arch/h8300/include/asm/mman.h b/arch/h8300/include/asm/mman.h index cf35f0a6f12..8eebf89f5ab 100644 --- a/arch/h8300/include/asm/mman.h +++ b/arch/h8300/include/asm/mman.h @@ -1,17 +1 @@ -#ifndef __H8300_MMAN_H__ -#define __H8300_MMAN_H__ - -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __H8300_MMAN_H__ */ +#include <asm-generic/mman.h> diff --git a/arch/h8300/kernel/irq.c b/arch/h8300/kernel/irq.c index 74f8dd7b34d..5c913d47211 100644 --- a/arch/h8300/kernel/irq.c +++ b/arch/h8300/kernel/irq.c @@ -81,11 +81,6 @@ struct irq_chip h8300irq_chip = { .end = h8300_end_irq, }; -void ack_bad_irq(unsigned int irq) -{ - printk("unexpected IRQ trap at vector %02x\n", irq); -} - #if defined(CONFIG_RAMKERNEL) static unsigned long __init *get_vector_address(void) { diff --git a/arch/h8300/kernel/timer/tpu.c b/arch/h8300/kernel/timer/tpu.c index e7c6e614a75..2193a2e2859 100644 --- a/arch/h8300/kernel/timer/tpu.c +++ b/arch/h8300/kernel/timer/tpu.c @@ -7,7 +7,6 @@ * */ -#include <linux/config.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/kernel.h> diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 16ef61a91d9..625ed8f76fc 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1270,7 +1270,7 @@ putreg (struct task_struct *child, int regno, unsigned int value) case PT_CS: if (value != __USER_CS) printk(KERN_ERR - "ia32.putreg: attempt to to set invalid segment register %d = %x\n", + "ia32.putreg: attempt to set invalid segment register %d = %x\n", regno, value); break; default: diff --git a/arch/ia64/include/asm/mman.h b/arch/ia64/include/asm/mman.h index 48cf8b98a0b..4459028e5aa 100644 --- a/arch/ia64/include/asm/mman.h +++ b/arch/ia64/include/asm/mman.h @@ -8,19 +8,9 @@ * David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co */ -#include <asm-generic/mman-common.h> +#include <asm-generic/mman.h> -#define MAP_GROWSDOWN 0x00100 /* stack-like segment */ -#define MAP_GROWSUP 0x00200 /* register stack-like segment */ -#define MAP_DENYWRITE 0x00800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x01000 /* mark it as an executable */ -#define MAP_LOCKED 0x02000 /* pages are locked */ -#define MAP_NORESERVE 0x04000 /* don't check for reservations */ -#define MAP_POPULATE 0x08000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ +#define MAP_GROWSUP 0x0200 /* register stack-like segment */ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b115b3bbf04..1d286244a56 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -655,7 +655,7 @@ mem_init (void) initsize = (unsigned long) __init_end - (unsigned long) __init_begin; printk(KERN_INFO "Memory: %luk/%luk available (%luk code, %luk reserved, " - "%luk data, %luk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10), + "%luk data, %luk init)\n", nr_free_pages() << (PAGE_SHIFT - 10), num_physpages << (PAGE_SHIFT - 10), codesize >> 10, reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10); diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index cabba332cc4..c41234f1b82 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -41,6 +41,12 @@ config HZ int default 100 +config GENERIC_TIME + def_bool y + +config ARCH_USES_GETTIMEOFFSET + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/m32r/include/asm/hardirq.h b/arch/m32r/include/asm/hardirq.h index cb8aa762f23..4c31c0ae215 100644 --- a/arch/m32r/include/asm/hardirq.h +++ b/arch/m32r/include/asm/hardirq.h @@ -2,14 +2,7 @@ #ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H -#include <linux/threads.h> -#include <linux/irq.h> - -typedef struct { - unsigned int __softirq_pending; -} ____cacheline_aligned irq_cpustat_t; - -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ +#include <asm/irq.h> #if NR_IRQS > 256 #define HARDIRQ_BITS 9 @@ -26,11 +19,7 @@ typedef struct { # error HARDIRQ_BITS is too low! #endif -static inline void ack_bad_irq(int irq) -{ - printk(KERN_CRIT "unexpected IRQ trap at vector %02x\n", irq); - BUG(); -} +#include <asm-generic/hardirq.h> #endif /* __ASM_HARDIRQ_H */ #endif /* __KERNEL__ */ diff --git a/arch/m32r/include/asm/mman.h b/arch/m32r/include/asm/mman.h index 04a5f40aa40..8eebf89f5ab 100644 --- a/arch/m32r/include/asm/mman.h +++ b/arch/m32r/include/asm/mman.h @@ -1,17 +1 @@ -#ifndef __M32R_MMAN_H__ -#define __M32R_MMAN_H__ - -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __M32R_MMAN_H__ */ +#include <asm-generic/mman.h> diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index 98b8feb12ed..98682bba0ed 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -77,7 +77,7 @@ static int ptrace_read_user(struct task_struct *tsk, unsigned long off, struct user * dummy = NULL; #endif - if ((off & 3) || (off < 0) || (off > sizeof(struct user) - 3)) + if ((off & 3) || off > sizeof(struct user) - 3) return -EIO; off >>= 2; @@ -139,8 +139,7 @@ static int ptrace_write_user(struct task_struct *tsk, unsigned long off, struct user * dummy = NULL; #endif - if ((off & 3) || off < 0 || - off > sizeof(struct user) - 3) + if ((off & 3) || off > sizeof(struct user) - 3) return -EIO; off >>= 2; diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index 2547d6c4a82..655ea1c47a0 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -213,7 +213,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (!physid_isset(phys_id, phys_cpu_present_map)) continue; - if ((max_cpus >= 0) && (max_cpus <= cpucount + 1)) + if (max_cpus <= cpucount + 1) continue; do_boot_cpu(phys_id); diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c index cada3ba4b99..ba61c4c7320 100644 --- a/arch/m32r/kernel/time.c +++ b/arch/m32r/kernel/time.c @@ -48,7 +48,7 @@ extern void smp_local_timer_interrupt(void); static unsigned long latch; -static unsigned long do_gettimeoffset(void) +u32 arch_gettimeoffset(void) { unsigned long elapsed_time = 0; /* [us] */ @@ -93,79 +93,10 @@ static unsigned long do_gettimeoffset(void) #error no chip configuration #endif - return elapsed_time; + return elapsed_time * 1000; } /* - * This version of gettimeofday has near microsecond resolution. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick = tick_usec - tickadj; - - do { - seq = read_seqbegin(&xtime_lock); - - usec = do_gettimeoffset(); - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) - usec = min(usec, max_ntp_tick); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= do_gettimeoffset() * NSEC_PER_USEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - -/* * In order to set the CMOS clock precisely, set_rtc_mmss has to be * called 500 ms after the second nowtime has started, because when * nowtime is written into the registers of the CMOS clock, it will @@ -192,6 +123,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) #ifndef CONFIG_SMP profile_tick(CPU_PROFILING); #endif + /* XXX FIXME. Uh, the xtime_lock should be held here, no? */ do_timer(1); #ifndef CONFIG_SMP diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index 24d429f9358..9f581df3952 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -171,7 +171,7 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " "%dk reserved, %dk data, %dk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index fb87c08c6b5..29dd8489ffe 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -58,6 +58,12 @@ config HZ int default 100 +config GENERIC_TIME + def_bool y + +config ARCH_USES_GETTIMEOFFSET + def_bool y + mainmenu "Linux/68k Kernel Configuration" source "init/Kconfig" diff --git a/arch/m68k/include/asm/hardirq_mm.h b/arch/m68k/include/asm/hardirq_mm.h index 394ee946015..554f65b6cd3 100644 --- a/arch/m68k/include/asm/hardirq_mm.h +++ b/arch/m68k/include/asm/hardirq_mm.h @@ -1,16 +1,8 @@ #ifndef __M68K_HARDIRQ_H #define __M68K_HARDIRQ_H -#include <linux/threads.h> -#include <linux/cache.h> - -/* entry.S is sensitive to the offsets of these fields */ -typedef struct { - unsigned int __softirq_pending; -} ____cacheline_aligned irq_cpustat_t; - -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ - #define HARDIRQ_BITS 8 +#include <asm-generic/hardirq.h> + #endif diff --git a/arch/m68k/include/asm/mman.h b/arch/m68k/include/asm/mman.h index 9f5c4c4b3c7..8eebf89f5ab 100644 --- a/arch/m68k/include/asm/mman.h +++ b/arch/m68k/include/asm/mman.h @@ -1,17 +1 @@ -#ifndef __M68K_MMAN_H__ -#define __M68K_MMAN_H__ - -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __M68K_MMAN_H__ */ +#include <asm-generic/mman.h> diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index 54d980795fc..17dc2a31a7c 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -91,77 +91,11 @@ void __init time_init(void) mach_sched_init(timer_interrupt); } -/* - * This version of gettimeofday has near microsecond resolution. - */ -void do_gettimeofday(struct timeval *tv) +u32 arch_gettimeoffset(void) { - unsigned long flags; - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick = tick_usec - tickadj; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - - usec = mach_gettimeoffset(); - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) - usec = min(usec, max_ntp_tick); - - sec = xtime.tv_sec; - usec += xtime.tv_nsec/1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* This is revolting. We need to set the xtime.tv_nsec - * correctly. However, the value in this location is - * is value at the last tick. - * Discover what correction gettimeofday - * would have done, and then undo it! - */ - nsec -= 1000 * mach_gettimeoffset(); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; + return mach_gettimeoffset() * 1000; } -EXPORT_SYMBOL(do_settimeofday); - - static int __init rtc_init(void) { struct platform_device *pdev; diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 0007b2adf3a..774549accd2 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -126,7 +126,7 @@ void __init mem_init(void) #endif printk("Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init)\n", - (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), totalram_pages << (PAGE_SHIFT-10), codepages << (PAGE_SHIFT-10), datapages << (PAGE_SHIFT-10), diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index f207f1a94db..1110784eb3f 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -204,7 +204,7 @@ void __init mem_init(void) totalram_pages += free_all_bootmem(); printk(KERN_INFO "Memory: %luk/%luk available\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10)); #ifdef CONFIG_MMU mem_init_done = 1; diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h index e4d6f1fb1cf..a2250f390a2 100644 --- a/arch/mips/include/asm/mman.h +++ b/arch/mips/include/asm/mman.h @@ -46,6 +46,8 @@ #define MAP_LOCKED 0x8000 /* pages are locked */ #define MAP_POPULATE 0x10000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ +#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x80000 /* create a huge page mapping */ /* * Flags for msync @@ -71,6 +73,9 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ +#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 1a9f9b25755..d6eb6134abe 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -76,6 +76,16 @@ extern unsigned long zero_page_mask; #define ZERO_PAGE(vaddr) \ (virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask)))) +#define is_zero_pfn is_zero_pfn +static inline int is_zero_pfn(unsigned long pfn) +{ + extern unsigned long zero_pfn; + unsigned long offset_from_zero_pfn = pfn - zero_pfn; + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); +} + +#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) + extern void paging_init(void); /* diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 38c79c55b06..1f4ee4797a6 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -417,7 +417,7 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " "%ldk reserved, %ldk data, %ldk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), ram << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/mn10300/include/asm/cacheflush.h b/arch/mn10300/include/asm/cacheflush.h index 2db746a251f..1a55d61f0d0 100644 --- a/arch/mn10300/include/asm/cacheflush.h +++ b/arch/mn10300/include/asm/cacheflush.h @@ -17,7 +17,7 @@ #include <linux/mm.h> /* - * virtually-indexed cache managment (our cache is physically indexed) + * virtually-indexed cache management (our cache is physically indexed) */ #define flush_cache_all() do {} while (0) #define flush_cache_mm(mm) do {} while (0) @@ -31,7 +31,7 @@ #define flush_dcache_mmap_unlock(mapping) do {} while (0) /* - * physically-indexed cache managment + * physically-indexed cache management */ #ifndef CONFIG_MN10300_CACHE_DISABLED diff --git a/arch/mn10300/include/asm/mman.h b/arch/mn10300/include/asm/mman.h index d04fac1da5a..8eebf89f5ab 100644 --- a/arch/mn10300/include/asm/mman.h +++ b/arch/mn10300/include/asm/mman.h @@ -1,28 +1 @@ -/* MN10300 Constants for mmap and co. - * - * Copyright (C) 2007 Matsushita Electric Industrial Co., Ltd. - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * - Derived from asm-x86/mman.h - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#ifndef _ASM_MMAN_H -#define _ASM_MMAN_H - -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* _ASM_MMAN_H */ +#include <asm-generic/mman.h> diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c index 8cee387a24f..ec1420562dc 100644 --- a/arch/mn10300/mm/init.c +++ b/arch/mn10300/mm/init.c @@ -112,7 +112,7 @@ void __init mem_init(void) "Memory: %luk/%luk available" " (%dk kernel code, %dk reserved, %dk data, %dk init," " %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10), + nr_free_pages() << (PAGE_SHIFT - 10), max_mapnr << (PAGE_SHIFT - 10), codesize >> 10, reservedpages << (PAGE_SHIFT - 10), diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h index defe752cc99..9749c8afe83 100644 --- a/arch/parisc/include/asm/mman.h +++ b/arch/parisc/include/asm/mman.h @@ -22,6 +22,8 @@ #define MAP_GROWSDOWN 0x8000 /* stack-like segment */ #define MAP_POPULATE 0x10000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ +#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x80000 /* create a huge page mapping */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ @@ -54,6 +56,9 @@ #define MADV_16M_PAGES 24 /* Use 16 Megabyte pages */ #define MADV_64M_PAGES 26 /* Use 64 Megabyte pages */ +#define MADV_MERGEABLE 65 /* KSM may merge identical pages */ +#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index b0831d9e35c..d5aca31fddb 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -506,7 +506,7 @@ void __init mem_init(void) #endif printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n", - (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index 7b1c49811a2..d4a7f645c5d 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -25,6 +25,8 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #ifdef __KERNEL__ #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c index acb74a17bbb..b4b167b3364 100644 --- a/arch/powerpc/kernel/udbg_16550.c +++ b/arch/powerpc/kernel/udbg_16550.c @@ -1,5 +1,5 @@ /* - * udbg for for NS16550 compatable serial ports + * udbg for NS16550 compatable serial ports * * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp * diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 579382c163a..0e5c59b995e 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -372,7 +372,7 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " "%luk reserved, %luk data, %luk bss, %luk init)\n", - (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 24b30b6909c..fc1b1c42b1d 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -119,7 +119,7 @@ spufs_new_file(struct super_block *sb, struct dentry *dentry, const struct file_operations *fops, int mode, size_t size, struct spu_context *ctx) { - static struct inode_operations spufs_file_iops = { + static const struct inode_operations spufs_file_iops = { .setattr = spufs_setattr, }; struct inode *inode; @@ -773,7 +773,7 @@ static int spufs_fill_super(struct super_block *sb, void *data, int silent) { struct spufs_sb_info *info; - static struct super_operations s_ops = { + static const struct super_operations s_ops = { .alloc_inode = spufs_alloc_inode, .destroy_inode = spufs_destroy_inode, .statfs = simple_statfs, diff --git a/arch/powerpc/platforms/powermac/udbg_scc.c b/arch/powerpc/platforms/powermac/udbg_scc.c index 572771fd846..9490157da62 100644 --- a/arch/powerpc/platforms/powermac/udbg_scc.c +++ b/arch/powerpc/platforms/powermac/udbg_scc.c @@ -1,5 +1,5 @@ /* - * udbg for for zilog scc ports as found on Apple PowerMacs + * udbg for zilog scc ports as found on Apple PowerMacs * * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp * diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index a4779912a5c..88f4ae78783 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -165,7 +165,7 @@ axon_ram_direct_access(struct block_device *device, sector_t sector, return 0; } -static struct block_device_operations axon_ram_devops = { +static const struct block_device_operations axon_ram_devops = { .owner = THIS_MODULE, .direct_access = axon_ram_direct_access }; diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index bd9914b8948..341aff2687a 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -41,7 +41,7 @@ struct hypfs_sb_info { static const struct file_operations hypfs_file_ops; static struct file_system_type hypfs_type; -static struct super_operations hypfs_s_ops; +static const struct super_operations hypfs_s_ops; /* start of list of all dentries, which have to be deleted on update */ static struct dentry *hypfs_last_dentry; @@ -472,7 +472,7 @@ static struct file_system_type hypfs_type = { .kill_sb = hypfs_kill_super }; -static struct super_operations hypfs_s_ops = { +static const struct super_operations hypfs_s_ops = { .statfs = simple_statfs, .drop_inode = hypfs_drop_inode, .show_options = hypfs_show_options, @@ -496,7 +496,7 @@ static int __init hypfs_init(void) } s390_kobj = kobject_create_and_add("s390", hypervisor_kobj); if (!s390_kobj) { - rc = -ENOMEM;; + rc = -ENOMEM; goto fail_sysfs; } rc = register_filesystem(&hypfs_type); diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h index f63fe7b431e..4e9c8ae0a63 100644 --- a/arch/s390/include/asm/mman.h +++ b/arch/s390/include/asm/mman.h @@ -9,18 +9,7 @@ #ifndef __S390_MMAN_H__ #define __S390_MMAN_H__ -#include <asm-generic/mman-common.h> - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ +#include <asm-generic/mman.h> #if defined(__KERNEL__) && !defined(__ASSEMBLY__) && defined(CONFIG_64BIT) int s390_mmap_check(unsigned long addr, unsigned long len); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2c2f9835341..43486c2408e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -478,7 +478,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) if (!inti) return -ENOMEM; - inti->type = KVM_S390_PROGRAM_INT;; + inti->type = KVM_S390_PROGRAM_INT; inti->pgm.code = code; VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index c634dfbe92e..76564795222 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -105,7 +105,7 @@ void __init mem_init(void) datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), max_mapnr << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index edc842ff61e..fabb7c6f48d 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -232,7 +232,7 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " "%dk data, %dk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, datasize >> 10, diff --git a/arch/sparc/include/asm/mman.h b/arch/sparc/include/asm/mman.h index 988192e8e95..c3029ad6619 100644 --- a/arch/sparc/include/asm/mman.h +++ b/arch/sparc/include/asm/mman.h @@ -20,6 +20,8 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 8daab33fc17..8ab1d4728a4 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -229,7 +229,7 @@ static unsigned int sun4u_compute_tid(unsigned long imap, unsigned long cpuid) tid = ((a << IMAP_AID_SHIFT) | (n << IMAP_NID_SHIFT)); tid &= (IMAP_AID_SAFARI | - IMAP_NID_SAFARI);; + IMAP_NID_SAFARI); } } else { tid = cpuid << IMAP_TID_SHIFT; diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 54114ad0bde..dc7c3b17a15 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -472,7 +472,7 @@ void __init mem_init(void) reservedpages++; printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT - 10), codepages << (PAGE_SHIFT-10), reservedpages << (PAGE_SHIFT - 10), diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index f114813ae25..a74245ae3a8 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -533,7 +533,7 @@ static int eth_parse(char *str, int *index_out, char **str_out, char **error_out) { char *end; - int n, err = -EINVAL;; + int n, err = -EINVAL; n = simple_strtoul(str, &end, 0); if (end == str) { diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 8f05d4d9da1..635d16d90a8 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -106,7 +106,7 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); #define MAX_DEV (16) -static struct block_device_operations ubd_blops = { +static const struct block_device_operations ubd_blops = { .owner = THIS_MODULE, .open = ubd_open, .release = ubd_release, diff --git a/arch/um/include/asm/hardirq.h b/arch/um/include/asm/hardirq.h index 313ebb8a256..fb3c05a0cbb 100644 --- a/arch/um/include/asm/hardirq.h +++ b/arch/um/include/asm/hardirq.h @@ -1,25 +1 @@ -/* (c) 2004 cw@f00f.org, GPLv2 blah blah */ - -#ifndef __ASM_UM_HARDIRQ_H -#define __ASM_UM_HARDIRQ_H - -#include <linux/threads.h> -#include <linux/irq.h> - -/* NOTE: When SMP works again we might want to make this - * ____cacheline_aligned or maybe use per_cpu state? --cw */ -typedef struct { - unsigned int __softirq_pending; -} irq_cpustat_t; - -#include <linux/irq_cpustat.h> - -/* As this would be very strange for UML to get we BUG() after the - * printk. */ -static inline void ack_bad_irq(unsigned int irq) -{ - printk(KERN_ERR "unexpected IRQ %02x\n", irq); - BUG(); -} - -#endif /* __ASM_UM_HARDIRQ_H */ +#include <asm-generic/hardirq.h> diff --git a/arch/um/include/shared/ptrace_user.h b/arch/um/include/shared/ptrace_user.h index 4bce6e01288..7fd8539bc19 100644 --- a/arch/um/include/shared/ptrace_user.h +++ b/arch/um/include/shared/ptrace_user.h @@ -29,7 +29,7 @@ extern int ptrace_setregs(long pid, unsigned long *regs_in); * recompilation. So, we use PTRACE_OLDSETOPTIONS in UML. * We also want to be able to build the kernel on 2.4, which doesn't * have PTRACE_OLDSETOPTIONS. So, if it is missing, we declare - * PTRACE_OLDSETOPTIONS to to be the same as PTRACE_SETOPTIONS. + * PTRACE_OLDSETOPTIONS to be the same as PTRACE_SETOPTIONS. * * On architectures, that start to support PTRACE_O_TRACESYSGOOD on * linux 2.6, PTRACE_OLDSETOPTIONS never is defined, and also isn't diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 61d7e6138ff..a5d5e70cf6f 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -77,7 +77,7 @@ void __init mem_init(void) num_physpages = totalram_pages; max_pfn = totalram_pages; printk(KERN_INFO "Memory: %luk available\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10)); + nr_free_pages() << (PAGE_SHIFT-10)); kmalloc_ok = 1; #ifdef CONFIG_HIGHMEM diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 0cd9a7a05e7..8bfd1e90581 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -38,10 +38,10 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, *pte = pte_mkread(*pte); return 0; - out_pmd: - pud_free(mm, pud); out_pte: pmd_free(mm, pmd); + out_pmd: + pud_free(mm, pud); out: return -ENOMEM; } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f32fa71ccf9..c910a716a71 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) * approved Athlon */ WARN_ONCE(1, "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); + " processors is not suitable for SMP.\n"); if (!test_taint(TAINT_UNSAFE_SMP)) add_taint(TAINT_UNSAFE_SMP); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a3210ce1ecc..85419bb7d4a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 0db7969b0dd..378e9a8f1bf 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > num_physpages) { - pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); return ret; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3cd7711bb94..b49b4f67453 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); @@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) #endif if (!page_table) page_table = - (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + (pte_t *)alloc_bootmem_pages(PAGE_SIZE); } else page_table = (pte_t *)alloc_low_page(); @@ -892,7 +892,7 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " "%dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ea56b8cbb6a..810bd31e7f5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -687,7 +687,7 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), max_pfn << (PAGE_SHIFT-10), codesize >> 10, absent_pages << (PAGE_SHIFT-10), diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 528bf954eb7..8cc18334414 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - if (data->balance == 0) - return; - if (unlikely(data->balance != 1)) { kmemcheck_show_all(); kmemcheck_error_save_bug(regs); diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h index 9b92620c8a1..fca4db425f6 100644 --- a/arch/xtensa/include/asm/mman.h +++ b/arch/xtensa/include/asm/mman.h @@ -53,6 +53,8 @@ #define MAP_LOCKED 0x8000 /* pages are locked */ #define MAP_POPULATE 0x10000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ +#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x80000 /* create a huge page mapping */ /* * Flags for msync @@ -78,6 +80,9 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ +#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 427e14fa43c..cdbc27ca966 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -203,7 +203,7 @@ void __init mem_init(void) printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, " "%ldk data, %ldk init %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), ram << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c index 122c786449a..d0a7df2e5ca 100644 --- a/drivers/ata/pata_hpt37x.c +++ b/drivers/ata/pata_hpt37x.c @@ -624,7 +624,7 @@ static struct ata_port_operations hpt374_fn1_port_ops = { }; /** - * htp37x_clock_slot - Turn timing to PC clock entry + * hpt37x_clock_slot - Turn timing to PC clock entry * @freq: Reported frequency timing * @base: Base timing * diff --git a/drivers/base/node.c b/drivers/base/node.c index 91d4087b403..1fe5536d404 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -85,6 +85,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d FilePages: %8lu kB\n" "Node %d Mapped: %8lu kB\n" "Node %d AnonPages: %8lu kB\n" + "Node %d Shmem: %8lu kB\n" + "Node %d KernelStack: %8lu kB\n" "Node %d PageTables: %8lu kB\n" "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" @@ -116,6 +118,9 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), + nid, K(node_page_state(nid, NR_SHMEM)), + nid, node_page_state(nid, NR_KERNEL_STACK) * + THREAD_SIZE / 1024, nid, K(node_page_state(nid, NR_PAGETABLE)), nid, K(node_page_state(nid, NR_UNSTABLE_NFS)), nid, K(node_page_state(nid, NR_BOUNCE)), diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 1e6b7c14f69..c77b6f3c28e 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -152,7 +152,7 @@ static int DAC960_revalidate_disk(struct gendisk *disk) return 0; } -static struct block_device_operations DAC960_BlockDeviceOperations = { +static const struct block_device_operations DAC960_BlockDeviceOperations = { .owner = THIS_MODULE, .open = DAC960_open, .getgeo = DAC960_getgeo, @@ -6653,7 +6653,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, else ErrorCode = get_user(ControllerNumber, &UserSpaceControllerInfo->ControllerNumber); if (ErrorCode != 0) - break;; + break; ErrorCode = -ENXIO; if (ControllerNumber < 0 || ControllerNumber > DAC960_ControllerCount - 1) { @@ -6661,7 +6661,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, } Controller = DAC960_Controllers[ControllerNumber]; if (Controller == NULL) - break;; + break; memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); ControllerInfo.ControllerNumber = ControllerNumber; ControllerInfo.FirmwareType = Controller->FirmwareType; @@ -7210,7 +7210,7 @@ static struct pci_driver DAC960_pci_driver = { .remove = DAC960_Remove, }; -static int DAC960_init_module(void) +static int __init DAC960_init_module(void) { int ret; @@ -7222,7 +7222,7 @@ static int DAC960_init_module(void) return ret; } -static void DAC960_cleanup_module(void) +static void __exit DAC960_cleanup_module(void) { int i; diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 2f07b7c99a9..05522583902 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1632,7 +1632,7 @@ static int amiga_floppy_change(struct gendisk *disk) return 0; } -static struct block_device_operations floppy_fops = { +static const struct block_device_operations floppy_fops = { .owner = THIS_MODULE, .open = floppy_open, .release = floppy_release, diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b6cd571adbf..3af97d4da2d 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -237,7 +237,7 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static struct block_device_operations aoe_bdops = { +static const struct block_device_operations aoe_bdops = { .open = aoeblk_open, .release = aoeblk_release, .getgeo = aoeblk_getgeo, diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 3ff02941b3d..847a9e57570 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1856,7 +1856,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) return 0; } -static struct block_device_operations floppy_fops = { +static const struct block_device_operations floppy_fops = { .owner = THIS_MODULE, .open = floppy_open, .release = floppy_release, diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 4bf8705b3ac..4f688434daf 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -375,7 +375,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode, return error; } -static struct block_device_operations brd_fops = { +static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .locked_ioctl = brd_ioctl, #ifdef CONFIG_BLK_DEV_XIP diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index d8372b43282..4f19105f755 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -205,7 +205,7 @@ static int cciss_compat_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); #endif -static struct block_device_operations cciss_fops = { +static const struct block_device_operations cciss_fops = { .owner = THIS_MODULE, .open = cciss_open, .release = cciss_release, diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 44fa2018f6b..b82d438e260 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -193,7 +193,7 @@ static inline ctlr_info_t *get_host(struct gendisk *disk) } -static struct block_device_operations ida_fops = { +static const struct block_device_operations ida_fops = { .owner = THIS_MODULE, .open = ida_open, .release = ida_release, diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 2b387c2260d..5c01f747571 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3907,7 +3907,7 @@ static int floppy_revalidate(struct gendisk *disk) return res; } -static struct block_device_operations floppy_fops = { +static const struct block_device_operations floppy_fops = { .owner = THIS_MODULE, .open = floppy_open, .release = floppy_release, diff --git a/drivers/block/hd.c b/drivers/block/hd.c index f9d01608cbe..d5cdce08ffd 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -692,7 +692,7 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static struct block_device_operations hd_fops = { +static const struct block_device_operations hd_fops = { .getgeo = hd_getgeo, }; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bbb79441d89..edda9ea7c62 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1438,7 +1438,7 @@ out_unlocked: return 0; } -static struct block_device_operations lo_fops = { +static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, .release = lo_release, diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 6d7fbaa9224..e0339aaa181 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -775,7 +775,7 @@ static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static struct block_device_operations mg_disk_ops = { +static const struct block_device_operations mg_disk_ops = { .getgeo = mg_getgeo }; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 5d23ffad7c7..cc923a5b430 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -722,7 +722,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, return error; } -static struct block_device_operations nbd_fops = +static const struct block_device_operations nbd_fops = { .owner = THIS_MODULE, .locked_ioctl = nbd_ioctl, diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 13c1aee6aa3..a808b1530b3 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -125,7 +125,7 @@ static struct class *class_osdblk; /* /sys/class/osdblk */ static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ static LIST_HEAD(osdblkdev_list); -static struct block_device_operations osdblk_bd_ops = { +static const struct block_device_operations osdblk_bd_ops = { .owner = THIS_MODULE, }; diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 9f3518c515a..8866ca369d5 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -247,7 +247,7 @@ static int pcd_block_media_changed(struct gendisk *disk) return cdrom_media_changed(&cd->info); } -static struct block_device_operations pcd_bdops = { +static const struct block_device_operations pcd_bdops = { .owner = THIS_MODULE, .open = pcd_block_open, .release = pcd_block_release, diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index bf5955b3d87..569e39e8f11 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -807,7 +807,7 @@ static int pd_revalidate(struct gendisk *p) return 0; } -static struct block_device_operations pd_fops = { +static const struct block_device_operations pd_fops = { .owner = THIS_MODULE, .open = pd_open, .release = pd_release, diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 68a90834e99..ea54ea39355 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -262,7 +262,7 @@ static char *pf_buf; /* buffer for request in progress */ /* kernel glue structures */ -static struct block_device_operations pf_fops = { +static const struct block_device_operations pf_fops = { .owner = THIS_MODULE, .open = pf_open, .release = pf_release, diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index fd5bb8ad59a..2ddf03ae034 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2849,7 +2849,7 @@ static int pkt_media_changed(struct gendisk *disk) return attached_disk->fops->media_changed(attached_disk); } -static struct block_device_operations pktcdvd_ops = { +static const struct block_device_operations pktcdvd_ops = { .owner = THIS_MODULE, .open = pkt_open, .release = pkt_close, diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 34cbb7f3efa..03a130dca8a 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -82,7 +82,7 @@ enum lv1_ata_in_out { static int ps3disk_major; -static struct block_device_operations ps3disk_fops = { +static const struct block_device_operations ps3disk_fops = { .owner = THIS_MODULE, }; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index c8753a9ed29..3bb7c47c869 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -88,7 +88,7 @@ struct ps3vram_priv { static int ps3vram_major; -static struct block_device_operations ps3vram_fops = { +static const struct block_device_operations ps3vram_fops = { .owner = THIS_MODULE, }; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index cbfd9c0aef0..411f064760b 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -103,7 +103,7 @@ static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static struct block_device_operations vdc_fops = { +static const struct block_device_operations vdc_fops = { .owner = THIS_MODULE, .getgeo = vdc_getgeo, }; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index cf7877fb8a7..8f569e3df89 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -748,7 +748,7 @@ static int floppy_revalidate(struct gendisk *disk) return !fs->disk_in; } -static struct block_device_operations floppy_fops = { +static const struct block_device_operations floppy_fops = { .owner = THIS_MODULE, .open = floppy_open, .release = floppy_release, diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 80df93e3cdd..6380ad8d91b 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -998,7 +998,7 @@ static int floppy_revalidate(struct gendisk *disk) return ret; } -static struct block_device_operations floppy_fops = { +static const struct block_device_operations floppy_fops = { .open = floppy_open, .release = floppy_release, .locked_ioctl = floppy_ioctl, @@ -1062,7 +1062,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index) goto out_release; } fs->swim3_intr = macio_irq(mdev, 0); - fs->dma_intr = macio_irq(mdev, 1);; + fs->dma_intr = macio_irq(mdev, 1); fs->cur_cyl = -1; fs->cur_sector = -1; fs->secpercyl = 36; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index f5cd2e83ebc..a7c4184f4a6 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -423,7 +423,7 @@ static struct pci_driver carm_driver = { .remove = carm_remove_one, }; -static struct block_device_operations carm_bd_ops = { +static const struct block_device_operations carm_bd_ops = { .owner = THIS_MODULE, .getgeo = carm_bdev_getgeo, }; diff --git a/drivers/block/ub.c b/drivers/block/ub.c index cc54473b8e7..c739b203fe9 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -1789,7 +1789,7 @@ static int ub_bd_media_changed(struct gendisk *disk) return lun->changed; } -static struct block_device_operations ub_bd_fops = { +static const struct block_device_operations ub_bd_fops = { .owner = THIS_MODULE, .open = ub_bd_open, .release = ub_bd_release, diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 858c34dd032..ad1ba393801 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -140,7 +140,6 @@ struct cardinfo { }; static struct cardinfo cards[MM_MAXCARDS]; -static struct block_device_operations mm_fops; static struct timer_list battery_timer; static int num_cards; @@ -789,7 +788,7 @@ static int mm_check_change(struct gendisk *disk) return 0; } -static struct block_device_operations mm_fops = { +static const struct block_device_operations mm_fops = { .owner = THIS_MODULE, .getgeo = mm_getgeo, .revalidate_disk = mm_revalidate, diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index b441ce3832e..a8c8b56b275 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -219,7 +219,7 @@ static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /* * Our file operations table */ -static struct block_device_operations viodasd_fops = { +static const struct block_device_operations viodasd_fops = { .owner = THIS_MODULE, .open = viodasd_open, .release = viodasd_release, diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index aa1a3d5a3e2..aa89fe45237 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -243,7 +243,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } -static struct block_device_operations virtblk_fops = { +static const struct block_device_operations virtblk_fops = { .locked_ioctl = virtblk_ioctl, .owner = THIS_MODULE, .getgeo = virtblk_getgeo, diff --git a/drivers/block/xd.c b/drivers/block/xd.c index ce242921992..0877d3628fd 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -130,7 +130,7 @@ static struct gendisk *xd_gendisk[2]; static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo); -static struct block_device_operations xd_fops = { +static const struct block_device_operations xd_fops = { .owner = THIS_MODULE, .locked_ioctl = xd_ioctl, .getgeo = xd_getgeo, diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index e53284767f7..b8578bb3f4c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -65,7 +65,7 @@ struct blk_shadow { unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; -static struct block_device_operations xlvbd_block_fops; +static const struct block_device_operations xlvbd_block_fops; #define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) @@ -1039,7 +1039,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) return 0; } -static struct block_device_operations xlvbd_block_fops = +static const struct block_device_operations xlvbd_block_fops = { .owner = THIS_MODULE, .open = blkif_open, diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index b20abe102a2..e5c5415eb45 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -941,7 +941,7 @@ static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static struct block_device_operations ace_fops = { +static const struct block_device_operations ace_fops = { .owner = THIS_MODULE, .open = ace_open, .release = ace_release, diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index b2590409f25..64f941e0f14 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -64,7 +64,6 @@ static int current_device = -1; static DEFINE_SPINLOCK(z2ram_lock); -static struct block_device_operations z2_fops; static struct gendisk *z2ram_gendisk; static void do_z2_request(struct request_queue *q) @@ -315,7 +314,7 @@ z2_release(struct gendisk *disk, fmode_t mode) return 0; } -static struct block_device_operations z2_fops = +static const struct block_device_operations z2_fops = { .owner = THIS_MODULE, .open = z2_open, diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index b5621f27c4b..a762283d2a2 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -512,7 +512,7 @@ static int gdrom_bdops_ioctl(struct block_device *bdev, fmode_t mode, return cdrom_ioctl(gd.cd_info, bdev, mode, cmd, arg); } -static struct block_device_operations gdrom_bdops = { +static const struct block_device_operations gdrom_bdops = { .owner = THIS_MODULE, .open = gdrom_bdops_open, .release = gdrom_bdops_release, diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 0fff646cc2f..57ca69e0ac5 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -177,7 +177,7 @@ static int viocd_blk_media_changed(struct gendisk *disk) return cdrom_media_changed(&di->viocd_info); } -struct block_device_operations viocd_fops = { +static const struct block_device_operations viocd_fops = { .owner = THIS_MODULE, .open = viocd_blk_open, .release = viocd_blk_release, diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index ad87753f6de..a56ca080e10 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -114,9 +114,9 @@ static int agp_find_max(void) long memory, index, result; #if PAGE_SHIFT < 20 - memory = num_physpages >> (20 - PAGE_SHIFT); + memory = totalram_pages >> (20 - PAGE_SHIFT); #else - memory = num_physpages << (PAGE_SHIFT - 20); + memory = totalram_pages << (PAGE_SHIFT - 20); #endif index = 1; diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index 20ef1bf5e72..703959eba45 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -270,7 +270,7 @@ static void uninorth_agp_enable(struct agp_bridge_data *bridge, u32 mode) if ((uninorth_rev >= 0x30) && (uninorth_rev <= 0x33)) { /* - * We need to to set REQ_DEPTH to 7 for U3 versions 1.0, 2.1, + * We need to set REQ_DEPTH to 7 for U3 versions 1.0, 2.1, * 2.2 and 2.3, Darwin do so. */ if ((command >> AGPSTAT_RQ_DEPTH_SHIFT) > 7) diff --git a/drivers/char/epca.c b/drivers/char/epca.c index ff647ca1c48..9d589e3144d 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -2239,7 +2239,7 @@ static void do_softint(struct work_struct *work) struct channel *ch = container_of(work, struct channel, tqueue); /* Called in response to a modem change event */ if (ch && ch->magic == EPCA_MAGIC) { - struct tty_struct *tty = tty_port_tty_get(&ch->port);; + struct tty_struct *tty = tty_port_tty_get(&ch->port); if (tty && tty->driver_data) { if (test_and_clear_bit(EPCA_EVENT_HANGUP, &ch->event)) { diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c index a261bd735df..2e66b5f773d 100644 --- a/drivers/char/ipmi/ipmi_poweroff.c +++ b/drivers/char/ipmi/ipmi_poweroff.c @@ -691,7 +691,7 @@ static struct ctl_table_header *ipmi_table_header; /* * Startup and shutdown functions. */ -static int ipmi_poweroff_init(void) +static int __init ipmi_poweroff_init(void) { int rv; @@ -725,7 +725,7 @@ static int ipmi_poweroff_init(void) } #ifdef MODULE -static __exit void ipmi_poweroff_cleanup(void) +static void __exit ipmi_poweroff_cleanup(void) { int rv; diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c index 881934c068c..c250a31efa5 100644 --- a/drivers/char/pcmcia/cm4000_cs.c +++ b/drivers/char/pcmcia/cm4000_cs.c @@ -1017,7 +1017,7 @@ static ssize_t cmm_read(struct file *filp, __user char *buf, size_t count, } } - if (dev->proto == 0 && count > dev->rlen - dev->rpos) { + if (dev->proto == 0 && count > dev->rlen - dev->rpos && i) { DEBUGP(4, dev, "T=0 and count > buffer\n"); dev->rbuf[i] = dev->rbuf[i - 1]; dev->rbuf[i - 1] = dev->procbyte; diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index f1df59f59a3..68104434ebb 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -2,8 +2,12 @@ * menu.c - the menu idle governor * * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com> + * Copyright (C) 2009 Intel Corporation + * Author: + * Arjan van de Ven <arjan@linux.intel.com> * - * This code is licenced under the GPL. + * This code is licenced under the GPL version 2 as described + * in the COPYING file that acompanies the Linux Kernel. */ #include <linux/kernel.h> @@ -13,22 +17,158 @@ #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/tick.h> +#include <linux/sched.h> -#define BREAK_FUZZ 4 /* 4 us */ -#define PRED_HISTORY_PCT 50 +#define BUCKETS 12 +#define RESOLUTION 1024 +#define DECAY 4 +#define MAX_INTERESTING 50000 + +/* + * Concepts and ideas behind the menu governor + * + * For the menu governor, there are 3 decision factors for picking a C + * state: + * 1) Energy break even point + * 2) Performance impact + * 3) Latency tolerance (from pmqos infrastructure) + * These these three factors are treated independently. + * + * Energy break even point + * ----------------------- + * C state entry and exit have an energy cost, and a certain amount of time in + * the C state is required to actually break even on this cost. CPUIDLE + * provides us this duration in the "target_residency" field. So all that we + * need is a good prediction of how long we'll be idle. Like the traditional + * menu governor, we start with the actual known "next timer event" time. + * + * Since there are other source of wakeups (interrupts for example) than + * the next timer event, this estimation is rather optimistic. To get a + * more realistic estimate, a correction factor is applied to the estimate, + * that is based on historic behavior. For example, if in the past the actual + * duration always was 50% of the next timer tick, the correction factor will + * be 0.5. + * + * menu uses a running average for this correction factor, however it uses a + * set of factors, not just a single factor. This stems from the realization + * that the ratio is dependent on the order of magnitude of the expected + * duration; if we expect 500 milliseconds of idle time the likelihood of + * getting an interrupt very early is much higher than if we expect 50 micro + * seconds of idle time. A second independent factor that has big impact on + * the actual factor is if there is (disk) IO outstanding or not. + * (as a special twist, we consider every sleep longer than 50 milliseconds + * as perfect; there are no power gains for sleeping longer than this) + * + * For these two reasons we keep an array of 12 independent factors, that gets + * indexed based on the magnitude of the expected duration as well as the + * "is IO outstanding" property. + * + * Limiting Performance Impact + * --------------------------- + * C states, especially those with large exit latencies, can have a real + * noticable impact on workloads, which is not acceptable for most sysadmins, + * and in addition, less performance has a power price of its own. + * + * As a general rule of thumb, menu assumes that the following heuristic + * holds: + * The busier the system, the less impact of C states is acceptable + * + * This rule-of-thumb is implemented using a performance-multiplier: + * If the exit latency times the performance multiplier is longer than + * the predicted duration, the C state is not considered a candidate + * for selection due to a too high performance impact. So the higher + * this multiplier is, the longer we need to be idle to pick a deep C + * state, and thus the less likely a busy CPU will hit such a deep + * C state. + * + * Two factors are used in determing this multiplier: + * a value of 10 is added for each point of "per cpu load average" we have. + * a value of 5 points is added for each process that is waiting for + * IO on this CPU. + * (these values are experimentally determined) + * + * The load average factor gives a longer term (few seconds) input to the + * decision, while the iowait value gives a cpu local instantanious input. + * The iowait factor may look low, but realize that this is also already + * represented in the system load average. + * + */ struct menu_device { int last_state_idx; + int needs_update; unsigned int expected_us; - unsigned int predicted_us; - unsigned int current_predicted_us; - unsigned int last_measured_us; - unsigned int elapsed_us; + u64 predicted_us; + unsigned int measured_us; + unsigned int exit_us; + unsigned int bucket; + u64 correction_factor[BUCKETS]; }; + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +static int get_loadavg(void) +{ + unsigned long this = this_cpu_load(); + + + return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10; +} + +static inline int which_bucket(unsigned int duration) +{ + int bucket = 0; + + /* + * We keep two groups of stats; one with no + * IO pending, one without. + * This allows us to calculate + * E(duration)|iowait + */ + if (nr_iowait_cpu()) + bucket = BUCKETS/2; + + if (duration < 10) + return bucket; + if (duration < 100) + return bucket + 1; + if (duration < 1000) + return bucket + 2; + if (duration < 10000) + return bucket + 3; + if (duration < 100000) + return bucket + 4; + return bucket + 5; +} + +/* + * Return a multiplier for the exit latency that is intended + * to take performance requirements into account. + * The more performance critical we estimate the system + * to be, the higher this multiplier, and thus the higher + * the barrier to go to an expensive C state. + */ +static inline int performance_multiplier(void) +{ + int mult = 1; + + /* for higher loadavg, we are more reluctant */ + + mult += 2 * get_loadavg(); + + /* for IO wait tasks (per cpu!) we add 5x each */ + mult += 10 * nr_iowait_cpu(); + + return mult; +} + static DEFINE_PER_CPU(struct menu_device, menu_devices); +static void menu_update(struct cpuidle_device *dev); + /** * menu_select - selects the next idle state to enter * @dev: the CPU @@ -38,41 +178,68 @@ static int menu_select(struct cpuidle_device *dev) struct menu_device *data = &__get_cpu_var(menu_devices); int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY); int i; + int multiplier; + + data->last_state_idx = 0; + data->exit_us = 0; + + if (data->needs_update) { + menu_update(dev); + data->needs_update = 0; + } /* Special case when user has set very strict latency requirement */ - if (unlikely(latency_req == 0)) { - data->last_state_idx = 0; + if (unlikely(latency_req == 0)) return 0; - } - /* determine the expected residency time */ + /* determine the expected residency time, round up */ data->expected_us = - (u32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000; + DIV_ROUND_UP((u32)ktime_to_ns(tick_nohz_get_sleep_length()), 1000); + + + data->bucket = which_bucket(data->expected_us); + + multiplier = performance_multiplier(); + + /* + * if the correction factor is 0 (eg first time init or cpu hotplug + * etc), we actually want to start out with a unity factor. + */ + if (data->correction_factor[data->bucket] == 0) + data->correction_factor[data->bucket] = RESOLUTION * DECAY; + + /* Make sure to round up for half microseconds */ + data->predicted_us = DIV_ROUND_CLOSEST( + data->expected_us * data->correction_factor[data->bucket], + RESOLUTION * DECAY); + + /* + * We want to default to C1 (hlt), not to busy polling + * unless the timer is happening really really soon. + */ + if (data->expected_us > 5) + data->last_state_idx = CPUIDLE_DRIVER_STATE_START; - /* Recalculate predicted_us based on prediction_history_pct */ - data->predicted_us *= PRED_HISTORY_PCT; - data->predicted_us += (100 - PRED_HISTORY_PCT) * - data->current_predicted_us; - data->predicted_us /= 100; /* find the deepest idle state that satisfies our constraints */ - for (i = CPUIDLE_DRIVER_STATE_START + 1; i < dev->state_count; i++) { + for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) { struct cpuidle_state *s = &dev->states[i]; - if (s->target_residency > data->expected_us) - break; if (s->target_residency > data->predicted_us) break; if (s->exit_latency > latency_req) break; + if (s->exit_latency * multiplier > data->predicted_us) + break; + data->exit_us = s->exit_latency; + data->last_state_idx = i; } - data->last_state_idx = i - 1; - return i - 1; + return data->last_state_idx; } /** - * menu_reflect - attempts to guess what happened after entry + * menu_reflect - records that data structures need update * @dev: the CPU * * NOTE: it's important to be fast here because this operation will add to @@ -81,39 +248,63 @@ static int menu_select(struct cpuidle_device *dev) static void menu_reflect(struct cpuidle_device *dev) { struct menu_device *data = &__get_cpu_var(menu_devices); + data->needs_update = 1; +} + +/** + * menu_update - attempts to guess what happened after entry + * @dev: the CPU + */ +static void menu_update(struct cpuidle_device *dev) +{ + struct menu_device *data = &__get_cpu_var(menu_devices); int last_idx = data->last_state_idx; unsigned int last_idle_us = cpuidle_get_last_residency(dev); struct cpuidle_state *target = &dev->states[last_idx]; unsigned int measured_us; + u64 new_factor; /* * Ugh, this idle state doesn't support residency measurements, so we * are basically lost in the dark. As a compromise, assume we slept - * for one full standard timer tick. However, be aware that this - * could potentially result in a suboptimal state transition. + * for the whole expected time. */ if (unlikely(!(target->flags & CPUIDLE_FLAG_TIME_VALID))) - last_idle_us = USEC_PER_SEC / HZ; + last_idle_us = data->expected_us; + + + measured_us = last_idle_us; /* - * measured_us and elapsed_us are the cumulative idle time, since the - * last time we were woken out of idle by an interrupt. + * We correct for the exit latency; we are assuming here that the + * exit latency happens after the event that we're interested in. */ - if (data->elapsed_us <= data->elapsed_us + last_idle_us) - measured_us = data->elapsed_us + last_idle_us; + if (measured_us > data->exit_us) + measured_us -= data->exit_us; + + + /* update our correction ratio */ + + new_factor = data->correction_factor[data->bucket] + * (DECAY - 1) / DECAY; + + if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING) + new_factor += RESOLUTION * measured_us / data->expected_us; else - measured_us = -1; + /* + * we were idle so long that we count it as a perfect + * prediction + */ + new_factor += RESOLUTION; - /* Predict time until next break event */ - data->current_predicted_us = max(measured_us, data->last_measured_us); + /* + * We don't want 0 as factor; we always want at least + * a tiny bit of estimated time. + */ + if (new_factor == 0) + new_factor = 1; - if (last_idle_us + BREAK_FUZZ < - data->expected_us - target->exit_latency) { - data->last_measured_us = measured_us; - data->elapsed_us = 0; - } else { - data->elapsed_us = measured_us; - } + data->correction_factor[data->bucket] = new_factor; } /** diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index 871c13b4c14..12f355cafdb 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -286,7 +286,7 @@ enum scrub_type { * is irrespective of the memory devices being mounted * on both sides of the memory stick. * - * Socket set: All of the memory sticks that are required for for + * Socket set: All of the memory sticks that are required for * a single memory access or all of the memory sticks * spanned by a chip-select row. A single socket set * has two chip-select rows and if double-sided sticks diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index d5ea8a68d33..56f9234781f 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -164,7 +164,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = alloc_bootmem_low(sizeof(struct firmware_map_entry)); + entry = alloc_bootmem(sizeof(struct firmware_map_entry)); if (WARN_ON(!entry)) return -ENOMEM; diff --git a/drivers/gpio/bt8xxgpio.c b/drivers/gpio/bt8xxgpio.c index 984b587f0f9..55904140213 100644 --- a/drivers/gpio/bt8xxgpio.c +++ b/drivers/gpio/bt8xxgpio.c @@ -331,13 +331,13 @@ static struct pci_driver bt8xxgpio_pci_driver = { .resume = bt8xxgpio_resume, }; -static int bt8xxgpio_init(void) +static int __init bt8xxgpio_init(void) { return pci_register_driver(&bt8xxgpio_pci_driver); } module_init(bt8xxgpio_init) -static void bt8xxgpio_exit(void) +static void __exit bt8xxgpio_exit(void) { pci_unregister_driver(&bt8xxgpio_pci_driver); } diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 2b914d73207..f4856a51047 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -232,7 +232,7 @@ intel_dp_aux_ch(struct intel_output *intel_output, for (try = 0; try < 5; try++) { /* Load the send data into the aux channel data registers */ for (i = 0; i < send_bytes; i += 4) { - uint32_t d = pack_aux(send + i, send_bytes - i);; + uint32_t d = pack_aux(send + i, send_bytes - i); I915_WRITE(ch_data + i, d); } diff --git a/drivers/gpu/drm/mga/mga_state.c b/drivers/gpu/drm/mga/mga_state.c index b710fab21cb..a53b848e0f1 100644 --- a/drivers/gpu/drm/mga/mga_state.c +++ b/drivers/gpu/drm/mga/mga_state.c @@ -239,7 +239,7 @@ static __inline__ void mga_g200_emit_pipe(drm_mga_private_t * dev_priv) MGA_WR34, 0x00000000, MGA_WR42, 0x0000ffff, MGA_WR60, 0x0000ffff); - /* Padding required to to hardware bug. + /* Padding required due to hardware bug. */ DMA_BLOCK(MGA_DMAPAD, 0xffffffff, MGA_DMAPAD, 0xffffffff, @@ -317,7 +317,7 @@ static __inline__ void mga_g400_emit_pipe(drm_mga_private_t * dev_priv) MGA_WR52, MGA_G400_WR_MAGIC, /* tex1 width */ MGA_WR60, MGA_G400_WR_MAGIC); /* tex1 height */ - /* Padding required to to hardware bug */ + /* Padding required due to hardware bug */ DMA_BLOCK(MGA_DMAPAD, 0xffffffff, MGA_DMAPAD, 0xffffffff, MGA_DMAPAD, 0xffffffff, diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 111afbe8de0..24d90ea246c 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -205,13 +205,6 @@ config HID_NTRIG Support for N-Trig touch screen. config HID_PANTHERLORD - tristate "Pantherlord devices support" if EMBEDDED - depends on USB_HID - default !EMBEDDED - ---help--- - Support for PantherLord/GreenAsia based device support. - -config HID_PANTHERLORD tristate "Pantherlord support" if EMBEDDED depends on USB_HID default !EMBEDDED diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 342b7d36d7b..be34d32906b 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1089,8 +1089,7 @@ int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int i return -1; } - buf = kmalloc(sizeof(char) * HID_DEBUG_BUFSIZE, - interrupt ? GFP_ATOMIC : GFP_KERNEL); + buf = kmalloc(sizeof(char) * HID_DEBUG_BUFSIZE, GFP_ATOMIC); if (!buf) { report = hid_get_report(report_enum, data); @@ -1238,6 +1237,17 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask) } EXPORT_SYMBOL_GPL(hid_connect); +void hid_disconnect(struct hid_device *hdev) +{ + if (hdev->claimed & HID_CLAIMED_INPUT) + hidinput_disconnect(hdev); + if (hdev->claimed & HID_CLAIMED_HIDDEV) + hdev->hiddev_disconnect(hdev); + if (hdev->claimed & HID_CLAIMED_HIDRAW) + hidraw_disconnect(hdev); +} +EXPORT_SYMBOL_GPL(hid_disconnect); + /* a list of devices for which there is a specialized driver on HID bus */ static const struct hid_device_id hid_blacklist[] = { { HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_WCP32PU) }, diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 1b0e07a67d6..03bd703255a 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1041,13 +1041,6 @@ static void usbhid_stop(struct hid_device *hid) hid_cancel_delayed_stuff(usbhid); - if (hid->claimed & HID_CLAIMED_INPUT) - hidinput_disconnect(hid); - if (hid->claimed & HID_CLAIMED_HIDDEV) - hiddev_disconnect(hid); - if (hid->claimed & HID_CLAIMED_HIDRAW) - hidraw_disconnect(hid); - hid->claimed = 0; usb_free_urb(usbhid->urbin); @@ -1085,7 +1078,7 @@ static struct hid_ll_driver usb_hid_driver = { .hidinput_input_event = usb_hidinput_input_event, }; -static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id) +static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_host_interface *interface = intf->cur_altsetting; struct usb_device *dev = interface_to_usbdev(intf); @@ -1117,6 +1110,7 @@ static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id) hid->ff_init = hid_pidff_init; #ifdef CONFIG_USB_HIDDEV hid->hiddev_connect = hiddev_connect; + hid->hiddev_disconnect = hiddev_disconnect; hid->hiddev_hid_event = hiddev_hid_event; hid->hiddev_report_event = hiddev_report_event; #endif @@ -1177,7 +1171,7 @@ err: return ret; } -static void hid_disconnect(struct usb_interface *intf) +static void usbhid_disconnect(struct usb_interface *intf) { struct hid_device *hid = usb_get_intfdata(intf); struct usbhid_device *usbhid; @@ -1359,8 +1353,8 @@ MODULE_DEVICE_TABLE (usb, hid_usb_ids); static struct usb_driver hid_driver = { .name = "usbhid", - .probe = hid_probe, - .disconnect = hid_disconnect, + .probe = usbhid_probe, + .disconnect = usbhid_disconnect, #ifdef CONFIG_PM .suspend = hid_suspend, .resume = hid_resume, diff --git a/drivers/hwmon/adm1021.c b/drivers/hwmon/adm1021.c index b11e06f644b..afc59431812 100644 --- a/drivers/hwmon/adm1021.c +++ b/drivers/hwmon/adm1021.c @@ -83,16 +83,14 @@ struct adm1021_data { struct mutex update_lock; char valid; /* !=0 if following fields are valid */ + char low_power; /* !=0 if device in low power mode */ unsigned long last_updated; /* In jiffies */ - s8 temp_max[2]; /* Register values */ - s8 temp_min[2]; - s8 temp[2]; + int temp_max[2]; /* Register values */ + int temp_min[2]; + int temp[2]; u8 alarms; /* Special values for ADM1023 only */ - u8 remote_temp_prec; - u8 remote_temp_os_prec; - u8 remote_temp_hyst_prec; u8 remote_temp_offset; u8 remote_temp_offset_prec; }; @@ -141,7 +139,7 @@ static ssize_t show_temp(struct device *dev, int index = to_sensor_dev_attr(devattr)->index; struct adm1021_data *data = adm1021_update_device(dev); - return sprintf(buf, "%d\n", 1000 * data->temp[index]); + return sprintf(buf, "%d\n", data->temp[index]); } static ssize_t show_temp_max(struct device *dev, @@ -150,7 +148,7 @@ static ssize_t show_temp_max(struct device *dev, int index = to_sensor_dev_attr(devattr)->index; struct adm1021_data *data = adm1021_update_device(dev); - return sprintf(buf, "%d\n", 1000 * data->temp_max[index]); + return sprintf(buf, "%d\n", data->temp_max[index]); } static ssize_t show_temp_min(struct device *dev, @@ -159,7 +157,7 @@ static ssize_t show_temp_min(struct device *dev, int index = to_sensor_dev_attr(devattr)->index; struct adm1021_data *data = adm1021_update_device(dev); - return sprintf(buf, "%d\n", 1000 * data->temp_min[index]); + return sprintf(buf, "%d\n", data->temp_min[index]); } static ssize_t show_alarm(struct device *dev, struct device_attribute *attr, @@ -216,6 +214,35 @@ static ssize_t set_temp_min(struct device *dev, return count; } +static ssize_t show_low_power(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct adm1021_data *data = adm1021_update_device(dev); + return sprintf(buf, "%d\n", data->low_power); +} + +static ssize_t set_low_power(struct device *dev, + struct device_attribute *devattr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + struct adm1021_data *data = i2c_get_clientdata(client); + int low_power = simple_strtol(buf, NULL, 10) != 0; + + mutex_lock(&data->update_lock); + if (low_power != data->low_power) { + int config = i2c_smbus_read_byte_data( + client, ADM1021_REG_CONFIG_R); + data->low_power = low_power; + i2c_smbus_write_byte_data(client, ADM1021_REG_CONFIG_W, + (config & 0xBF) | (low_power << 6)); + } + mutex_unlock(&data->update_lock); + + return count; +} + + static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, show_temp, NULL, 0); static SENSOR_DEVICE_ATTR(temp1_max, S_IWUSR | S_IRUGO, show_temp_max, set_temp_max, 0); @@ -233,6 +260,7 @@ static SENSOR_DEVICE_ATTR(temp2_min_alarm, S_IRUGO, show_alarm, NULL, 3); static SENSOR_DEVICE_ATTR(temp2_fault, S_IRUGO, show_alarm, NULL, 2); static DEVICE_ATTR(alarms, S_IRUGO, show_alarms, NULL); +static DEVICE_ATTR(low_power, S_IWUSR | S_IRUGO, show_low_power, set_low_power); static struct attribute *adm1021_attributes[] = { &sensor_dev_attr_temp1_max.dev_attr.attr, @@ -247,6 +275,7 @@ static struct attribute *adm1021_attributes[] = { &sensor_dev_attr_temp2_min_alarm.dev_attr.attr, &sensor_dev_attr_temp2_fault.dev_attr.attr, &dev_attr_alarms.attr, + &dev_attr_low_power.attr, NULL }; @@ -412,25 +441,27 @@ static struct adm1021_data *adm1021_update_device(struct device *dev) dev_dbg(&client->dev, "Starting adm1021 update\n"); for (i = 0; i < 2; i++) { - data->temp[i] = i2c_smbus_read_byte_data(client, - ADM1021_REG_TEMP(i)); - data->temp_max[i] = i2c_smbus_read_byte_data(client, - ADM1021_REG_TOS_R(i)); - data->temp_min[i] = i2c_smbus_read_byte_data(client, - ADM1021_REG_THYST_R(i)); + data->temp[i] = 1000 * + (s8) i2c_smbus_read_byte_data( + client, ADM1021_REG_TEMP(i)); + data->temp_max[i] = 1000 * + (s8) i2c_smbus_read_byte_data( + client, ADM1021_REG_TOS_R(i)); + data->temp_min[i] = 1000 * + (s8) i2c_smbus_read_byte_data( + client, ADM1021_REG_THYST_R(i)); } data->alarms = i2c_smbus_read_byte_data(client, ADM1021_REG_STATUS) & 0x7c; if (data->type == adm1023) { - data->remote_temp_prec = - i2c_smbus_read_byte_data(client, - ADM1023_REG_REM_TEMP_PREC); - data->remote_temp_os_prec = - i2c_smbus_read_byte_data(client, - ADM1023_REG_REM_TOS_PREC); - data->remote_temp_hyst_prec = - i2c_smbus_read_byte_data(client, - ADM1023_REG_REM_THYST_PREC); + /* The ADM1023 provides 3 extra bits of precision for + * the remote sensor in extra registers. */ + data->temp[1] += 125 * (i2c_smbus_read_byte_data( + client, ADM1023_REG_REM_TEMP_PREC) >> 5); + data->temp_max[1] += 125 * (i2c_smbus_read_byte_data( + client, ADM1023_REG_REM_TOS_PREC) >> 5); + data->temp_min[1] += 125 * (i2c_smbus_read_byte_data( + client, ADM1023_REG_REM_THYST_PREC) >> 5); data->remote_temp_offset = i2c_smbus_read_byte_data(client, ADM1023_REG_REM_OFFSET); diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c index 753b34885f9..7ea6a8f6605 100644 --- a/drivers/hwmon/applesmc.c +++ b/drivers/hwmon/applesmc.c @@ -178,6 +178,8 @@ static const int debug; static struct platform_device *pdev; static s16 rest_x; static s16 rest_y; +static u8 backlight_state[2]; + static struct device *hwmon_dev; static struct input_polled_dev *applesmc_idev; @@ -497,17 +499,36 @@ static int applesmc_probe(struct platform_device *dev) return 0; } -static int applesmc_resume(struct platform_device *dev) +/* Synchronize device with memorized backlight state */ +static int applesmc_pm_resume(struct device *dev) { - return applesmc_device_init(); + mutex_lock(&applesmc_lock); + if (applesmc_light) + applesmc_write_key(BACKLIGHT_KEY, backlight_state, 2); + mutex_unlock(&applesmc_lock); + return 0; } +/* Reinitialize device on resume from hibernation */ +static int applesmc_pm_restore(struct device *dev) +{ + int ret = applesmc_device_init(); + if (ret) + return ret; + return applesmc_pm_resume(dev); +} + +static struct dev_pm_ops applesmc_pm_ops = { + .resume = applesmc_pm_resume, + .restore = applesmc_pm_restore, +}; + static struct platform_driver applesmc_driver = { .probe = applesmc_probe, - .resume = applesmc_resume, .driver = { .name = "applesmc", .owner = THIS_MODULE, + .pm = &applesmc_pm_ops, }, }; @@ -804,17 +825,10 @@ static ssize_t applesmc_calibrate_store(struct device *dev, return count; } -/* Store the next backlight value to be written by the work */ -static unsigned int backlight_value; - static void applesmc_backlight_set(struct work_struct *work) { - u8 buffer[2]; - mutex_lock(&applesmc_lock); - buffer[0] = backlight_value; - buffer[1] = 0x00; - applesmc_write_key(BACKLIGHT_KEY, buffer, 2); + applesmc_write_key(BACKLIGHT_KEY, backlight_state, 2); mutex_unlock(&applesmc_lock); } static DECLARE_WORK(backlight_work, &applesmc_backlight_set); @@ -824,7 +838,7 @@ static void applesmc_brightness_set(struct led_classdev *led_cdev, { int ret; - backlight_value = value; + backlight_state[0] = value; ret = queue_work(applesmc_led_wq, &backlight_work); if (debug && (!ret)) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 93c17223b52..972cf4ba963 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -185,7 +185,7 @@ static int __devinit adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device * } } - if (ismobile) { + if (ismobile || c->x86_model == 0x1c) { err = rdmsr_safe_on_cpu(id, 0xee, &eax, &edx); if (err) { @@ -417,7 +417,7 @@ static int __init coretemp_init(void) if ((c->cpuid_level < 0) || (c->x86 != 0x6) || !((c->x86_model == 0xe) || (c->x86_model == 0xf) || (c->x86_model == 0x16) || (c->x86_model == 0x17) || - (c->x86_model == 0x1A))) { + (c->x86_model == 0x1A) || (c->x86_model == 0x1c))) { /* supported CPU not found, but report the unknown family 6 CPU */ diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 271338bdb6b..cf5afb9a10a 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -454,6 +454,15 @@ int lis3lv02d_init_device(struct lis3lv02d *dev) (p->click_thresh_y << 4)); } + if (p->wakeup_flags && (dev->whoami == LIS_SINGLE_ID)) { + dev->write(dev, FF_WU_CFG_1, p->wakeup_flags); + dev->write(dev, FF_WU_THS_1, p->wakeup_thresh & 0x7f); + /* default to 2.5ms for now */ + dev->write(dev, FF_WU_DURATION_1, 1); + /* enable high pass filter for both free-fall units */ + dev->write(dev, CTRL_REG2, HP_FF_WU1 | HP_FF_WU2); + } + if (p->irq_cfg) dev->write(dev, CTRL_REG3, p->irq_cfg); } diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index e320e2f511f..3e1ff46f72d 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -58,15 +58,17 @@ enum lis3_reg { OUTZ_L = 0x2C, OUTZ_H = 0x2D, OUTZ = 0x2D, - FF_WU_CFG = 0x30, - FF_WU_SRC = 0x31, - FF_WU_ACK = 0x32, - FF_WU_THS_L = 0x34, - FF_WU_THS_H = 0x35, - FF_WU_DURATION = 0x36, }; enum lis302d_reg { + FF_WU_CFG_1 = 0x30, + FF_WU_SRC_1 = 0x31, + FF_WU_THS_1 = 0x32, + FF_WU_DURATION_1 = 0x33, + FF_WU_CFG_2 = 0x34, + FF_WU_SRC_2 = 0x35, + FF_WU_THS_2 = 0x36, + FF_WU_DURATION_2 = 0x37, CLICK_CFG = 0x38, CLICK_SRC = 0x39, CLICK_THSY_X = 0x3B, @@ -77,6 +79,12 @@ enum lis302d_reg { }; enum lis3lv02d_reg { + FF_WU_CFG = 0x30, + FF_WU_SRC = 0x31, + FF_WU_ACK = 0x32, + FF_WU_THS_L = 0x34, + FF_WU_THS_H = 0x35, + FF_WU_DURATION = 0x36, DD_CFG = 0x38, DD_SRC = 0x39, DD_ACK = 0x3A, @@ -107,6 +115,10 @@ enum lis3lv02d_ctrl2 { CTRL2_FS = 0x80, /* Full Scale selection */ }; +enum lis302d_ctrl2 { + HP_FF_WU2 = 0x08, + HP_FF_WU1 = 0x04, +}; enum lis3lv02d_ctrl3 { CTRL3_CFS0 = 0x01, diff --git a/drivers/hwmon/lis3lv02d_spi.c b/drivers/hwmon/lis3lv02d_spi.c index 3827ff04485..82ebca5a699 100644 --- a/drivers/hwmon/lis3lv02d_spi.c +++ b/drivers/hwmon/lis3lv02d_spi.c @@ -66,17 +66,16 @@ static int __devinit lis302dl_spi_probe(struct spi_device *spi) if (ret < 0) return ret; - lis3_dev.bus_priv = spi; - lis3_dev.init = lis3_spi_init; - lis3_dev.read = lis3_spi_read; - lis3_dev.write = lis3_spi_write; - lis3_dev.irq = spi->irq; - lis3_dev.ac = lis3lv02d_axis_normal; - lis3_dev.pdata = spi->dev.platform_data; + lis3_dev.bus_priv = spi; + lis3_dev.init = lis3_spi_init; + lis3_dev.read = lis3_spi_read; + lis3_dev.write = lis3_spi_write; + lis3_dev.irq = spi->irq; + lis3_dev.ac = lis3lv02d_axis_normal; + lis3_dev.pdata = spi->dev.platform_data; spi_set_drvdata(spi, &lis3_dev); - ret = lis3lv02d_init_device(&lis3_dev); - return ret; + return lis3lv02d_init_device(&lis3_dev); } static int __devexit lis302dl_spi_remove(struct spi_device *spi) @@ -87,6 +86,32 @@ static int __devexit lis302dl_spi_remove(struct spi_device *spi) return 0; } +#ifdef CONFIG_PM +static int lis3lv02d_spi_suspend(struct spi_device *spi, pm_message_t mesg) +{ + struct lis3lv02d *lis3 = spi_get_drvdata(spi); + + if (!lis3->pdata->wakeup_flags) + lis3lv02d_poweroff(&lis3_dev); + + return 0; +} + +static int lis3lv02d_spi_resume(struct spi_device *spi) +{ + struct lis3lv02d *lis3 = spi_get_drvdata(spi); + + if (!lis3->pdata->wakeup_flags) + lis3lv02d_poweron(lis3); + + return 0; +} + +#else +#define lis3lv02d_spi_suspend NULL +#define lis3lv02d_spi_resume NULL +#endif + static struct spi_driver lis302dl_spi_driver = { .driver = { .name = DRV_NAME, @@ -94,6 +119,8 @@ static struct spi_driver lis302dl_spi_driver = { }, .probe = lis302dl_spi_probe, .remove = __devexit_p(lis302dl_spi_remove), + .suspend = lis3lv02d_spi_suspend, + .resume = lis3lv02d_spi_resume, }; static int __init lis302dl_init(void) diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c index 6290a259456..303c02694c3 100644 --- a/drivers/hwmon/sht15.c +++ b/drivers/hwmon/sht15.c @@ -562,7 +562,7 @@ static int __devinit sht15_probe(struct platform_device *pdev) ret = sysfs_create_group(&pdev->dev.kobj, &sht15_attr_group); if (ret) { dev_err(&pdev->dev, "sysfs create failed"); - goto err_free_data; + goto err_release_gpio_data; } ret = request_irq(gpio_to_irq(data->pdata->gpio_data), @@ -581,10 +581,12 @@ static int __devinit sht15_probe(struct platform_device *pdev) data->hwmon_dev = hwmon_device_register(data->dev); if (IS_ERR(data->hwmon_dev)) { ret = PTR_ERR(data->hwmon_dev); - goto err_release_gpio_data; + goto err_release_irq; } return 0; +err_release_irq: + free_irq(gpio_to_irq(data->pdata->gpio_data), data); err_release_gpio_data: gpio_free(data->pdata->gpio_data); err_release_gpio_sck: diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index b79ca419d8d..64207df8da8 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1686,7 +1686,7 @@ static int idecd_revalidate_disk(struct gendisk *disk) return 0; } -static struct block_device_operations idecd_ops = { +static const struct block_device_operations idecd_ops = { .owner = THIS_MODULE, .open = idecd_open, .release = idecd_release, diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index 214119026b3..753241429c2 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -321,7 +321,7 @@ static int ide_gd_ioctl(struct block_device *bdev, fmode_t mode, return drive->disk_ops->ioctl(drive, bdev, mode, cmd, arg); } -static struct block_device_operations ide_gd_ops = { +static const struct block_device_operations ide_gd_ops = { .owner = THIS_MODULE, .open = ide_gd_open, .release = ide_gd_release, diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 8de442cbee9..63c53d65e87 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1212,7 +1212,7 @@ static int ide_find_port_slot(const struct ide_port_info *d) { int idx = -ENOENT; u8 bootable = (d && (d->host_flags & IDE_HFLAG_NON_BOOTABLE)) ? 0 : 1; - u8 i = (d && (d->host_flags & IDE_HFLAG_QD_2ND_PORT)) ? 1 : 0;; + u8 i = (d && (d->host_flags & IDE_HFLAG_QD_2ND_PORT)) ? 1 : 0; /* * Claim an unassigned slot. diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 9d6f62baac2..58fc920d5c3 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1913,7 +1913,7 @@ static int idetape_ioctl(struct block_device *bdev, fmode_t mode, return err; } -static struct block_device_operations idetape_block_ops = { +static const struct block_device_operations idetape_block_ops = { .owner = THIS_MODULE, .open = idetape_open, .release = idetape_release, diff --git a/drivers/ide/umc8672.c b/drivers/ide/umc8672.c index 0608d41fb6d..60f936e2319 100644 --- a/drivers/ide/umc8672.c +++ b/drivers/ide/umc8672.c @@ -170,9 +170,9 @@ static int __init umc8672_init(void) goto out; if (umc8672_probe() == 0) - return 0;; + return 0; out: - return -ENODEV;; + return -ENODEV; } module_init(umc8672_init); diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index 02831ad070b..4bd39c8af80 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -809,7 +809,7 @@ static int ipath_setup_ht_reset(struct ipath_devdata *dd) * errors. We only bother to do this at load time, because it's OK if * it happened before we were loaded (first time after boot/reset), * but any time after that, it's fatal anyway. Also need to not check - * for for upper byte errors if we are in 8 bit mode, so figure out + * for upper byte errors if we are in 8 bit mode, so figure out * our width. For now, at least, also complain if it's 8 bit. */ static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c index c9523e48c6a..adb09e2ba39 100644 --- a/drivers/input/keyboard/atkbd.c +++ b/drivers/input/keyboard/atkbd.c @@ -229,7 +229,7 @@ struct atkbd { }; /* - * System-specific ketymap fixup routine + * System-specific keymap fixup routine */ static void (*atkbd_platform_fixup)(struct atkbd *, const void *data); static void *atkbd_platform_fixup_data; diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 1a50be379cb..76d6751f89a 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -222,6 +222,22 @@ config INPUT_SGI_BTNS To compile this driver as a module, choose M here: the module will be called sgi_btns. +config INPUT_WINBOND_CIR + tristate "Winbond IR remote control" + depends on X86 && PNP + select LEDS_CLASS + select BITREVERSE + help + Say Y here if you want to use the IR remote functionality found + in some Winbond SuperI/O chips. Currently only the WPCD376I + chip is supported (included in some Intel Media series motherboards). + + IR Receive and wake-on-IR from suspend and power-off is currently + supported. + + To compile this driver as a module, choose M here: the module will be + called winbond_cir. + config HP_SDC_RTC tristate "HP SDC Real Time Clock" depends on (GSC || HP300) && SERIO diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index bf4db626c31..a8b84854fb7 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_INPUT_SGI_BTNS) += sgi_btns.o obj-$(CONFIG_INPUT_SPARCSPKR) += sparcspkr.o obj-$(CONFIG_INPUT_TWL4030_PWRBUTTON) += twl4030-pwrbutton.o obj-$(CONFIG_INPUT_UINPUT) += uinput.o +obj-$(CONFIG_INPUT_WINBOND_CIR) += winbond-cir.o obj-$(CONFIG_INPUT_WISTRON_BTNS) += wistron_btns.o obj-$(CONFIG_INPUT_WM831X_ON) += wm831x-on.o obj-$(CONFIG_INPUT_YEALINK) += yealink.o diff --git a/drivers/input/misc/winbond-cir.c b/drivers/input/misc/winbond-cir.c new file mode 100644 index 00000000000..33309fe44e2 --- /dev/null +++ b/drivers/input/misc/winbond-cir.c @@ -0,0 +1,1614 @@ +/* + * winbond-cir.c - Driver for the Consumer IR functionality of Winbond + * SuperI/O chips. + * + * Currently supports the Winbond WPCD376i chip (PNP id WEC1022), but + * could probably support others (Winbond WEC102X, NatSemi, etc) + * with minor modifications. + * + * Original Author: David Härdeman <david@hardeman.nu> + * Copyright (C) 2009 David Härdeman <david@hardeman.nu> + * + * Dedicated to Matilda, my newborn daughter, without whose loving attention + * this driver would have been finished in half the time and with a fraction + * of the bugs. + * + * Written using: + * o Winbond WPCD376I datasheet helpfully provided by Jesse Barnes at Intel + * o NatSemi PC87338/PC97338 datasheet (for the serial port stuff) + * o DSDT dumps + * + * Supported features: + * o RC6 + * o Wake-On-CIR functionality + * + * To do: + * o Test NEC and RC5 + * + * Left as an exercise for the reader: + * o Learning (I have neither the hardware, nor the need) + * o IR Transmit (ibid) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/module.h> +#include <linux/pnp.h> +#include <linux/interrupt.h> +#include <linux/timer.h> +#include <linux/input.h> +#include <linux/leds.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/pci_ids.h> +#include <linux/io.h> +#include <linux/bitrev.h> +#include <linux/bitops.h> + +#define DRVNAME "winbond-cir" + +/* CEIR Wake-Up Registers, relative to data->wbase */ +#define WBCIR_REG_WCEIR_CTL 0x03 /* CEIR Receiver Control */ +#define WBCIR_REG_WCEIR_STS 0x04 /* CEIR Receiver Status */ +#define WBCIR_REG_WCEIR_EV_EN 0x05 /* CEIR Receiver Event Enable */ +#define WBCIR_REG_WCEIR_CNTL 0x06 /* CEIR Receiver Counter Low */ +#define WBCIR_REG_WCEIR_CNTH 0x07 /* CEIR Receiver Counter High */ +#define WBCIR_REG_WCEIR_INDEX 0x08 /* CEIR Receiver Index */ +#define WBCIR_REG_WCEIR_DATA 0x09 /* CEIR Receiver Data */ +#define WBCIR_REG_WCEIR_CSL 0x0A /* CEIR Re. Compare Strlen */ +#define WBCIR_REG_WCEIR_CFG1 0x0B /* CEIR Re. Configuration 1 */ +#define WBCIR_REG_WCEIR_CFG2 0x0C /* CEIR Re. Configuration 2 */ + +/* CEIR Enhanced Functionality Registers, relative to data->ebase */ +#define WBCIR_REG_ECEIR_CTS 0x00 /* Enhanced IR Control Status */ +#define WBCIR_REG_ECEIR_CCTL 0x01 /* Infrared Counter Control */ +#define WBCIR_REG_ECEIR_CNT_LO 0x02 /* Infrared Counter LSB */ +#define WBCIR_REG_ECEIR_CNT_HI 0x03 /* Infrared Counter MSB */ +#define WBCIR_REG_ECEIR_IREM 0x04 /* Infrared Emitter Status */ + +/* SP3 Banked Registers, relative to data->sbase */ +#define WBCIR_REG_SP3_BSR 0x03 /* Bank Select, all banks */ + /* Bank 0 */ +#define WBCIR_REG_SP3_RXDATA 0x00 /* FIFO RX data (r) */ +#define WBCIR_REG_SP3_TXDATA 0x00 /* FIFO TX data (w) */ +#define WBCIR_REG_SP3_IER 0x01 /* Interrupt Enable */ +#define WBCIR_REG_SP3_EIR 0x02 /* Event Identification (r) */ +#define WBCIR_REG_SP3_FCR 0x02 /* FIFO Control (w) */ +#define WBCIR_REG_SP3_MCR 0x04 /* Mode Control */ +#define WBCIR_REG_SP3_LSR 0x05 /* Link Status */ +#define WBCIR_REG_SP3_MSR 0x06 /* Modem Status */ +#define WBCIR_REG_SP3_ASCR 0x07 /* Aux Status and Control */ + /* Bank 2 */ +#define WBCIR_REG_SP3_BGDL 0x00 /* Baud Divisor LSB */ +#define WBCIR_REG_SP3_BGDH 0x01 /* Baud Divisor MSB */ +#define WBCIR_REG_SP3_EXCR1 0x02 /* Extended Control 1 */ +#define WBCIR_REG_SP3_EXCR2 0x04 /* Extended Control 2 */ +#define WBCIR_REG_SP3_TXFLV 0x06 /* TX FIFO Level */ +#define WBCIR_REG_SP3_RXFLV 0x07 /* RX FIFO Level */ + /* Bank 3 */ +#define WBCIR_REG_SP3_MRID 0x00 /* Module Identification */ +#define WBCIR_REG_SP3_SH_LCR 0x01 /* LCR Shadow */ +#define WBCIR_REG_SP3_SH_FCR 0x02 /* FCR Shadow */ + /* Bank 4 */ +#define WBCIR_REG_SP3_IRCR1 0x02 /* Infrared Control 1 */ + /* Bank 5 */ +#define WBCIR_REG_SP3_IRCR2 0x04 /* Infrared Control 2 */ + /* Bank 6 */ +#define WBCIR_REG_SP3_IRCR3 0x00 /* Infrared Control 3 */ +#define WBCIR_REG_SP3_SIR_PW 0x02 /* SIR Pulse Width */ + /* Bank 7 */ +#define WBCIR_REG_SP3_IRRXDC 0x00 /* IR RX Demod Control */ +#define WBCIR_REG_SP3_IRTXMC 0x01 /* IR TX Mod Control */ +#define WBCIR_REG_SP3_RCCFG 0x02 /* CEIR Config */ +#define WBCIR_REG_SP3_IRCFG1 0x04 /* Infrared Config 1 */ +#define WBCIR_REG_SP3_IRCFG4 0x07 /* Infrared Config 4 */ + +/* + * Magic values follow + */ + +/* No interrupts for WBCIR_REG_SP3_IER and WBCIR_REG_SP3_EIR */ +#define WBCIR_IRQ_NONE 0x00 +/* RX data bit for WBCIR_REG_SP3_IER and WBCIR_REG_SP3_EIR */ +#define WBCIR_IRQ_RX 0x01 +/* Over/Under-flow bit for WBCIR_REG_SP3_IER and WBCIR_REG_SP3_EIR */ +#define WBCIR_IRQ_ERR 0x04 +/* Led enable/disable bit for WBCIR_REG_ECEIR_CTS */ +#define WBCIR_LED_ENABLE 0x80 +/* RX data available bit for WBCIR_REG_SP3_LSR */ +#define WBCIR_RX_AVAIL 0x01 +/* RX disable bit for WBCIR_REG_SP3_ASCR */ +#define WBCIR_RX_DISABLE 0x20 +/* Extended mode enable bit for WBCIR_REG_SP3_EXCR1 */ +#define WBCIR_EXT_ENABLE 0x01 +/* Select compare register in WBCIR_REG_WCEIR_INDEX (bits 5 & 6) */ +#define WBCIR_REGSEL_COMPARE 0x10 +/* Select mask register in WBCIR_REG_WCEIR_INDEX (bits 5 & 6) */ +#define WBCIR_REGSEL_MASK 0x20 +/* Starting address of selected register in WBCIR_REG_WCEIR_INDEX */ +#define WBCIR_REG_ADDR0 0x00 + +/* Valid banks for the SP3 UART */ +enum wbcir_bank { + WBCIR_BANK_0 = 0x00, + WBCIR_BANK_1 = 0x80, + WBCIR_BANK_2 = 0xE0, + WBCIR_BANK_3 = 0xE4, + WBCIR_BANK_4 = 0xE8, + WBCIR_BANK_5 = 0xEC, + WBCIR_BANK_6 = 0xF0, + WBCIR_BANK_7 = 0xF4, +}; + +/* Supported IR Protocols */ +enum wbcir_protocol { + IR_PROTOCOL_RC5 = 0x0, + IR_PROTOCOL_NEC = 0x1, + IR_PROTOCOL_RC6 = 0x2, +}; + +/* Misc */ +#define WBCIR_NAME "Winbond CIR" +#define WBCIR_ID_FAMILY 0xF1 /* Family ID for the WPCD376I */ +#define WBCIR_ID_CHIP 0x04 /* Chip ID for the WPCD376I */ +#define IR_KEYPRESS_TIMEOUT 250 /* FIXME: should be per-protocol? */ +#define INVALID_SCANCODE 0x7FFFFFFF /* Invalid with all protos */ +#define WAKEUP_IOMEM_LEN 0x10 /* Wake-Up I/O Reg Len */ +#define EHFUNC_IOMEM_LEN 0x10 /* Enhanced Func I/O Reg Len */ +#define SP_IOMEM_LEN 0x08 /* Serial Port 3 (IR) Reg Len */ +#define WBCIR_MAX_IDLE_BYTES 10 + +static DEFINE_SPINLOCK(wbcir_lock); +static DEFINE_RWLOCK(keytable_lock); + +struct wbcir_key { + u32 scancode; + unsigned int keycode; +}; + +struct wbcir_keyentry { + struct wbcir_key key; + struct list_head list; +}; + +static struct wbcir_key rc6_def_keymap[] = { + { 0x800F0400, KEY_NUMERIC_0 }, + { 0x800F0401, KEY_NUMERIC_1 }, + { 0x800F0402, KEY_NUMERIC_2 }, + { 0x800F0403, KEY_NUMERIC_3 }, + { 0x800F0404, KEY_NUMERIC_4 }, + { 0x800F0405, KEY_NUMERIC_5 }, + { 0x800F0406, KEY_NUMERIC_6 }, + { 0x800F0407, KEY_NUMERIC_7 }, + { 0x800F0408, KEY_NUMERIC_8 }, + { 0x800F0409, KEY_NUMERIC_9 }, + { 0x800F041D, KEY_NUMERIC_STAR }, + { 0x800F041C, KEY_NUMERIC_POUND }, + { 0x800F0410, KEY_VOLUMEUP }, + { 0x800F0411, KEY_VOLUMEDOWN }, + { 0x800F0412, KEY_CHANNELUP }, + { 0x800F0413, KEY_CHANNELDOWN }, + { 0x800F040E, KEY_MUTE }, + { 0x800F040D, KEY_VENDOR }, /* Vista Logo Key */ + { 0x800F041E, KEY_UP }, + { 0x800F041F, KEY_DOWN }, + { 0x800F0420, KEY_LEFT }, + { 0x800F0421, KEY_RIGHT }, + { 0x800F0422, KEY_OK }, + { 0x800F0423, KEY_ESC }, + { 0x800F040F, KEY_INFO }, + { 0x800F040A, KEY_CLEAR }, + { 0x800F040B, KEY_ENTER }, + { 0x800F045B, KEY_RED }, + { 0x800F045C, KEY_GREEN }, + { 0x800F045D, KEY_YELLOW }, + { 0x800F045E, KEY_BLUE }, + { 0x800F045A, KEY_TEXT }, + { 0x800F0427, KEY_SWITCHVIDEOMODE }, + { 0x800F040C, KEY_POWER }, + { 0x800F0450, KEY_RADIO }, + { 0x800F0448, KEY_PVR }, + { 0x800F0447, KEY_AUDIO }, + { 0x800F0426, KEY_EPG }, + { 0x800F0449, KEY_CAMERA }, + { 0x800F0425, KEY_TV }, + { 0x800F044A, KEY_VIDEO }, + { 0x800F0424, KEY_DVD }, + { 0x800F0416, KEY_PLAY }, + { 0x800F0418, KEY_PAUSE }, + { 0x800F0419, KEY_STOP }, + { 0x800F0414, KEY_FASTFORWARD }, + { 0x800F041A, KEY_NEXT }, + { 0x800F041B, KEY_PREVIOUS }, + { 0x800F0415, KEY_REWIND }, + { 0x800F0417, KEY_RECORD }, +}; + +/* Registers and other state is protected by wbcir_lock */ +struct wbcir_data { + unsigned long wbase; /* Wake-Up Baseaddr */ + unsigned long ebase; /* Enhanced Func. Baseaddr */ + unsigned long sbase; /* Serial Port Baseaddr */ + unsigned int irq; /* Serial Port IRQ */ + + struct input_dev *input_dev; + struct timer_list timer_keyup; + struct led_trigger *rxtrigger; + struct led_trigger *txtrigger; + struct led_classdev led; + + u32 last_scancode; + unsigned int last_keycode; + u8 last_toggle; + u8 keypressed; + unsigned long keyup_jiffies; + unsigned int idle_count; + + /* RX irdata and parsing state */ + unsigned long irdata[30]; + unsigned int irdata_count; + unsigned int irdata_idle; + unsigned int irdata_off; + unsigned int irdata_error; + + /* Protected by keytable_lock */ + struct list_head keytable; +}; + +static enum wbcir_protocol protocol = IR_PROTOCOL_RC6; +module_param(protocol, uint, 0444); +MODULE_PARM_DESC(protocol, "IR protocol to use " + "(0 = RC5, 1 = NEC, 2 = RC6A, default)"); + +static int invert; /* default = 0 */ +module_param(invert, bool, 0444); +MODULE_PARM_DESC(invert, "Invert the signal from the IR receiver"); + +static unsigned int wake_sc = 0x800F040C; +module_param(wake_sc, uint, 0644); +MODULE_PARM_DESC(wake_sc, "Scancode of the power-on IR command"); + +static unsigned int wake_rc6mode = 6; +module_param(wake_rc6mode, uint, 0644); +MODULE_PARM_DESC(wake_rc6mode, "RC6 mode for the power-on command " + "(0 = 0, 6 = 6A, default)"); + + + +/***************************************************************************** + * + * UTILITY FUNCTIONS + * + *****************************************************************************/ + +/* Caller needs to hold wbcir_lock */ +static void +wbcir_set_bits(unsigned long addr, u8 bits, u8 mask) +{ + u8 val; + + val = inb(addr); + val = ((val & ~mask) | (bits & mask)); + outb(val, addr); +} + +/* Selects the register bank for the serial port */ +static inline void +wbcir_select_bank(struct wbcir_data *data, enum wbcir_bank bank) +{ + outb(bank, data->sbase + WBCIR_REG_SP3_BSR); +} + +static enum led_brightness +wbcir_led_brightness_get(struct led_classdev *led_cdev) +{ + struct wbcir_data *data = container_of(led_cdev, + struct wbcir_data, + led); + + if (inb(data->ebase + WBCIR_REG_ECEIR_CTS) & WBCIR_LED_ENABLE) + return LED_FULL; + else + return LED_OFF; +} + +static void +wbcir_led_brightness_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct wbcir_data *data = container_of(led_cdev, + struct wbcir_data, + led); + + wbcir_set_bits(data->ebase + WBCIR_REG_ECEIR_CTS, + brightness == LED_OFF ? 0x00 : WBCIR_LED_ENABLE, + WBCIR_LED_ENABLE); +} + +/* Manchester encodes bits to RC6 message cells (see wbcir_parse_rc6) */ +static u8 +wbcir_to_rc6cells(u8 val) +{ + u8 coded = 0x00; + int i; + + val &= 0x0F; + for (i = 0; i < 4; i++) { + if (val & 0x01) + coded |= 0x02 << (i * 2); + else + coded |= 0x01 << (i * 2); + val >>= 1; + } + + return coded; +} + + + +/***************************************************************************** + * + * INPUT FUNCTIONS + * + *****************************************************************************/ + +static unsigned int +wbcir_do_getkeycode(struct wbcir_data *data, u32 scancode) +{ + struct wbcir_keyentry *keyentry; + unsigned int keycode = KEY_RESERVED; + unsigned long flags; + + read_lock_irqsave(&keytable_lock, flags); + + list_for_each_entry(keyentry, &data->keytable, list) { + if (keyentry->key.scancode == scancode) { + keycode = keyentry->key.keycode; + break; + } + } + + read_unlock_irqrestore(&keytable_lock, flags); + return keycode; +} + +static int +wbcir_getkeycode(struct input_dev *dev, int scancode, int *keycode) +{ + struct wbcir_data *data = input_get_drvdata(dev); + + *keycode = (int)wbcir_do_getkeycode(data, (u32)scancode); + return 0; +} + +static int +wbcir_setkeycode(struct input_dev *dev, int sscancode, int keycode) +{ + struct wbcir_data *data = input_get_drvdata(dev); + struct wbcir_keyentry *keyentry; + struct wbcir_keyentry *new_keyentry; + unsigned long flags; + unsigned int old_keycode = KEY_RESERVED; + u32 scancode = (u32)sscancode; + + if (keycode < 0 || keycode > KEY_MAX) + return -EINVAL; + + new_keyentry = kmalloc(sizeof(*new_keyentry), GFP_KERNEL); + if (!new_keyentry) + return -ENOMEM; + + write_lock_irqsave(&keytable_lock, flags); + + list_for_each_entry(keyentry, &data->keytable, list) { + if (keyentry->key.scancode != scancode) + continue; + + old_keycode = keyentry->key.keycode; + keyentry->key.keycode = keycode; + + if (keyentry->key.keycode == KEY_RESERVED) { + list_del(&keyentry->list); + kfree(keyentry); + } + + break; + } + + set_bit(keycode, dev->keybit); + + if (old_keycode == KEY_RESERVED) { + new_keyentry->key.scancode = scancode; + new_keyentry->key.keycode = keycode; + list_add(&new_keyentry->list, &data->keytable); + } else { + kfree(new_keyentry); + clear_bit(old_keycode, dev->keybit); + list_for_each_entry(keyentry, &data->keytable, list) { + if (keyentry->key.keycode == old_keycode) { + set_bit(old_keycode, dev->keybit); + break; + } + } + } + + write_unlock_irqrestore(&keytable_lock, flags); + return 0; +} + +/* + * Timer function to report keyup event some time after keydown is + * reported by the ISR. + */ +static void +wbcir_keyup(unsigned long cookie) +{ + struct wbcir_data *data = (struct wbcir_data *)cookie; + unsigned long flags; + + /* + * data->keyup_jiffies is used to prevent a race condition if a + * hardware interrupt occurs at this point and the keyup timer + * event is moved further into the future as a result. + * + * The timer will then be reactivated and this function called + * again in the future. We need to exit gracefully in that case + * to allow the input subsystem to do its auto-repeat magic or + * a keyup event might follow immediately after the keydown. + */ + + spin_lock_irqsave(&wbcir_lock, flags); + + if (time_is_after_eq_jiffies(data->keyup_jiffies) && data->keypressed) { + data->keypressed = 0; + led_trigger_event(data->rxtrigger, LED_OFF); + input_report_key(data->input_dev, data->last_keycode, 0); + input_sync(data->input_dev); + } + + spin_unlock_irqrestore(&wbcir_lock, flags); +} + +static void +wbcir_keydown(struct wbcir_data *data, u32 scancode, u8 toggle) +{ + unsigned int keycode; + + /* Repeat? */ + if (data->last_scancode == scancode && + data->last_toggle == toggle && + data->keypressed) + goto set_timer; + data->last_scancode = scancode; + + /* Do we need to release an old keypress? */ + if (data->keypressed) { + input_report_key(data->input_dev, data->last_keycode, 0); + input_sync(data->input_dev); + data->keypressed = 0; + } + + /* Report scancode */ + input_event(data->input_dev, EV_MSC, MSC_SCAN, (int)scancode); + + /* Do we know this scancode? */ + keycode = wbcir_do_getkeycode(data, scancode); + if (keycode == KEY_RESERVED) + goto set_timer; + + /* Register a keypress */ + input_report_key(data->input_dev, keycode, 1); + data->keypressed = 1; + data->last_keycode = keycode; + data->last_toggle = toggle; + +set_timer: + input_sync(data->input_dev); + led_trigger_event(data->rxtrigger, + data->keypressed ? LED_FULL : LED_OFF); + data->keyup_jiffies = jiffies + msecs_to_jiffies(IR_KEYPRESS_TIMEOUT); + mod_timer(&data->timer_keyup, data->keyup_jiffies); +} + + + +/***************************************************************************** + * + * IR PARSING FUNCTIONS + * + *****************************************************************************/ + +/* Resets all irdata */ +static void +wbcir_reset_irdata(struct wbcir_data *data) +{ + memset(data->irdata, 0, sizeof(data->irdata)); + data->irdata_count = 0; + data->irdata_off = 0; + data->irdata_error = 0; +} + +/* Adds one bit of irdata */ +static void +add_irdata_bit(struct wbcir_data *data, int set) +{ + if (data->irdata_count >= sizeof(data->irdata) * 8) { + data->irdata_error = 1; + return; + } + + if (set) + __set_bit(data->irdata_count, data->irdata); + data->irdata_count++; +} + +/* Gets count bits of irdata */ +static u16 +get_bits(struct wbcir_data *data, int count) +{ + u16 val = 0x0; + + if (data->irdata_count - data->irdata_off < count) { + data->irdata_error = 1; + return 0x0; + } + + while (count > 0) { + val <<= 1; + if (test_bit(data->irdata_off, data->irdata)) + val |= 0x1; + count--; + data->irdata_off++; + } + + return val; +} + +/* Reads 16 cells and converts them to a byte */ +static u8 +wbcir_rc6cells_to_byte(struct wbcir_data *data) +{ + u16 raw = get_bits(data, 16); + u8 val = 0x00; + int bit; + + for (bit = 0; bit < 8; bit++) { + switch (raw & 0x03) { + case 0x01: + break; + case 0x02: + val |= (0x01 << bit); + break; + default: + data->irdata_error = 1; + break; + } + raw >>= 2; + } + + return val; +} + +/* Decodes a number of bits from raw RC5 data */ +static u8 +wbcir_get_rc5bits(struct wbcir_data *data, unsigned int count) +{ + u16 raw = get_bits(data, count * 2); + u8 val = 0x00; + int bit; + + for (bit = 0; bit < count; bit++) { + switch (raw & 0x03) { + case 0x01: + val |= (0x01 << bit); + break; + case 0x02: + break; + default: + data->irdata_error = 1; + break; + } + raw >>= 2; + } + + return val; +} + +static void +wbcir_parse_rc6(struct device *dev, struct wbcir_data *data) +{ + /* + * Normal bits are manchester coded as follows: + * cell0 + cell1 = logic "0" + * cell1 + cell0 = logic "1" + * + * The IR pulse has the following components: + * + * Leader - 6 * cell1 - discarded + * Gap - 2 * cell0 - discarded + * Start bit - Normal Coding - always "1" + * Mode Bit 2 - 0 - Normal Coding + * Toggle bit - Normal Coding with double bit time, + * e.g. cell0 + cell0 + cell1 + cell1 + * means logic "0". + * + * The rest depends on the mode, the following modes are known: + * + * MODE 0: + * Address Bit 7 - 0 - Normal Coding + * Command Bit 7 - 0 - Normal Coding + * + * MODE 6: + * The above Toggle Bit is used as a submode bit, 0 = A, 1 = B. + * Submode B is for pointing devices, only remotes using submode A + * are supported. + * + * Customer range bit - 0 => Customer = 7 bits, 0...127 + * 1 => Customer = 15 bits, 32768...65535 + * Customer Bits - Normal Coding + * + * Customer codes are allocated by Philips. The rest of the bits + * are customer dependent. The following is commonly used (and the + * only supported config): + * + * Toggle Bit - Normal Coding + * Address Bit 6 - 0 - Normal Coding + * Command Bit 7 - 0 - Normal Coding + * + * All modes are followed by at least 6 * cell0. + * + * MODE 0 msglen: + * 1 * 2 (start bit) + 3 * 2 (mode) + 2 * 2 (toggle) + + * 8 * 2 (address) + 8 * 2 (command) = + * 44 cells + * + * MODE 6A msglen: + * 1 * 2 (start bit) + 3 * 2 (mode) + 2 * 2 (submode) + + * 1 * 2 (customer range bit) + 7/15 * 2 (customer bits) + + * 1 * 2 (toggle bit) + 7 * 2 (address) + 8 * 2 (command) = + * 60 - 76 cells + */ + u8 mode; + u8 toggle; + u16 customer = 0x0; + u8 address; + u8 command; + u32 scancode; + + /* Leader mark */ + while (get_bits(data, 1) && !data->irdata_error) + /* Do nothing */; + + /* Leader space */ + if (get_bits(data, 1)) { + dev_dbg(dev, "RC6 - Invalid leader space\n"); + return; + } + + /* Start bit */ + if (get_bits(data, 2) != 0x02) { + dev_dbg(dev, "RC6 - Invalid start bit\n"); + return; + } + + /* Mode */ + mode = get_bits(data, 6); + switch (mode) { + case 0x15: /* 010101 = b000 */ + mode = 0; + break; + case 0x29: /* 101001 = b110 */ + mode = 6; + break; + default: + dev_dbg(dev, "RC6 - Invalid mode\n"); + return; + } + + /* Toggle bit / Submode bit */ + toggle = get_bits(data, 4); + switch (toggle) { + case 0x03: + toggle = 0; + break; + case 0x0C: + toggle = 1; + break; + default: + dev_dbg(dev, "RC6 - Toggle bit error\n"); + break; + } + + /* Customer */ + if (mode == 6) { + if (toggle != 0) { + dev_dbg(dev, "RC6B - Not Supported\n"); + return; + } + + customer = wbcir_rc6cells_to_byte(data); + + if (customer & 0x80) { + /* 15 bit customer value */ + customer <<= 8; + customer |= wbcir_rc6cells_to_byte(data); + } + } + + /* Address */ + address = wbcir_rc6cells_to_byte(data); + if (mode == 6) { + toggle = address >> 7; + address &= 0x7F; + } + + /* Command */ + command = wbcir_rc6cells_to_byte(data); + + /* Create scancode */ + scancode = command; + scancode |= address << 8; + scancode |= customer << 16; + + /* Last sanity check */ + if (data->irdata_error) { + dev_dbg(dev, "RC6 - Cell error(s)\n"); + return; + } + + dev_info(dev, "IR-RC6 ad 0x%02X cm 0x%02X cu 0x%04X " + "toggle %u mode %u scan 0x%08X\n", + address, + command, + customer, + (unsigned int)toggle, + (unsigned int)mode, + scancode); + + wbcir_keydown(data, scancode, toggle); +} + +static void +wbcir_parse_rc5(struct device *dev, struct wbcir_data *data) +{ + /* + * Bits are manchester coded as follows: + * cell1 + cell0 = logic "0" + * cell0 + cell1 = logic "1" + * (i.e. the reverse of RC6) + * + * Start bit 1 - "1" - discarded + * Start bit 2 - Must be inverted to get command bit 6 + * Toggle bit + * Address Bit 4 - 0 + * Command Bit 5 - 0 + */ + u8 toggle; + u8 address; + u8 command; + u32 scancode; + + /* Start bit 1 */ + if (!get_bits(data, 1)) { + dev_dbg(dev, "RC5 - Invalid start bit\n"); + return; + } + + /* Start bit 2 */ + if (!wbcir_get_rc5bits(data, 1)) + command = 0x40; + else + command = 0x00; + + toggle = wbcir_get_rc5bits(data, 1); + address = wbcir_get_rc5bits(data, 5); + command |= wbcir_get_rc5bits(data, 6); + scancode = address << 7 | command; + + /* Last sanity check */ + if (data->irdata_error) { + dev_dbg(dev, "RC5 - Invalid message\n"); + return; + } + + dev_dbg(dev, "IR-RC5 ad %u cm %u t %u s %u\n", + (unsigned int)address, + (unsigned int)command, + (unsigned int)toggle, + (unsigned int)scancode); + + wbcir_keydown(data, scancode, toggle); +} + +static void +wbcir_parse_nec(struct device *dev, struct wbcir_data *data) +{ + /* + * Each bit represents 560 us. + * + * Leader - 9 ms burst + * Gap - 4.5 ms silence + * Address1 bit 0 - 7 - Address 1 + * Address2 bit 0 - 7 - Address 2 + * Command1 bit 0 - 7 - Command 1 + * Command2 bit 0 - 7 - Command 2 + * + * Note the bit order! + * + * With the old NEC protocol, Address2 was the inverse of Address1 + * and Command2 was the inverse of Command1 and were used as + * an error check. + * + * With NEC extended, Address1 is the LSB of the Address and + * Address2 is the MSB, Command parsing remains unchanged. + * + * A repeat message is coded as: + * Leader - 9 ms burst + * Gap - 2.25 ms silence + * Repeat - 560 us active + */ + u8 address1; + u8 address2; + u8 command1; + u8 command2; + u16 address; + u32 scancode; + + /* Leader mark */ + while (get_bits(data, 1) && !data->irdata_error) + /* Do nothing */; + + /* Leader space */ + if (get_bits(data, 4)) { + dev_dbg(dev, "NEC - Invalid leader space\n"); + return; + } + + /* Repeat? */ + if (get_bits(data, 1)) { + if (!data->keypressed) { + dev_dbg(dev, "NEC - Stray repeat message\n"); + return; + } + + dev_dbg(dev, "IR-NEC repeat s %u\n", + (unsigned int)data->last_scancode); + + wbcir_keydown(data, data->last_scancode, data->last_toggle); + return; + } + + /* Remaining leader space */ + if (get_bits(data, 3)) { + dev_dbg(dev, "NEC - Invalid leader space\n"); + return; + } + + address1 = bitrev8(get_bits(data, 8)); + address2 = bitrev8(get_bits(data, 8)); + command1 = bitrev8(get_bits(data, 8)); + command2 = bitrev8(get_bits(data, 8)); + + /* Sanity check */ + if (data->irdata_error) { + dev_dbg(dev, "NEC - Invalid message\n"); + return; + } + + /* Check command validity */ + if (command1 != ~command2) { + dev_dbg(dev, "NEC - Command bytes mismatch\n"); + return; + } + + /* Check for extended NEC protocol */ + address = address1; + if (address1 != ~address2) + address |= address2 << 8; + + scancode = address << 8 | command1; + + dev_dbg(dev, "IR-NEC ad %u cm %u s %u\n", + (unsigned int)address, + (unsigned int)command1, + (unsigned int)scancode); + + wbcir_keydown(data, scancode, !data->last_toggle); +} + + + +/***************************************************************************** + * + * INTERRUPT FUNCTIONS + * + *****************************************************************************/ + +static irqreturn_t +wbcir_irq_handler(int irqno, void *cookie) +{ + struct pnp_dev *device = cookie; + struct wbcir_data *data = pnp_get_drvdata(device); + struct device *dev = &device->dev; + u8 status; + unsigned long flags; + u8 irdata[8]; + int i; + unsigned int hw; + + spin_lock_irqsave(&wbcir_lock, flags); + + wbcir_select_bank(data, WBCIR_BANK_0); + + status = inb(data->sbase + WBCIR_REG_SP3_EIR); + + if (!(status & (WBCIR_IRQ_RX | WBCIR_IRQ_ERR))) { + spin_unlock_irqrestore(&wbcir_lock, flags); + return IRQ_NONE; + } + + if (status & WBCIR_IRQ_ERR) + data->irdata_error = 1; + + if (!(status & WBCIR_IRQ_RX)) + goto out; + + /* Since RXHDLEV is set, at least 8 bytes are in the FIFO */ + insb(data->sbase + WBCIR_REG_SP3_RXDATA, &irdata[0], 8); + + for (i = 0; i < sizeof(irdata); i++) { + hw = hweight8(irdata[i]); + if (hw > 4) + add_irdata_bit(data, 0); + else + add_irdata_bit(data, 1); + + if (hw == 8) + data->idle_count++; + else + data->idle_count = 0; + } + + if (data->idle_count > WBCIR_MAX_IDLE_BYTES) { + /* Set RXINACTIVE... */ + outb(WBCIR_RX_DISABLE, data->sbase + WBCIR_REG_SP3_ASCR); + + /* ...and drain the FIFO */ + while (inb(data->sbase + WBCIR_REG_SP3_LSR) & WBCIR_RX_AVAIL) + inb(data->sbase + WBCIR_REG_SP3_RXDATA); + + dev_dbg(dev, "IRDATA:\n"); + for (i = 0; i < data->irdata_count; i += BITS_PER_LONG) + dev_dbg(dev, "0x%08lX\n", data->irdata[i/BITS_PER_LONG]); + + switch (protocol) { + case IR_PROTOCOL_RC5: + wbcir_parse_rc5(dev, data); + break; + case IR_PROTOCOL_RC6: + wbcir_parse_rc6(dev, data); + break; + case IR_PROTOCOL_NEC: + wbcir_parse_nec(dev, data); + break; + } + + wbcir_reset_irdata(data); + data->idle_count = 0; + } + +out: + spin_unlock_irqrestore(&wbcir_lock, flags); + return IRQ_HANDLED; +} + + + +/***************************************************************************** + * + * SUSPEND/RESUME FUNCTIONS + * + *****************************************************************************/ + +static void +wbcir_shutdown(struct pnp_dev *device) +{ + struct device *dev = &device->dev; + struct wbcir_data *data = pnp_get_drvdata(device); + int do_wake = 1; + u8 match[11]; + u8 mask[11]; + u8 rc6_csl = 0; + int i; + + memset(match, 0, sizeof(match)); + memset(mask, 0, sizeof(mask)); + + if (wake_sc == INVALID_SCANCODE || !device_may_wakeup(dev)) { + do_wake = 0; + goto finish; + } + + switch (protocol) { + case IR_PROTOCOL_RC5: + if (wake_sc > 0xFFF) { + do_wake = 0; + dev_err(dev, "RC5 - Invalid wake scancode\n"); + break; + } + + /* Mask = 13 bits, ex toggle */ + mask[0] = 0xFF; + mask[1] = 0x17; + + match[0] = (wake_sc & 0x003F); /* 6 command bits */ + match[0] |= (wake_sc & 0x0180) >> 1; /* 2 address bits */ + match[1] = (wake_sc & 0x0E00) >> 9; /* 3 address bits */ + if (!(wake_sc & 0x0040)) /* 2nd start bit */ + match[1] |= 0x10; + + break; + + case IR_PROTOCOL_NEC: + if (wake_sc > 0xFFFFFF) { + do_wake = 0; + dev_err(dev, "NEC - Invalid wake scancode\n"); + break; + } + + mask[0] = mask[1] = mask[2] = mask[3] = 0xFF; + + match[1] = bitrev8((wake_sc & 0xFF)); + match[0] = ~match[1]; + + match[3] = bitrev8((wake_sc & 0xFF00) >> 8); + if (wake_sc > 0xFFFF) + match[2] = bitrev8((wake_sc & 0xFF0000) >> 16); + else + match[2] = ~match[3]; + + break; + + case IR_PROTOCOL_RC6: + + if (wake_rc6mode == 0) { + if (wake_sc > 0xFFFF) { + do_wake = 0; + dev_err(dev, "RC6 - Invalid wake scancode\n"); + break; + } + + /* Command */ + match[0] = wbcir_to_rc6cells(wake_sc >> 0); + mask[0] = 0xFF; + match[1] = wbcir_to_rc6cells(wake_sc >> 4); + mask[1] = 0xFF; + + /* Address */ + match[2] = wbcir_to_rc6cells(wake_sc >> 8); + mask[2] = 0xFF; + match[3] = wbcir_to_rc6cells(wake_sc >> 12); + mask[3] = 0xFF; + + /* Header */ + match[4] = 0x50; /* mode1 = mode0 = 0, ignore toggle */ + mask[4] = 0xF0; + match[5] = 0x09; /* start bit = 1, mode2 = 0 */ + mask[5] = 0x0F; + + rc6_csl = 44; + + } else if (wake_rc6mode == 6) { + i = 0; + + /* Command */ + match[i] = wbcir_to_rc6cells(wake_sc >> 0); + mask[i++] = 0xFF; + match[i] = wbcir_to_rc6cells(wake_sc >> 4); + mask[i++] = 0xFF; + + /* Address + Toggle */ + match[i] = wbcir_to_rc6cells(wake_sc >> 8); + mask[i++] = 0xFF; + match[i] = wbcir_to_rc6cells(wake_sc >> 12); + mask[i++] = 0x3F; + + /* Customer bits 7 - 0 */ + match[i] = wbcir_to_rc6cells(wake_sc >> 16); + mask[i++] = 0xFF; + match[i] = wbcir_to_rc6cells(wake_sc >> 20); + mask[i++] = 0xFF; + + if (wake_sc & 0x80000000) { + /* Customer range bit and bits 15 - 8 */ + match[i] = wbcir_to_rc6cells(wake_sc >> 24); + mask[i++] = 0xFF; + match[i] = wbcir_to_rc6cells(wake_sc >> 28); + mask[i++] = 0xFF; + rc6_csl = 76; + } else if (wake_sc <= 0x007FFFFF) { + rc6_csl = 60; + } else { + do_wake = 0; + dev_err(dev, "RC6 - Invalid wake scancode\n"); + break; + } + + /* Header */ + match[i] = 0x93; /* mode1 = mode0 = 1, submode = 0 */ + mask[i++] = 0xFF; + match[i] = 0x0A; /* start bit = 1, mode2 = 1 */ + mask[i++] = 0x0F; + + } else { + do_wake = 0; + dev_err(dev, "RC6 - Invalid wake mode\n"); + } + + break; + + default: + do_wake = 0; + break; + } + +finish: + if (do_wake) { + /* Set compare and compare mask */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_INDEX, + WBCIR_REGSEL_COMPARE | WBCIR_REG_ADDR0, + 0x3F); + outsb(data->wbase + WBCIR_REG_WCEIR_DATA, match, 11); + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_INDEX, + WBCIR_REGSEL_MASK | WBCIR_REG_ADDR0, + 0x3F); + outsb(data->wbase + WBCIR_REG_WCEIR_DATA, mask, 11); + + /* RC6 Compare String Len */ + outb(rc6_csl, data->wbase + WBCIR_REG_WCEIR_CSL); + + /* Clear status bits NEC_REP, BUFF, MSG_END, MATCH */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_STS, 0x17, 0x17); + + /* Clear BUFF_EN, Clear END_EN, Set MATCH_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_EV_EN, 0x01, 0x07); + + /* Set CEIR_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_CTL, 0x01, 0x01); + + } else { + /* Clear BUFF_EN, Clear END_EN, Clear MATCH_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_EV_EN, 0x00, 0x07); + + /* Clear CEIR_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_CTL, 0x00, 0x01); + } + + /* Disable interrupts */ + outb(WBCIR_IRQ_NONE, data->sbase + WBCIR_REG_SP3_IER); +} + +static int +wbcir_suspend(struct pnp_dev *device, pm_message_t state) +{ + wbcir_shutdown(device); + return 0; +} + +static int +wbcir_resume(struct pnp_dev *device) +{ + struct wbcir_data *data = pnp_get_drvdata(device); + + /* Clear BUFF_EN, Clear END_EN, Clear MATCH_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_EV_EN, 0x00, 0x07); + + /* Clear CEIR_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_CTL, 0x00, 0x01); + + /* Enable interrupts */ + wbcir_reset_irdata(data); + outb(WBCIR_IRQ_RX | WBCIR_IRQ_ERR, data->sbase + WBCIR_REG_SP3_IER); + + return 0; +} + + + +/***************************************************************************** + * + * SETUP/INIT FUNCTIONS + * + *****************************************************************************/ + +static void +wbcir_cfg_ceir(struct wbcir_data *data) +{ + u8 tmp; + + /* Set PROT_SEL, RX_INV, Clear CEIR_EN (needed for the led) */ + tmp = protocol << 4; + if (invert) + tmp |= 0x08; + outb(tmp, data->wbase + WBCIR_REG_WCEIR_CTL); + + /* Clear status bits NEC_REP, BUFF, MSG_END, MATCH */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_STS, 0x17, 0x17); + + /* Clear BUFF_EN, Clear END_EN, Clear MATCH_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_EV_EN, 0x00, 0x07); + + /* Set RC5 cell time to correspond to 36 kHz */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_CFG1, 0x4A, 0x7F); + + /* Set IRTX_INV */ + if (invert) + outb(0x04, data->ebase + WBCIR_REG_ECEIR_CCTL); + else + outb(0x00, data->ebase + WBCIR_REG_ECEIR_CCTL); + + /* + * Clear IR LED, set SP3 clock to 24Mhz + * set SP3_IRRX_SW to binary 01, helpfully not documented + */ + outb(0x10, data->ebase + WBCIR_REG_ECEIR_CTS); +} + +static int __devinit +wbcir_probe(struct pnp_dev *device, const struct pnp_device_id *dev_id) +{ + struct device *dev = &device->dev; + struct wbcir_data *data; + int err; + + if (!(pnp_port_len(device, 0) == EHFUNC_IOMEM_LEN && + pnp_port_len(device, 1) == WAKEUP_IOMEM_LEN && + pnp_port_len(device, 2) == SP_IOMEM_LEN)) { + dev_err(dev, "Invalid resources\n"); + return -ENODEV; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + err = -ENOMEM; + goto exit; + } + + pnp_set_drvdata(device, data); + + data->ebase = pnp_port_start(device, 0); + data->wbase = pnp_port_start(device, 1); + data->sbase = pnp_port_start(device, 2); + data->irq = pnp_irq(device, 0); + + if (data->wbase == 0 || data->ebase == 0 || + data->sbase == 0 || data->irq == 0) { + err = -ENODEV; + dev_err(dev, "Invalid resources\n"); + goto exit_free_data; + } + + dev_dbg(&device->dev, "Found device " + "(w: 0x%lX, e: 0x%lX, s: 0x%lX, i: %u)\n", + data->wbase, data->ebase, data->sbase, data->irq); + + if (!request_region(data->wbase, WAKEUP_IOMEM_LEN, DRVNAME)) { + dev_err(dev, "Region 0x%lx-0x%lx already in use!\n", + data->wbase, data->wbase + WAKEUP_IOMEM_LEN - 1); + err = -EBUSY; + goto exit_free_data; + } + + if (!request_region(data->ebase, EHFUNC_IOMEM_LEN, DRVNAME)) { + dev_err(dev, "Region 0x%lx-0x%lx already in use!\n", + data->ebase, data->ebase + EHFUNC_IOMEM_LEN - 1); + err = -EBUSY; + goto exit_release_wbase; + } + + if (!request_region(data->sbase, SP_IOMEM_LEN, DRVNAME)) { + dev_err(dev, "Region 0x%lx-0x%lx already in use!\n", + data->sbase, data->sbase + SP_IOMEM_LEN - 1); + err = -EBUSY; + goto exit_release_ebase; + } + + err = request_irq(data->irq, wbcir_irq_handler, + IRQF_DISABLED, DRVNAME, device); + if (err) { + dev_err(dev, "Failed to claim IRQ %u\n", data->irq); + err = -EBUSY; + goto exit_release_sbase; + } + + led_trigger_register_simple("cir-tx", &data->txtrigger); + if (!data->txtrigger) { + err = -ENOMEM; + goto exit_free_irq; + } + + led_trigger_register_simple("cir-rx", &data->rxtrigger); + if (!data->rxtrigger) { + err = -ENOMEM; + goto exit_unregister_txtrigger; + } + + data->led.name = "cir::activity"; + data->led.default_trigger = "cir-rx"; + data->led.brightness_set = wbcir_led_brightness_set; + data->led.brightness_get = wbcir_led_brightness_get; + err = led_classdev_register(&device->dev, &data->led); + if (err) + goto exit_unregister_rxtrigger; + + data->input_dev = input_allocate_device(); + if (!data->input_dev) { + err = -ENOMEM; + goto exit_unregister_led; + } + + data->input_dev->evbit[0] = BIT(EV_KEY); + data->input_dev->name = WBCIR_NAME; + data->input_dev->phys = "wbcir/cir0"; + data->input_dev->id.bustype = BUS_HOST; + data->input_dev->id.vendor = PCI_VENDOR_ID_WINBOND; + data->input_dev->id.product = WBCIR_ID_FAMILY; + data->input_dev->id.version = WBCIR_ID_CHIP; + data->input_dev->getkeycode = wbcir_getkeycode; + data->input_dev->setkeycode = wbcir_setkeycode; + input_set_capability(data->input_dev, EV_MSC, MSC_SCAN); + input_set_drvdata(data->input_dev, data); + + err = input_register_device(data->input_dev); + if (err) + goto exit_free_input; + + data->last_scancode = INVALID_SCANCODE; + INIT_LIST_HEAD(&data->keytable); + setup_timer(&data->timer_keyup, wbcir_keyup, (unsigned long)data); + + /* Load default keymaps */ + if (protocol == IR_PROTOCOL_RC6) { + int i; + for (i = 0; i < ARRAY_SIZE(rc6_def_keymap); i++) { + err = wbcir_setkeycode(data->input_dev, + (int)rc6_def_keymap[i].scancode, + (int)rc6_def_keymap[i].keycode); + if (err) + goto exit_unregister_keys; + } + } + + device_init_wakeup(&device->dev, 1); + + wbcir_cfg_ceir(data); + + /* Disable interrupts */ + wbcir_select_bank(data, WBCIR_BANK_0); + outb(WBCIR_IRQ_NONE, data->sbase + WBCIR_REG_SP3_IER); + + /* Enable extended mode */ + wbcir_select_bank(data, WBCIR_BANK_2); + outb(WBCIR_EXT_ENABLE, data->sbase + WBCIR_REG_SP3_EXCR1); + + /* + * Configure baud generator, IR data will be sampled at + * a bitrate of: (24Mhz * prescaler) / (divisor * 16). + * + * The ECIR registers include a flag to change the + * 24Mhz clock freq to 48Mhz. + * + * It's not documented in the specs, but fifo levels + * other than 16 seems to be unsupported. + */ + + /* prescaler 1.0, tx/rx fifo lvl 16 */ + outb(0x30, data->sbase + WBCIR_REG_SP3_EXCR2); + + /* Set baud divisor to generate one byte per bit/cell */ + switch (protocol) { + case IR_PROTOCOL_RC5: + outb(0xA7, data->sbase + WBCIR_REG_SP3_BGDL); + break; + case IR_PROTOCOL_RC6: + outb(0x53, data->sbase + WBCIR_REG_SP3_BGDL); + break; + case IR_PROTOCOL_NEC: + outb(0x69, data->sbase + WBCIR_REG_SP3_BGDL); + break; + } + outb(0x00, data->sbase + WBCIR_REG_SP3_BGDH); + + /* Set CEIR mode */ + wbcir_select_bank(data, WBCIR_BANK_0); + outb(0xC0, data->sbase + WBCIR_REG_SP3_MCR); + inb(data->sbase + WBCIR_REG_SP3_LSR); /* Clear LSR */ + inb(data->sbase + WBCIR_REG_SP3_MSR); /* Clear MSR */ + + /* Disable RX demod, run-length encoding/decoding, set freq span */ + wbcir_select_bank(data, WBCIR_BANK_7); + outb(0x10, data->sbase + WBCIR_REG_SP3_RCCFG); + + /* Disable timer */ + wbcir_select_bank(data, WBCIR_BANK_4); + outb(0x00, data->sbase + WBCIR_REG_SP3_IRCR1); + + /* Enable MSR interrupt, Clear AUX_IRX */ + wbcir_select_bank(data, WBCIR_BANK_5); + outb(0x00, data->sbase + WBCIR_REG_SP3_IRCR2); + + /* Disable CRC */ + wbcir_select_bank(data, WBCIR_BANK_6); + outb(0x20, data->sbase + WBCIR_REG_SP3_IRCR3); + + /* Set RX/TX (de)modulation freq, not really used */ + wbcir_select_bank(data, WBCIR_BANK_7); + outb(0xF2, data->sbase + WBCIR_REG_SP3_IRRXDC); + outb(0x69, data->sbase + WBCIR_REG_SP3_IRTXMC); + + /* Set invert and pin direction */ + if (invert) + outb(0x10, data->sbase + WBCIR_REG_SP3_IRCFG4); + else + outb(0x00, data->sbase + WBCIR_REG_SP3_IRCFG4); + + /* Set FIFO thresholds (RX = 8, TX = 3), reset RX/TX */ + wbcir_select_bank(data, WBCIR_BANK_0); + outb(0x97, data->sbase + WBCIR_REG_SP3_FCR); + + /* Clear AUX status bits */ + outb(0xE0, data->sbase + WBCIR_REG_SP3_ASCR); + + /* Enable interrupts */ + outb(WBCIR_IRQ_RX | WBCIR_IRQ_ERR, data->sbase + WBCIR_REG_SP3_IER); + + return 0; + +exit_unregister_keys: + if (!list_empty(&data->keytable)) { + struct wbcir_keyentry *key; + struct wbcir_keyentry *keytmp; + + list_for_each_entry_safe(key, keytmp, &data->keytable, list) { + list_del(&key->list); + kfree(key); + } + } + input_unregister_device(data->input_dev); + /* Can't call input_free_device on an unregistered device */ + data->input_dev = NULL; +exit_free_input: + input_free_device(data->input_dev); +exit_unregister_led: + led_classdev_unregister(&data->led); +exit_unregister_rxtrigger: + led_trigger_unregister_simple(data->rxtrigger); +exit_unregister_txtrigger: + led_trigger_unregister_simple(data->txtrigger); +exit_free_irq: + free_irq(data->irq, device); +exit_release_sbase: + release_region(data->sbase, SP_IOMEM_LEN); +exit_release_ebase: + release_region(data->ebase, EHFUNC_IOMEM_LEN); +exit_release_wbase: + release_region(data->wbase, WAKEUP_IOMEM_LEN); +exit_free_data: + kfree(data); + pnp_set_drvdata(device, NULL); +exit: + return err; +} + +static void __devexit +wbcir_remove(struct pnp_dev *device) +{ + struct wbcir_data *data = pnp_get_drvdata(device); + struct wbcir_keyentry *key; + struct wbcir_keyentry *keytmp; + + /* Disable interrupts */ + wbcir_select_bank(data, WBCIR_BANK_0); + outb(WBCIR_IRQ_NONE, data->sbase + WBCIR_REG_SP3_IER); + + del_timer_sync(&data->timer_keyup); + + free_irq(data->irq, device); + + /* Clear status bits NEC_REP, BUFF, MSG_END, MATCH */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_STS, 0x17, 0x17); + + /* Clear CEIR_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_CTL, 0x00, 0x01); + + /* Clear BUFF_EN, END_EN, MATCH_EN */ + wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_EV_EN, 0x00, 0x07); + + /* This will generate a keyup event if necessary */ + input_unregister_device(data->input_dev); + + led_trigger_unregister_simple(data->rxtrigger); + led_trigger_unregister_simple(data->txtrigger); + led_classdev_unregister(&data->led); + + /* This is ok since &data->led isn't actually used */ + wbcir_led_brightness_set(&data->led, LED_OFF); + + release_region(data->wbase, WAKEUP_IOMEM_LEN); + release_region(data->ebase, EHFUNC_IOMEM_LEN); + release_region(data->sbase, SP_IOMEM_LEN); + + list_for_each_entry_safe(key, keytmp, &data->keytable, list) { + list_del(&key->list); + kfree(key); + } + + kfree(data); + + pnp_set_drvdata(device, NULL); +} + +static const struct pnp_device_id wbcir_ids[] = { + { "WEC1022", 0 }, + { "", 0 } +}; +MODULE_DEVICE_TABLE(pnp, wbcir_ids); + +static struct pnp_driver wbcir_driver = { + .name = WBCIR_NAME, + .id_table = wbcir_ids, + .probe = wbcir_probe, + .remove = __devexit_p(wbcir_remove), + .suspend = wbcir_suspend, + .resume = wbcir_resume, + .shutdown = wbcir_shutdown +}; + +static int __init +wbcir_init(void) +{ + int ret; + + switch (protocol) { + case IR_PROTOCOL_RC5: + case IR_PROTOCOL_NEC: + case IR_PROTOCOL_RC6: + break; + default: + printk(KERN_ERR DRVNAME ": Invalid protocol argument\n"); + return -EINVAL; + } + + ret = pnp_register_driver(&wbcir_driver); + if (ret) + printk(KERN_ERR DRVNAME ": Unable to register driver\n"); + + return ret; +} + +static void __exit +wbcir_exit(void) +{ + pnp_unregister_driver(&wbcir_driver); +} + +MODULE_AUTHOR("David Härdeman <david@hardeman.nu>"); +MODULE_DESCRIPTION("Winbond SuperI/O Consumer IR Driver"); +MODULE_LICENSE("GPL"); + +module_init(wbcir_init); +module_exit(wbcir_exit); + + diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c index bff72d81f26..9f8f67b6c07 100644 --- a/drivers/isdn/capi/capifs.c +++ b/drivers/isdn/capi/capifs.c @@ -89,7 +89,7 @@ static int capifs_remount(struct super_block *s, int *flags, char *data) return 0; } -static struct super_operations capifs_sops = +static const struct super_operations capifs_sops = { .statfs = simple_statfs, .remount_fs = capifs_remount, diff --git a/drivers/isdn/capi/capiutil.c b/drivers/isdn/capi/capiutil.c index 16f2e465e5f..26626eead82 100644 --- a/drivers/isdn/capi/capiutil.c +++ b/drivers/isdn/capi/capiutil.c @@ -1019,7 +1019,7 @@ int __init cdebug_init(void) if (!g_debbuf->buf) { kfree(g_cmsg); kfree(g_debbuf); - return -ENOMEM;; + return -ENOMEM; } g_debbuf->size = CDEBUG_GSIZE; g_debbuf->buf[0] = 0; diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c index 7188c59a76f..adb1e8c36b4 100644 --- a/drivers/isdn/i4l/isdn_common.c +++ b/drivers/isdn/i4l/isdn_common.c @@ -761,7 +761,7 @@ isdn_getnum(char **p) * Be aware that this is not an atomic operation when sleep != 0, even though * interrupts are turned off! Well, like that we are currently only called * on behalf of a read system call on raw device files (which are documented - * to be dangerous and for for debugging purpose only). The inode semaphore + * to be dangerous and for debugging purpose only). The inode semaphore * takes care that this is not called for the same minor device number while * we are sleeping, but access is not serialized against simultaneous read() * from the corresponding ttyI device. Can other ugly events, like changes @@ -873,7 +873,7 @@ isdn_readbchan(int di, int channel, u_char * buf, u_char * fp, int len, wait_que * Be aware that this is not an atomic operation when sleep != 0, even though * interrupts are turned off! Well, like that we are currently only called * on behalf of a read system call on raw device files (which are documented - * to be dangerous and for for debugging purpose only). The inode semaphore + * to be dangerous and for debugging purpose only). The inode semaphore * takes care that this is not called for the same minor device number while * we are sleeping, but access is not serialized against simultaneous read() * from the corresponding ttyI device. Can other ugly events, like changes diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a8d0aee3bc0..8aaad65c3bb 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -894,7 +894,7 @@ void guest_set_pte(struct lg_cpu *cpu, * tells us they've changed. When the Guest tries to use the new entry it will * fault and demand_page() will fix it up. * - * So with that in mind here's our code to to update a (top-level) PGD entry: + * So with that in mind here's our code to update a (top-level) PGD entry: */ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index a98ab72adf9..93fb32038b1 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c @@ -274,7 +274,7 @@ static void __devinit rackmeter_init_cpu_sniffer(struct rackmeter *rm) if (cpu > 1) continue; - rcpu = &rm->cpu[cpu];; + rcpu = &rm->cpu[cpu]; rcpu->prev_idle = get_cpu_idle_time(cpu); rcpu->prev_wall = jiffies64_to_cputime64(get_jiffies_64()); schedule_delayed_work_on(cpu, &rm->cpu[cpu].sniffer, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index eee28fac210..376f1ab48a2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1716,7 +1716,7 @@ out: return r; } -static struct block_device_operations dm_blk_dops; +static const struct block_device_operations dm_blk_dops; static void dm_wq_work(struct work_struct *work); @@ -2663,7 +2663,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) kfree(pools); } -static struct block_device_operations dm_blk_dops = { +static const struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, diff --git a/drivers/md/md.c b/drivers/md/md.c index 9dd872000ce..6aa497e4baf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -138,7 +138,7 @@ static ctl_table raid_root_table[] = { { .ctl_name = 0 } }; -static struct block_device_operations md_fops; +static const struct block_device_operations md_fops; static int start_readonly; @@ -5556,7 +5556,7 @@ static int md_revalidate(struct gendisk *disk) mddev->changed = 0; return 0; } -static struct block_device_operations md_fops = +static const struct block_device_operations md_fops = { .owner = THIS_MODULE, .open = md_open, diff --git a/drivers/md/md.h b/drivers/md/md.h index f8fc188bc76..f55d2ff9513 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -201,7 +201,7 @@ struct mddev_s * INTR: resync needs to be aborted for some reason * DONE: thread is done and is waiting to be reaped * REQUEST: user-space has requested a sync (used with SYNC) - * CHECK: user-space request for for check-only, no repair + * CHECK: user-space request for check-only, no repair * RESHAPE: A reshape is happening * * If neither SYNC or RESHAPE are set, then it is a recovery. diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 89e76819f61..d2d3fd54cc6 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -150,6 +150,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) } mp_bh = mempool_alloc(conf->pool, GFP_NOIO); + memset(mp_bh, 0, sizeof(*mp_bh)); mp_bh->master_bio = bio; mp_bh->mddev = mddev; @@ -493,7 +494,7 @@ static int multipath_run (mddev_t *mddev) } mddev->degraded = conf->raid_disks - conf->working_disks; - conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS, + conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); if (conf->pool == NULL) { printk(KERN_ERR diff --git a/drivers/media/dvb/pt1/pt1.c b/drivers/media/dvb/pt1/pt1.c index 8ffbcecad93..81e623a90f0 100644 --- a/drivers/media/dvb/pt1/pt1.c +++ b/drivers/media/dvb/pt1/pt1.c @@ -23,6 +23,7 @@ #include <linux/kernel.h> #include <linux/module.h> +#include <linux/vmalloc.h> #include <linux/pci.h> #include <linux/kthread.h> #include <linux/freezer.h> diff --git a/drivers/media/dvb/siano/smscoreapi.c b/drivers/media/dvb/siano/smscoreapi.c index bd9ab9d0d12..fa6a62369a7 100644 --- a/drivers/media/dvb/siano/smscoreapi.c +++ b/drivers/media/dvb/siano/smscoreapi.c @@ -1367,7 +1367,7 @@ int smscore_set_gpio(struct smscore_device_t *coredev, u32 pin, int level) &msg, sizeof(msg)); } -/* new GPIO managment implementation */ +/* new GPIO management implementation */ static int GetGpioPinParams(u32 PinNum, u32 *pTranslatedPinNum, u32 *pGroupNum, u32 *pGroupCfg) { diff --git a/drivers/media/dvb/siano/smscoreapi.h b/drivers/media/dvb/siano/smscoreapi.h index f1108c64e89..eec18aaf551 100644 --- a/drivers/media/dvb/siano/smscoreapi.h +++ b/drivers/media/dvb/siano/smscoreapi.h @@ -657,12 +657,12 @@ struct smscore_buffer_t *smscore_getbuffer(struct smscore_device_t *coredev); extern void smscore_putbuffer(struct smscore_device_t *coredev, struct smscore_buffer_t *cb); -/* old GPIO managment */ +/* old GPIO management */ int smscore_configure_gpio(struct smscore_device_t *coredev, u32 pin, struct smscore_config_gpio *pinconfig); int smscore_set_gpio(struct smscore_device_t *coredev, u32 pin, int level); -/* new GPIO managment */ +/* new GPIO management */ extern int smscore_gpio_configure(struct smscore_device_t *coredev, u8 PinNum, struct smscore_gpio_config *pGpioConfig); extern int smscore_gpio_set_level(struct smscore_device_t *coredev, u8 PinNum, diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c index 575bf9d8941..a1239083472 100644 --- a/drivers/media/radio/radio-mr800.c +++ b/drivers/media/radio/radio-mr800.c @@ -46,7 +46,7 @@ * Version 0.11: Converted to v4l2_device. * * Many things to do: - * - Correct power managment of device (suspend & resume) + * - Correct power management of device (suspend & resume) * - Add code for scanning and smooth tuning * - Add code for sensitivity value * - Correct mistakes diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c index 356d6896da3..fbdc1cde56a 100644 --- a/drivers/media/video/cx88/cx88-blackbird.c +++ b/drivers/media/video/cx88/cx88-blackbird.c @@ -1371,7 +1371,7 @@ static struct cx8802_driver cx8802_blackbird_driver = { .advise_release = cx8802_blackbird_advise_release, }; -static int blackbird_init(void) +static int __init blackbird_init(void) { printk(KERN_INFO "cx2388x blackbird driver version %d.%d.%d loaded\n", (CX88_VERSION_CODE >> 16) & 0xff, @@ -1384,7 +1384,7 @@ static int blackbird_init(void) return cx8802_register_driver(&cx8802_blackbird_driver); } -static void blackbird_fini(void) +static void __exit blackbird_fini(void) { cx8802_unregister_driver(&cx8802_blackbird_driver); } diff --git a/drivers/media/video/cx88/cx88-dvb.c b/drivers/media/video/cx88/cx88-dvb.c index 6e5d142b5b0..518bcfe18bc 100644 --- a/drivers/media/video/cx88/cx88-dvb.c +++ b/drivers/media/video/cx88/cx88-dvb.c @@ -1350,7 +1350,7 @@ static struct cx8802_driver cx8802_dvb_driver = { .advise_release = cx8802_dvb_advise_release, }; -static int dvb_init(void) +static int __init dvb_init(void) { printk(KERN_INFO "cx88/2: cx2388x dvb driver version %d.%d.%d loaded\n", (CX88_VERSION_CODE >> 16) & 0xff, @@ -1363,7 +1363,7 @@ static int dvb_init(void) return cx8802_register_driver(&cx8802_dvb_driver); } -static void dvb_fini(void) +static void __exit dvb_fini(void) { cx8802_unregister_driver(&cx8802_dvb_driver); } diff --git a/drivers/media/video/cx88/cx88-mpeg.c b/drivers/media/video/cx88/cx88-mpeg.c index 7172dcf2a4f..de9ff0fc741 100644 --- a/drivers/media/video/cx88/cx88-mpeg.c +++ b/drivers/media/video/cx88/cx88-mpeg.c @@ -870,7 +870,7 @@ static struct pci_driver cx8802_pci_driver = { .remove = __devexit_p(cx8802_remove), }; -static int cx8802_init(void) +static int __init cx8802_init(void) { printk(KERN_INFO "cx88/2: cx2388x MPEG-TS Driver Manager version %d.%d.%d loaded\n", (CX88_VERSION_CODE >> 16) & 0xff, @@ -883,7 +883,7 @@ static int cx8802_init(void) return pci_register_driver(&cx8802_pci_driver); } -static void cx8802_fini(void) +static void __exit cx8802_fini(void) { pci_unregister_driver(&cx8802_pci_driver); } diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c index 81d2b5dea18..57e6b124109 100644 --- a/drivers/media/video/cx88/cx88-video.c +++ b/drivers/media/video/cx88/cx88-video.c @@ -2113,7 +2113,7 @@ static struct pci_driver cx8800_pci_driver = { #endif }; -static int cx8800_init(void) +static int __init cx8800_init(void) { printk(KERN_INFO "cx88/0: cx2388x v4l2 driver version %d.%d.%d loaded\n", (CX88_VERSION_CODE >> 16) & 0xff, @@ -2126,7 +2126,7 @@ static int cx8800_init(void) return pci_register_driver(&cx8800_pci_driver); } -static void cx8800_fini(void) +static void __exit cx8800_fini(void) { pci_unregister_driver(&cx8800_pci_driver); } diff --git a/drivers/media/video/gspca/m5602/m5602_core.c b/drivers/media/video/gspca/m5602/m5602_core.c index 8a5bba16ff3..7f1e5415850 100644 --- a/drivers/media/video/gspca/m5602/m5602_core.c +++ b/drivers/media/video/gspca/m5602/m5602_core.c @@ -56,7 +56,7 @@ int m5602_read_bridge(struct sd *sd, const u8 address, u8 *i2c_data) return (err < 0) ? err : 0; } -/* Writes a byte to to the m5602 */ +/* Writes a byte to the m5602 */ int m5602_write_bridge(struct sd *sd, const u8 address, const u8 i2c_data) { int err; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 7847bbc1440..bd83fa0a497 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -235,7 +235,7 @@ static int mspro_block_bd_getgeo(struct block_device *bdev, return 0; } -static struct block_device_operations ms_block_bdops = { +static const struct block_device_operations ms_block_bdops = { .open = mspro_block_bd_open, .release = mspro_block_bd_release, .getgeo = mspro_block_bd_getgeo, diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 76fa2ee0b57..610e914abe6 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -6821,7 +6821,7 @@ mpt_print_ioc_summary(MPT_ADAPTER *ioc, char *buffer, int *size, int len, int sh *size = y; } /** - * mpt_set_taskmgmt_in_progress_flag - set flags associated with task managment + * mpt_set_taskmgmt_in_progress_flag - set flags associated with task management * @ioc: Pointer to MPT_ADAPTER structure * * Returns 0 for SUCCESS or -1 if FAILED. @@ -6854,7 +6854,7 @@ mpt_set_taskmgmt_in_progress_flag(MPT_ADAPTER *ioc) EXPORT_SYMBOL(mpt_set_taskmgmt_in_progress_flag); /** - * mpt_clear_taskmgmt_in_progress_flag - clear flags associated with task managment + * mpt_clear_taskmgmt_in_progress_flag - clear flags associated with task management * @ioc: Pointer to MPT_ADAPTER structure * **/ diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 335d4c78a77..d505b68cd37 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -925,7 +925,7 @@ static void i2o_block_request_fn(struct request_queue *q) }; /* I2O Block device operations definition */ -static struct block_device_operations i2o_block_fops = { +static const struct block_device_operations i2o_block_fops = { .owner = THIS_MODULE, .open = i2o_block_open, .release = i2o_block_release, diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index c533f86ff5e..5447da16a17 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -647,7 +647,7 @@ struct ab3100_init_setting { u8 setting; }; -static const struct ab3100_init_setting __initdata +static const struct ab3100_init_setting __initconst ab3100_init_settings[] = { { .abreg = AB3100_MCA, diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index de966a6fb7e..aecf40ecb3a 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -97,7 +97,7 @@ static int ibmasmfs_get_super(struct file_system_type *fst, return get_sb_single(fst, flags, data, ibmasmfs_fill_super, mnt); } -static struct super_operations ibmasmfs_s_ops = { +static const struct super_operations ibmasmfs_s_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index adc205c49fb..85f0e8cd875 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -130,7 +130,7 @@ mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static struct block_device_operations mmc_bdops = { +static const struct block_device_operations mmc_bdops = { .open = mmc_blk_open, .release = mmc_blk_release, .getgeo = mmc_blk_getgeo, diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index bc14bb1b057..88671529c45 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -512,7 +512,7 @@ static void mxcmci_cmd_done(struct mxcmci_host *host, unsigned int stat) } /* For the DMA case the DMA engine handles the data transfer - * automatically. For non DMA we have to to it ourselves. + * automatically. For non DMA we have to do it ourselves. * Don't do it in interrupt context though. */ if (!mxcmci_use_dma(host) && host->data) diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 10ed195c0c1..eb495d83064 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -776,13 +776,13 @@ static struct spi_driver m25p80_driver = { }; -static int m25p80_init(void) +static int __init m25p80_init(void) { return spi_register_driver(&m25p80_driver); } -static void m25p80_exit(void) +static void __exit m25p80_exit(void) { spi_unregister_driver(&m25p80_driver); } diff --git a/drivers/mtd/devices/slram.c b/drivers/mtd/devices/slram.c index 00248e81ecd..7d846e9173d 100644 --- a/drivers/mtd/devices/slram.c +++ b/drivers/mtd/devices/slram.c @@ -303,7 +303,7 @@ __setup("slram=", mtd_slram_setup); #endif -static int init_slram(void) +static int __init init_slram(void) { char *devname; int i; diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index a790c062af1..e56d6b42f02 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -1099,7 +1099,7 @@ static struct mtd_blktrans_ops ftl_tr = { .owner = THIS_MODULE, }; -static int init_ftl(void) +static int __init init_ftl(void) { return register_mtd_blktrans(&ftl_tr); } diff --git a/drivers/mtd/maps/ixp2000.c b/drivers/mtd/maps/ixp2000.c index d4fb9a3ab4d..1bdf0ee6d0b 100644 --- a/drivers/mtd/maps/ixp2000.c +++ b/drivers/mtd/maps/ixp2000.c @@ -184,7 +184,7 @@ static int ixp2000_flash_probe(struct platform_device *dev) info->map.bankwidth = 1; /* - * map_priv_2 is used to store a ptr to to the bank_setup routine + * map_priv_2 is used to store a ptr to the bank_setup routine */ info->map.map_priv_2 = (unsigned long) ixp_data->bank_setup; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 7baba40c1ed..0acbf4f5be5 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -210,7 +210,7 @@ static int blktrans_ioctl(struct block_device *bdev, fmode_t mode, } } -static struct block_device_operations mtd_blktrans_ops = { +static const struct block_device_operations mtd_blktrans_ops = { .owner = THIS_MODULE, .open = blktrans_open, .release = blktrans_release, diff --git a/drivers/mtd/nand/cafe_nand.c b/drivers/mtd/nand/cafe_nand.c index 29acd06b1c3..1b4690bdfdb 100644 --- a/drivers/mtd/nand/cafe_nand.c +++ b/drivers/mtd/nand/cafe_nand.c @@ -903,12 +903,12 @@ static struct pci_driver cafe_nand_pci_driver = { .resume = cafe_nand_resume, }; -static int cafe_nand_init(void) +static int __init cafe_nand_init(void) { return pci_register_driver(&cafe_nand_pci_driver); } -static void cafe_nand_exit(void) +static void __exit cafe_nand_exit(void) { pci_unregister_driver(&cafe_nand_pci_driver); } diff --git a/drivers/mtd/nand/cmx270_nand.c b/drivers/mtd/nand/cmx270_nand.c index 10081e656a6..826cacffcef 100644 --- a/drivers/mtd/nand/cmx270_nand.c +++ b/drivers/mtd/nand/cmx270_nand.c @@ -147,7 +147,7 @@ static int cmx270_device_ready(struct mtd_info *mtd) /* * Main initialization routine */ -static int cmx270_init(void) +static int __init cmx270_init(void) { struct nand_chip *this; const char *part_type; @@ -261,7 +261,7 @@ module_init(cmx270_init); /* * Clean up routine */ -static void cmx270_cleanup(void) +static void __exit cmx270_cleanup(void) { /* Release resources, unregister device */ nand_release(cmx270_nand_mtd); diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index e4d9ef0c965..9f87c99189a 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1065,7 +1065,7 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, } /* - * Now we have got to calculate how much data we have to to copy. In + * Now we have got to calculate how much data we have to copy. In * case of a static volume it is fairly easy - the VID header contains * the data size. In case of a dynamic volume it is more difficult - we * have to read the contents, cut 0xFF bytes from the end and copy only diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index c290f51dd17..1af08178def 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -570,7 +570,7 @@ void ubi_do_get_volume_info(struct ubi_device *ubi, struct ubi_volume *vol, /* * ubi_rb_for_each_entry - walk an RB-tree. - * @rb: a pointer to type 'struct rb_node' to to use as a loop counter + * @rb: a pointer to type 'struct rb_node' to use as a loop counter * @pos: a pointer to RB-tree entry type to use as a loop counter * @root: RB-tree's root * @member: the name of the 'struct rb_node' within the RB-tree entry diff --git a/drivers/net/arcnet/arc-rawmode.c b/drivers/net/arcnet/arc-rawmode.c index 646dfc5f50c..8ea9c7545c1 100644 --- a/drivers/net/arcnet/arc-rawmode.c +++ b/drivers/net/arcnet/arc-rawmode.c @@ -123,7 +123,6 @@ static void rx(struct net_device *dev, int bufnum, BUGLVL(D_SKB) arcnet_dump_skb(dev, skb, "rx"); skb->protocol = cpu_to_be16(ETH_P_ARCNET); -; netif_rx(skb); } diff --git a/drivers/net/arcnet/capmode.c b/drivers/net/arcnet/capmode.c index 083e21094b2..66bcbbb6bab 100644 --- a/drivers/net/arcnet/capmode.c +++ b/drivers/net/arcnet/capmode.c @@ -149,7 +149,6 @@ static void rx(struct net_device *dev, int bufnum, BUGLVL(D_SKB) arcnet_dump_skb(dev, skb, "rx"); skb->protocol = cpu_to_be16(ETH_P_ARCNET); -; netif_rx(skb); } diff --git a/drivers/net/bnx2x_reg.h b/drivers/net/bnx2x_reg.h index 0695be14cf9..aa76cbada5e 100644 --- a/drivers/net/bnx2x_reg.h +++ b/drivers/net/bnx2x_reg.h @@ -3122,7 +3122,7 @@ The fields are:[4:0] - tail pointer; [10:5] - Link List size; 15:11] - header pointer. */ #define TCM_REG_XX_TABLE 0x50240 -/* [RW 4] Load value for for cfc ac credit cnt. */ +/* [RW 4] Load value for cfc ac credit cnt. */ #define TM_REG_CFC_AC_CRDCNT_VAL 0x164208 /* [RW 4] Load value for cfc cld credit cnt. */ #define TM_REG_CFC_CLD_CRDCNT_VAL 0x164210 diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index cea5cfe23b7..c3fa31c9f2a 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1987,7 +1987,7 @@ void bond_3ad_unbind_slave(struct slave *slave) // find new aggregator for the related port(s) new_aggregator = __get_first_agg(port); for (; new_aggregator; new_aggregator = __get_next_agg(new_aggregator)) { - // if the new aggregator is empty, or it connected to to our port only + // if the new aggregator is empty, or it is connected to our port only if (!new_aggregator->lag_ports || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator)) { break; } diff --git a/drivers/net/e1000/e1000_hw.c b/drivers/net/e1000/e1000_hw.c index cda6b397550..45ac225a7aa 100644 --- a/drivers/net/e1000/e1000_hw.c +++ b/drivers/net/e1000/e1000_hw.c @@ -3035,7 +3035,7 @@ s32 e1000_check_for_link(struct e1000_hw *hw) /* If TBI compatibility is was previously off, turn it on. For * compatibility with a TBI link partner, we will store bad * packets. Some frames have an additional byte on the end and - * will look like CRC errors to to the hardware. + * will look like CRC errors to the hardware. */ if (!hw->tbi_compatibility_on) { hw->tbi_compatibility_on = true; diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c index 2234118eedb..6c144b525b4 100644 --- a/drivers/net/gianfar_ethtool.c +++ b/drivers/net/gianfar_ethtool.c @@ -293,7 +293,7 @@ static int gfar_gcoalesce(struct net_device *dev, struct ethtool_coalesce *cvals rxtime = get_ictt_value(priv->rxic); rxcount = get_icft_value(priv->rxic); txtime = get_ictt_value(priv->txic); - txcount = get_icft_value(priv->txic);; + txcount = get_icft_value(priv->txic); cvals->rx_coalesce_usecs = gfar_ticks2usecs(priv, rxtime); cvals->rx_max_coalesced_frames = rxcount; diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c index 1d7d7fef414..89c82c5e63e 100644 --- a/drivers/net/ibm_newemac/core.c +++ b/drivers/net/ibm_newemac/core.c @@ -2556,13 +2556,13 @@ static int __devinit emac_init_config(struct emac_instance *dev) if (emac_read_uint_prop(np, "mdio-device", &dev->mdio_ph, 0)) dev->mdio_ph = 0; if (emac_read_uint_prop(np, "zmii-device", &dev->zmii_ph, 0)) - dev->zmii_ph = 0;; + dev->zmii_ph = 0; if (emac_read_uint_prop(np, "zmii-channel", &dev->zmii_port, 0)) - dev->zmii_port = 0xffffffff;; + dev->zmii_port = 0xffffffff; if (emac_read_uint_prop(np, "rgmii-device", &dev->rgmii_ph, 0)) - dev->rgmii_ph = 0;; + dev->rgmii_ph = 0; if (emac_read_uint_prop(np, "rgmii-channel", &dev->rgmii_port, 0)) - dev->rgmii_port = 0xffffffff;; + dev->rgmii_port = 0xffffffff; if (emac_read_uint_prop(np, "fifo-entry-size", &dev->fifo_entry_size, 0)) dev->fifo_entry_size = 16; if (emac_read_uint_prop(np, "mal-burst-size", &dev->mal_burst_size, 0)) diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c index d2639c4a086..5d6c1530a8c 100644 --- a/drivers/net/igb/igb_main.c +++ b/drivers/net/igb/igb_main.c @@ -3966,7 +3966,7 @@ static int igb_set_vf_multicasts(struct igb_adapter *adapter, /* VFs are limited to using the MTA hash table for their multicast * addresses */ for (i = 0; i < n; i++) - vf_data->vf_mc_hashes[i] = hash_list[i];; + vf_data->vf_mc_hashes[i] = hash_list[i]; /* Flush and reset the mta with the new values */ igb_set_rx_mode(adapter->netdev); diff --git a/drivers/net/ll_temac_main.c b/drivers/net/ll_temac_main.c index da8d0a0ca94..f2a197fd47a 100644 --- a/drivers/net/ll_temac_main.c +++ b/drivers/net/ll_temac_main.c @@ -865,7 +865,7 @@ temac_of_probe(struct of_device *op, const struct of_device_id *match) dcrs = dcr_resource_start(np, 0); if (dcrs == 0) { dev_err(&op->dev, "could not get DMA register address\n"); - goto nodev;; + goto nodev; } lp->sdma_dcrs = dcr_map(np, dcrs, dcr_resource_len(np, 0)); dev_dbg(&op->dev, "DCR base: %x\n", dcrs); diff --git a/drivers/net/macb.c b/drivers/net/macb.c index fb65b427c69..1d0d4d9ab62 100644 --- a/drivers/net/macb.c +++ b/drivers/net/macb.c @@ -241,7 +241,7 @@ static int macb_mii_init(struct macb *bp) struct eth_platform_data *pdata; int err = -ENXIO, i; - /* Enable managment port */ + /* Enable management port */ macb_writel(bp, NCR, MACB_BIT(MPE)); bp->mii_bus = mdiobus_alloc(); diff --git a/drivers/net/ni52.c b/drivers/net/ni52.c index bd0ac690d12..aad3b370c56 100644 --- a/drivers/net/ni52.c +++ b/drivers/net/ni52.c @@ -615,10 +615,10 @@ static int init586(struct net_device *dev) /* addr_len |!src_insert |pre-len |loopback */ writeb(0x2e, &cfg_cmd->adr_len); writeb(0x00, &cfg_cmd->priority); - writeb(0x60, &cfg_cmd->ifs);; + writeb(0x60, &cfg_cmd->ifs); writeb(0x00, &cfg_cmd->time_low); writeb(0xf2, &cfg_cmd->time_high); - writeb(0x00, &cfg_cmd->promisc);; + writeb(0x00, &cfg_cmd->promisc); if (dev->flags & IFF_ALLMULTI) { int len = ((char __iomem *)p->iscp - (char __iomem *)ptr - 8) / 6; if (num_addrs > len) { diff --git a/drivers/net/qlge/qlge_main.c b/drivers/net/qlge/qlge_main.c index 22052925782..7783c5db81d 100644 --- a/drivers/net/qlge/qlge_main.c +++ b/drivers/net/qlge/qlge_main.c @@ -2630,7 +2630,7 @@ static int ql_start_rx_ring(struct ql_adapter *qdev, struct rx_ring *rx_ring) FLAGS_LI; /* Load irq delay values */ if (rx_ring->lbq_len) { cqicb->flags |= FLAGS_LL; /* Load lbq values */ - tmp = (u64)rx_ring->lbq_base_dma;; + tmp = (u64)rx_ring->lbq_base_dma; base_indirect_ptr = (__le64 *) rx_ring->lbq_base_indirect; page_entries = 0; do { @@ -2654,7 +2654,7 @@ static int ql_start_rx_ring(struct ql_adapter *qdev, struct rx_ring *rx_ring) } if (rx_ring->sbq_len) { cqicb->flags |= FLAGS_LS; /* Load sbq values */ - tmp = (u64)rx_ring->sbq_base_dma;; + tmp = (u64)rx_ring->sbq_base_dma; base_indirect_ptr = (__le64 *) rx_ring->sbq_base_indirect; page_entries = 0; do { diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index bc98e7f69ee..ede937ee50c 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -72,7 +72,7 @@ static int rionet_check = 0; static int rionet_capable = 1; /* - * This is a fast lookup table for for translating TX + * This is a fast lookup table for translating TX * Ethernet packets into a destination RIO device. It * could be made into a hash table to save memory depending * on system trade-offs. diff --git a/drivers/net/skfp/pcmplc.c b/drivers/net/skfp/pcmplc.c index f1df2ec8ad4..e6b33ee05ed 100644 --- a/drivers/net/skfp/pcmplc.c +++ b/drivers/net/skfp/pcmplc.c @@ -960,7 +960,7 @@ static void pcm_fsm(struct s_smc *smc, struct s_phy *phy, int cmd) /*PC88b*/ if (!phy->cf_join) { phy->cf_join = TRUE ; - queue_event(smc,EVENT_CFM,CF_JOIN+np) ; ; + queue_event(smc,EVENT_CFM,CF_JOIN+np) ; } if (cmd == PC_JOIN) GO_STATE(PC8_ACTIVE) ; diff --git a/drivers/net/skfp/pmf.c b/drivers/net/skfp/pmf.c index 79e665e0853..a320fdb3727 100644 --- a/drivers/net/skfp/pmf.c +++ b/drivers/net/skfp/pmf.c @@ -807,9 +807,9 @@ void smt_add_para(struct s_smc *smc, struct s_pcon *pcon, u_short para, mib_p->fddiPORTLerFlag ; sp->p4050_pad = 0 ; sp->p4050_cutoff = - mib_p->fddiPORTLer_Cutoff ; ; + mib_p->fddiPORTLer_Cutoff ; sp->p4050_alarm = - mib_p->fddiPORTLer_Alarm ; ; + mib_p->fddiPORTLer_Alarm ; sp->p4050_estimate = mib_p->fddiPORTLer_Estimate ; sp->p4050_reject_ct = @@ -829,7 +829,7 @@ void smt_add_para(struct s_smc *smc, struct s_pcon *pcon, u_short para, sp->p4051_porttype = mib_p->fddiPORTMy_Type ; sp->p4051_connectstate = - mib_p->fddiPORTConnectState ; ; + mib_p->fddiPORTConnectState ; sp->p4051_pc_neighbor = mib_p->fddiPORTNeighborType ; sp->p4051_pc_withhold = @@ -853,7 +853,7 @@ void smt_add_para(struct s_smc *smc, struct s_pcon *pcon, u_short para, struct smt_p_4053 *sp ; sp = (struct smt_p_4053 *) to ; sp->p4053_multiple = - mib_p->fddiPORTMultiple_P ; ; + mib_p->fddiPORTMultiple_P ; sp->p4053_availablepaths = mib_p->fddiPORTAvailablePaths ; sp->p4053_currentpath = diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 62e852e21ab..55bad408196 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -215,7 +215,7 @@ static void skge_wol_init(struct skge_port *skge) if (skge->wol & WAKE_MAGIC) ctrl |= WOL_CTL_ENA_PME_ON_MAGIC_PKT|WOL_CTL_ENA_MAGIC_PKT_UNIT; else - ctrl |= WOL_CTL_DIS_PME_ON_MAGIC_PKT|WOL_CTL_DIS_MAGIC_PKT_UNIT;; + ctrl |= WOL_CTL_DIS_PME_ON_MAGIC_PKT|WOL_CTL_DIS_MAGIC_PKT_UNIT; ctrl |= WOL_CTL_DIS_PME_ON_PATTERN|WOL_CTL_DIS_PATTERN_UNIT; skge_write16(hw, WOL_REGS(port, WOL_CTRL_STAT), ctrl); diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 4bb52e9cd37..15140f9f2e9 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -765,7 +765,7 @@ static void sky2_wol_init(struct sky2_port *sky2) if (sky2->wol & WAKE_MAGIC) ctrl |= WOL_CTL_ENA_PME_ON_MAGIC_PKT|WOL_CTL_ENA_MAGIC_PKT_UNIT; else - ctrl |= WOL_CTL_DIS_PME_ON_MAGIC_PKT|WOL_CTL_DIS_MAGIC_PKT_UNIT;; + ctrl |= WOL_CTL_DIS_PME_ON_MAGIC_PKT|WOL_CTL_DIS_MAGIC_PKT_UNIT; ctrl |= WOL_CTL_DIS_PME_ON_PATTERN|WOL_CTL_DIS_PATTERN_UNIT; sky2_write16(hw, WOL_REGS(port, WOL_CTRL_STAT), ctrl); diff --git a/drivers/net/vxge/vxge-config.h b/drivers/net/vxge/vxge-config.h index 62779a520ca..3e94f0ce090 100644 --- a/drivers/net/vxge/vxge-config.h +++ b/drivers/net/vxge/vxge-config.h @@ -1541,7 +1541,7 @@ void vxge_hw_ring_rxd_1b_info_get( rxd_info->l4_cksum_valid = (u32)VXGE_HW_RING_RXD_L4_CKSUM_CORRECT_GET(rxdp->control_0); rxd_info->l4_cksum = - (u32)VXGE_HW_RING_RXD_L4_CKSUM_GET(rxdp->control_0);; + (u32)VXGE_HW_RING_RXD_L4_CKSUM_GET(rxdp->control_0); rxd_info->frame = (u32)VXGE_HW_RING_RXD_ETHER_ENCAP_GET(rxdp->control_0); rxd_info->proto = diff --git a/drivers/net/vxge/vxge-main.c b/drivers/net/vxge/vxge-main.c index b378037a29b..068d7a9d3e3 100644 --- a/drivers/net/vxge/vxge-main.c +++ b/drivers/net/vxge/vxge-main.c @@ -2350,7 +2350,7 @@ static int vxge_enable_msix(struct vxgedev *vdev) enum vxge_hw_status status; /* 0 - Tx, 1 - Rx */ int tim_msix_id[4]; - int alarm_msix_id = 0, msix_intr_vect = 0;; + int alarm_msix_id = 0, msix_intr_vect = 0; vdev->intr_cnt = 0; /* allocate msix vectors */ diff --git a/drivers/net/wireless/ath/ath5k/reg.h b/drivers/net/wireless/ath/ath5k/reg.h index debad07d990..c63ea6afd96 100644 --- a/drivers/net/wireless/ath/ath5k/reg.h +++ b/drivers/net/wireless/ath/ath5k/reg.h @@ -982,7 +982,7 @@ #define AR5K_5414_CBCFG_BUF_DIS 0x10 /* Disable buffer */ /* - * PCI-E Power managment configuration + * PCI-E Power management configuration * and status register [5424+] */ #define AR5K_PCIE_PM_CTL 0x4068 /* Register address */ diff --git a/drivers/net/wireless/atmel.c b/drivers/net/wireless/atmel.c index a3b36b3a9d6..cce188837d1 100644 --- a/drivers/net/wireless/atmel.c +++ b/drivers/net/wireless/atmel.c @@ -3330,7 +3330,7 @@ static void atmel_smooth_qual(struct atmel_private *priv) priv->wstats.qual.updated &= ~IW_QUAL_QUAL_INVALID; } -/* deals with incoming managment frames. */ +/* deals with incoming management frames. */ static void atmel_management_frame(struct atmel_private *priv, struct ieee80211_hdr *header, u16 frame_len, u8 rssi) diff --git a/drivers/net/wireless/zd1211rw/zd_chip.c b/drivers/net/wireless/zd1211rw/zd_chip.c index 5e110a2328a..4e79a980013 100644 --- a/drivers/net/wireless/zd1211rw/zd_chip.c +++ b/drivers/net/wireless/zd1211rw/zd_chip.c @@ -368,7 +368,7 @@ error: return r; } -/* MAC address: if custom mac addresses are to to be used CR_MAC_ADDR_P1 and +/* MAC address: if custom mac addresses are to be used CR_MAC_ADDR_P1 and * CR_MAC_ADDR_P2 must be overwritten */ int zd_write_mac_addr(struct zd_chip *chip, const u8 *mac_addr) diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index b7e4cee2426..2766a6d3c2e 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -35,7 +35,7 @@ static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode) } -static struct super_operations s_ops = { +static const struct super_operations s_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index a45b0c0d574..a6b4a5a53d4 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1266,7 +1266,7 @@ ccio_ioc_init(struct ioc *ioc) ** Hot-Plug/Removal of PCI cards. (aka PCI OLARD). */ - iova_space_size = (u32) (num_physpages / count_parisc_driver(&ccio_driver)); + iova_space_size = (u32) (totalram_pages / count_parisc_driver(&ccio_driver)); /* limit IOVA space size to 1MB-1GB */ @@ -1305,7 +1305,7 @@ ccio_ioc_init(struct ioc *ioc) DBG_INIT("%s() hpa 0x%p mem %luMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_regs, - (unsigned long) num_physpages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index 123d8fe3427..57a6d19eba4 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1390,7 +1390,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) ** for DMA hints - ergo only 30 bits max. */ - iova_space_size = (u32) (num_physpages/global_ioc_cnt); + iova_space_size = (u32) (totalram_pages/global_ioc_cnt); /* limit IOVA space size to 1MB-1GB */ if (iova_space_size < (1 << (20 - PAGE_SHIFT))) { @@ -1415,7 +1415,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_hpa, - (unsigned long) num_physpages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/pcmcia/pcmcia_ioctl.c b/drivers/pcmcia/pcmcia_ioctl.c index 7b424e0b044..32c44040c1e 100644 --- a/drivers/pcmcia/pcmcia_ioctl.c +++ b/drivers/pcmcia/pcmcia_ioctl.c @@ -27,6 +27,7 @@ #include <linux/proc_fs.h> #include <linux/poll.h> #include <linux/pci.h> +#include <linux/seq_file.h> #include <linux/smp_lock.h> #include <linux/workqueue.h> @@ -105,37 +106,40 @@ static struct pcmcia_driver *get_pcmcia_driver(dev_info_t *dev_info) #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_pccard = NULL; -static int proc_read_drivers_callback(struct device_driver *driver, void *d) +static int proc_read_drivers_callback(struct device_driver *driver, void *_m) { - char **p = d; + struct seq_file *m = _m; struct pcmcia_driver *p_drv = container_of(driver, struct pcmcia_driver, drv); - *p += sprintf(*p, "%-24.24s 1 %d\n", p_drv->drv.name, + seq_printf(m, "%-24.24s 1 %d\n", p_drv->drv.name, #ifdef CONFIG_MODULE_UNLOAD (p_drv->owner) ? module_refcount(p_drv->owner) : 1 #else 1 #endif ); - d = (void *) p; - return 0; } -static int proc_read_drivers(char *buf, char **start, off_t pos, - int count, int *eof, void *data) +static int pccard_drivers_proc_show(struct seq_file *m, void *v) { - char *p = buf; - int rc; - - rc = bus_for_each_drv(&pcmcia_bus_type, NULL, - (void *) &p, proc_read_drivers_callback); - if (rc < 0) - return rc; + return bus_for_each_drv(&pcmcia_bus_type, NULL, + m, proc_read_drivers_callback); +} - return (p - buf); +static int pccard_drivers_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, pccard_drivers_proc_show, NULL); } + +static const struct file_operations pccard_drivers_proc_fops = { + .owner = THIS_MODULE, + .open = pccard_drivers_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif @@ -1011,7 +1015,7 @@ void __init pcmcia_setup_ioctl(void) { #ifdef CONFIG_PROC_FS proc_pccard = proc_mkdir("bus/pccard", NULL); if (proc_pccard) - create_proc_read_entry("drivers",0,proc_pccard,proc_read_drivers,NULL); + proc_create("drivers", 0, proc_pccard, &pccard_drivers_proc_fops); #endif } diff --git a/drivers/pcmcia/sa1100_jornada720.c b/drivers/pcmcia/sa1100_jornada720.c index 57ca085473d..7eedb42f800 100644 --- a/drivers/pcmcia/sa1100_jornada720.c +++ b/drivers/pcmcia/sa1100_jornada720.c @@ -16,89 +16,103 @@ #include "sa1111_generic.h" -#define SOCKET0_POWER GPIO_GPIO0 -#define SOCKET0_3V GPIO_GPIO2 -#define SOCKET1_POWER (GPIO_GPIO1 | GPIO_GPIO3) -#warning *** Does SOCKET1_3V actually do anything? +/* Does SOCKET1_3V actually do anything? */ +#define SOCKET0_POWER GPIO_GPIO0 +#define SOCKET0_3V GPIO_GPIO2 +#define SOCKET1_POWER (GPIO_GPIO1 | GPIO_GPIO3) #define SOCKET1_3V GPIO_GPIO3 static int jornada720_pcmcia_hw_init(struct soc_pcmcia_socket *skt) { - /* - * What is all this crap for? - */ - GRER |= 0x00000002; - /* Set GPIO_A<3:1> to be outputs for PCMCIA/CF power controller: */ - sa1111_set_io_dir(SA1111_DEV(skt->dev), GPIO_A0|GPIO_A1|GPIO_A2|GPIO_A3, 0, 0); - sa1111_set_io(SA1111_DEV(skt->dev), GPIO_A0|GPIO_A1|GPIO_A2|GPIO_A3, 0); - sa1111_set_sleep_io(SA1111_DEV(skt->dev), GPIO_A0|GPIO_A1|GPIO_A2|GPIO_A3, 0); - - return sa1111_pcmcia_hw_init(skt); + unsigned int pin = GPIO_A0 | GPIO_A1 | GPIO_A2 | GPIO_A3; + + /* + * What is all this crap for? + */ + GRER |= 0x00000002; + /* Set GPIO_A<3:1> to be outputs for PCMCIA/CF power controller: */ + sa1111_set_io_dir(SA1111_DEV(skt->dev), pin, 0, 0); + sa1111_set_io(SA1111_DEV(skt->dev), pin, 0); + sa1111_set_sleep_io(SA1111_DEV(skt->dev), pin, 0); + + return sa1111_pcmcia_hw_init(skt); } static int jornada720_pcmcia_configure_socket(struct soc_pcmcia_socket *skt, const socket_state_t *state) { - unsigned int pa_dwr_mask, pa_dwr_set; - int ret; - -printk("%s(): config socket %d vcc %d vpp %d\n", __func__, - skt->nr, state->Vcc, state->Vpp); - - switch (skt->nr) { - case 0: - pa_dwr_mask = SOCKET0_POWER | SOCKET0_3V; - - switch (state->Vcc) { - default: - case 0: pa_dwr_set = 0; break; - case 33: pa_dwr_set = SOCKET0_POWER | SOCKET0_3V; break; - case 50: pa_dwr_set = SOCKET0_POWER; break; - } - break; - - case 1: - pa_dwr_mask = SOCKET1_POWER; - - switch (state->Vcc) { - default: - case 0: pa_dwr_set = 0; break; - case 33: pa_dwr_set = SOCKET1_POWER; break; - case 50: pa_dwr_set = SOCKET1_POWER; break; - } - break; - - default: - return -1; - } - - if (state->Vpp != state->Vcc && state->Vpp != 0) { - printk(KERN_ERR "%s(): slot cannot support VPP %u\n", - __func__, state->Vpp); - return -1; - } - - ret = sa1111_pcmcia_configure_socket(skt, state); - if (ret == 0) { - unsigned long flags; - - local_irq_save(flags); - sa1111_set_io(SA1111_DEV(skt->dev), pa_dwr_mask, pa_dwr_set); - local_irq_restore(flags); - } - - return ret; + unsigned int pa_dwr_mask, pa_dwr_set; + int ret; + + printk(KERN_INFO "%s(): config socket %d vcc %d vpp %d\n", __func__, + skt->nr, state->Vcc, state->Vpp); + + switch (skt->nr) { + case 0: + pa_dwr_mask = SOCKET0_POWER | SOCKET0_3V; + + switch (state->Vcc) { + default: + case 0: + pa_dwr_set = 0; + break; + case 33: + pa_dwr_set = SOCKET0_POWER | SOCKET0_3V; + break; + case 50: + pa_dwr_set = SOCKET0_POWER; + break; + } + break; + + case 1: + pa_dwr_mask = SOCKET1_POWER; + + switch (state->Vcc) { + default: + case 0: + pa_dwr_set = 0; + break; + case 33: + pa_dwr_set = SOCKET1_POWER; + break; + case 50: + pa_dwr_set = SOCKET1_POWER; + break; + } + break; + + default: + return -1; + } + + if (state->Vpp != state->Vcc && state->Vpp != 0) { + printk(KERN_ERR "%s(): slot cannot support VPP %u\n", + __func__, state->Vpp); + return -EPERM; + } + + ret = sa1111_pcmcia_configure_socket(skt, state); + if (ret == 0) { + unsigned long flags; + + local_irq_save(flags); + sa1111_set_io(SA1111_DEV(skt->dev), pa_dwr_mask, pa_dwr_set); + local_irq_restore(flags); + } + + return ret; } static struct pcmcia_low_level jornada720_pcmcia_ops = { - .owner = THIS_MODULE, - .hw_init = jornada720_pcmcia_hw_init, - .hw_shutdown = sa1111_pcmcia_hw_shutdown, - .socket_state = sa1111_pcmcia_socket_state, - .configure_socket = jornada720_pcmcia_configure_socket, - - .socket_init = sa1111_pcmcia_socket_init, - .socket_suspend = sa1111_pcmcia_socket_suspend, + .owner = THIS_MODULE, + .hw_init = jornada720_pcmcia_hw_init, + .hw_shutdown = sa1111_pcmcia_hw_shutdown, + .socket_state = sa1111_pcmcia_socket_state, + .configure_socket = jornada720_pcmcia_configure_socket, + + .socket_init = sa1111_pcmcia_socket_init, + .socket_suspend = sa1111_pcmcia_socket_suspend, }; int __devinit pcmcia_jornada720_init(struct device *dev) diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index 737fe5d87c4..b459e87a30a 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -717,7 +717,7 @@ static void yenta_free_resources(struct yenta_socket *socket) /* * Close it down - release our resources and go home.. */ -static void yenta_close(struct pci_dev *dev) +static void __devexit yenta_close(struct pci_dev *dev) { struct yenta_socket *sock = pci_get_drvdata(dev); diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index 527ee764c93..cd11b113494 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -135,6 +135,15 @@ static int pnp_device_remove(struct device *dev) return 0; } +static void pnp_device_shutdown(struct device *dev) +{ + struct pnp_dev *pnp_dev = to_pnp_dev(dev); + struct pnp_driver *drv = pnp_dev->driver; + + if (drv && drv->shutdown) + drv->shutdown(pnp_dev); +} + static int pnp_bus_match(struct device *dev, struct device_driver *drv) { struct pnp_dev *pnp_dev = to_pnp_dev(dev); @@ -203,6 +212,7 @@ struct bus_type pnp_bus_type = { .match = pnp_bus_match, .probe = pnp_device_probe, .remove = pnp_device_remove, + .shutdown = pnp_device_shutdown, .suspend = pnp_bus_suspend, .resume = pnp_bus_resume, .dev_attrs = pnp_interface_attrs, diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index bd1ce8e2bc1..0587d53987f 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -430,7 +430,7 @@ fail: static int __exit omap_rtc_remove(struct platform_device *pdev) { - struct rtc_device *rtc = platform_get_drvdata(pdev);; + struct rtc_device *rtc = platform_get_drvdata(pdev); device_init_wakeup(&pdev->dev, 0); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index e109da4583a..dad0449475b 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2146,7 +2146,7 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -struct block_device_operations +const struct block_device_operations dasd_device_operations = { .owner = THIS_MODULE, .open = dasd_open, diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index a1ce573648a..bd9fe2e36dc 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -706,7 +706,7 @@ static int dasd_eckd_generate_uid(struct dasd_device *device, sizeof(uid->serial) - 1); EBCASC(uid->serial, sizeof(uid->serial) - 1); uid->ssid = private->gneq->subsystemID; - uid->real_unit_addr = private->ned->unit_addr;; + uid->real_unit_addr = private->ned->unit_addr; if (private->sneq) { uid->type = private->sneq->sua_flags; if (uid->type == UA_BASE_PAV_ALIAS) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 5e47a1ee52b..8afd9fa0087 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -540,7 +540,7 @@ dasd_check_blocksize(int bsize) extern debug_info_t *dasd_debug_area; extern struct dasd_profile_info_t dasd_global_profile; extern unsigned int dasd_profile_level; -extern struct block_device_operations dasd_device_operations; +extern const struct block_device_operations dasd_device_operations; extern struct kmem_cache *dasd_page_cache; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index d34617682a6..f76f4bd82b9 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -34,7 +34,7 @@ static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum, static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; static int dcssblk_major; -static struct block_device_operations dcssblk_devops = { +static const struct block_device_operations dcssblk_devops = { .owner = THIS_MODULE, .open = dcssblk_open, .release = dcssblk_release, diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index ee604e92a5f..116d1b3eeb1 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -244,7 +244,7 @@ static int xpram_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static struct block_device_operations xpram_devops = +static const struct block_device_operations xpram_devops = { .owner = THIS_MODULE, .getgeo = xpram_getgeo, diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 4cb9e70507a..64f57ef2763 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -50,7 +50,7 @@ static int tapeblock_ioctl(struct block_device *, fmode_t, unsigned int, static int tapeblock_medium_changed(struct gendisk *); static int tapeblock_revalidate_disk(struct gendisk *); -static struct block_device_operations tapeblock_fops = { +static const struct block_device_operations tapeblock_fops = { .owner = THIS_MODULE, .open = tapeblock_open, .release = tapeblock_release, diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c index a4b2c576144..c84eadd3602 100644 --- a/drivers/s390/net/netiucv.c +++ b/drivers/s390/net/netiucv.c @@ -2113,7 +2113,7 @@ static ssize_t remove_write (struct device_driver *drv, IUCV_DBF_TEXT(trace, 3, __func__); if (count >= IFNAMSIZ) - count = IFNAMSIZ - 1;; + count = IFNAMSIZ - 1; for (i = 0, p = buf; i < count && *p; i++, p++) { if (*p == '\n' || *p == ' ') diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index 3ff726afafc..0e1a34627a2 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -102,7 +102,7 @@ static int zfcp_scsi_queuecommand(struct scsi_cmnd *scpnt, if (unlikely((status & ZFCP_STATUS_COMMON_ERP_FAILED) || !(status & ZFCP_STATUS_COMMON_RUNNING))) { zfcp_scsi_command_fail(scpnt, DID_ERROR); - return 0;; + return 0; } ret = zfcp_fsf_send_fcp_command_task(unit, scpnt); diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 6d465168468..869a30b49ed 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -452,7 +452,7 @@ static const struct file_operations jsf_fops = { static struct miscdevice jsf_dev = { JSF_MINOR, "jsflash", &jsf_fops }; -static struct block_device_operations jsfd_fops = { +static const struct block_device_operations jsfd_fops = { .owner = THIS_MODULE, }; diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c index e6f2bb7365e..8dfb59d5899 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_core.c +++ b/drivers/scsi/aic7xxx/aic7xxx_core.c @@ -5223,7 +5223,7 @@ ahc_chip_init(struct ahc_softc *ahc) /* * Setup the allowed SCSI Sequences based on operational mode. - * If we are a target, we'll enalbe select in operations once + * If we are a target, we'll enable select in operations once * we've had a lun enabled. */ scsiseq_template = ENSELO|ENAUTOATNO|ENAUTOATNP; diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index 906cef5cda8..41e1b0e7e2e 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -1340,7 +1340,7 @@ static int bnx2i_process_login_resp(struct iscsi_session *session, resp_hdr->opcode = login->op_code; resp_hdr->flags = login->response_flags; resp_hdr->max_version = login->version_max; - resp_hdr->active_version = login->version_active;; + resp_hdr->active_version = login->version_active; resp_hdr->hlength = 0; hton24(resp_hdr->dlength, login->data_length); diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index 9df7ed38e1b..9a1bd9534d7 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -1207,7 +1207,7 @@ lpfc_ns_cmd(struct lpfc_vport *vport, int cmdcode, vport->ct_flags &= ~FC_CT_RFF_ID; CtReq->CommandResponse.bits.CmdRsp = be16_to_cpu(SLI_CTNS_RFF_ID); - CtReq->un.rff.PortId = cpu_to_be32(vport->fc_myDID);; + CtReq->un.rff.PortId = cpu_to_be32(vport->fc_myDID); CtReq->un.rff.fbits = FC4_FEATURE_INIT; CtReq->un.rff.type_code = FC_FCP_DATA; cmpl = lpfc_cmpl_ct_cmd_rff_id; diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c index 7dc3d1894b1..a39addc3a59 100644 --- a/drivers/scsi/megaraid/megaraid_sas.c +++ b/drivers/scsi/megaraid/megaraid_sas.c @@ -718,7 +718,7 @@ megasas_build_dcdb(struct megasas_instance *instance, struct scsi_cmnd *scp, * megasas_build_ldio - Prepares IOs to logical devices * @instance: Adapter soft state * @scp: SCSI command - * @cmd: Command to to be prepared + * @cmd: Command to be prepared * * Frames (and accompanying SGLs) for regular SCSI IOs use this function. */ diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 40e3cafb3a9..83c8b5e4fc8 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -1422,7 +1422,7 @@ static void qla4xxx_slave_destroy(struct scsi_device *sdev) /** * qla4xxx_del_from_active_array - returns an active srb * @ha: Pointer to host adapter structure. - * @index: index into to the active_array + * @index: index into the active_array * * This routine removes and returns the srb at the specified index **/ @@ -1500,7 +1500,7 @@ static int qla4xxx_wait_for_hba_online(struct scsi_qla_host *ha) /** * qla4xxx_eh_wait_for_commands - wait for active cmds to finish. - * @ha: pointer to to HBA + * @ha: pointer to HBA * @t: target id * @l: lun id * diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index a89c421dab5..8dd96dcd716 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -956,7 +956,7 @@ static int sd_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static struct block_device_operations sd_fops = { +static const struct block_device_operations sd_fops = { .owner = THIS_MODULE, .open = sd_open, .release = sd_release, diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index cce0fe4c8a3..eb61f7a70e1 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -525,7 +525,7 @@ static int sr_block_media_changed(struct gendisk *disk) return cdrom_media_changed(&cd->cdi); } -static struct block_device_operations sr_bdops = +static const struct block_device_operations sr_bdops = { .owner = THIS_MODULE, .open = sr_block_open, diff --git a/drivers/spi/omap_uwire.c b/drivers/spi/omap_uwire.c index 8980a5640bd..e75ba9b2889 100644 --- a/drivers/spi/omap_uwire.c +++ b/drivers/spi/omap_uwire.c @@ -213,7 +213,7 @@ static int uwire_txrx(struct spi_device *spi, struct spi_transfer *t) unsigned bits = ust->bits_per_word; unsigned bytes; u16 val, w; - int status = 0;; + int status = 0; if (!t->tx_buf && !t->rx_buf) return 0; diff --git a/drivers/spi/spi_s3c24xx.c b/drivers/spi/spi_s3c24xx.c index 3f3119d760d..6ba8aece90b 100644 --- a/drivers/spi/spi_s3c24xx.c +++ b/drivers/spi/spi_s3c24xx.c @@ -388,7 +388,7 @@ static int __init s3c24xx_spi_probe(struct platform_device *pdev) err_no_iores: err_no_pdata: - spi_master_put(hw->master);; + spi_master_put(hw->master); err_nomem: return err; diff --git a/drivers/staging/rt2860/rtmp.h b/drivers/staging/rt2860/rtmp.h index 3f498f6f3ff..90fd40f2473 100644 --- a/drivers/staging/rt2860/rtmp.h +++ b/drivers/staging/rt2860/rtmp.h @@ -2060,7 +2060,7 @@ typedef struct _STA_ADMIN_CONFIG { BOOLEAN AdhocBGJoined; // Indicate Adhoc B/G Join. BOOLEAN Adhoc20NJoined; // Indicate Adhoc 20MHz N Join. #endif - // New for WPA, windows want us to to keep association information and + // New for WPA, windows want us to keep association information and // Fixed IEs from last association response NDIS_802_11_ASSOCIATION_INFORMATION AssocInfo; USHORT ReqVarIELen; // Length of next VIE include EID & Length diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index ba589d4ca8b..8c64c018b67 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -506,8 +506,6 @@ static int wdm_open(struct inode *inode, struct file *file) desc = usb_get_intfdata(intf); if (test_bit(WDM_DISCONNECTING, &desc->flags)) goto out; - - ; file->private_data = desc; rv = usb_autopm_get_interface(desc->intf); diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index ffe75e83787..97b40ce133f 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -48,7 +48,6 @@ #define USBFS_DEFAULT_BUSMODE (S_IXUGO | S_IRUGO) #define USBFS_DEFAULT_LISTMODE S_IRUGO -static struct super_operations usbfs_ops; static const struct file_operations default_file_operations; static struct vfsmount *usbfs_mount; static int usbfs_mount_count; /* = 0 */ @@ -449,7 +448,7 @@ static const struct file_operations default_file_operations = { .llseek = default_file_lseek, }; -static struct super_operations usbfs_ops = { +static const struct super_operations usbfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .remount_fs = remount, diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index 7d33f50b587..c44367fea18 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -2033,7 +2033,7 @@ gadgetfs_create_file (struct super_block *sb, char const *name, return inode; } -static struct super_operations gadget_fs_operations = { +static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c index c2f1b7df918..b5b83c43898 100644 --- a/drivers/usb/host/ehci-pci.c +++ b/drivers/usb/host/ehci-pci.c @@ -242,7 +242,7 @@ static int ehci_pci_setup(struct usb_hcd *hcd) * System suspend currently expects to be able to suspend the entire * device tree, device-at-a-time. If we failed selective suspend * reports, system suspend would fail; so the root hub code must claim - * success. That's lying to usbcore, and it matters for for runtime + * success. That's lying to usbcore, and it matters for runtime * PM scenarios with selective suspend and remote wakeup... */ if (ehci->no_selective_suspend && device_can_wakeup(&pdev->dev)) diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h index 2bfff30f470..48b9e889a18 100644 --- a/drivers/usb/host/ehci.h +++ b/drivers/usb/host/ehci.h @@ -37,7 +37,7 @@ typedef __u16 __bitwise __hc16; #define __hc16 __le16 #endif -/* statistics can be kept for for tuning/monitoring */ +/* statistics can be kept for tuning/monitoring */ struct ehci_stats { /* irq usage */ unsigned long normal; diff --git a/drivers/usb/host/ohci-q.c b/drivers/usb/host/ohci-q.c index c2d80f80448..16fecb8ecc3 100644 --- a/drivers/usb/host/ohci-q.c +++ b/drivers/usb/host/ohci-q.c @@ -418,7 +418,7 @@ static struct ed *ed_get ( is_out = !(ep->desc.bEndpointAddress & USB_DIR_IN); /* FIXME usbcore changes dev->devnum before SET_ADDRESS - * suceeds ... otherwise we wouldn't need "pipe". + * succeeds ... otherwise we wouldn't need "pipe". */ info = usb_pipedevice (pipe); ed->type = usb_pipetype(pipe); diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index d31d32206ba..ffe1625d4e1 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1150,7 +1150,7 @@ void xhci_dbg_cmd_ptrs(struct xhci_hcd *xhci); void xhci_dbg_ring_ptrs(struct xhci_hcd *xhci, struct xhci_ring *ring); void xhci_dbg_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx, unsigned int last_ep); -/* xHCI memory managment */ +/* xHCI memory management */ void xhci_mem_cleanup(struct xhci_hcd *xhci); int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags); void xhci_free_virt_device(struct xhci_hcd *xhci, int slot_id); diff --git a/drivers/usb/serial/cypress_m8.h b/drivers/usb/serial/cypress_m8.h index e772b01ac3a..1fd360e0406 100644 --- a/drivers/usb/serial/cypress_m8.h +++ b/drivers/usb/serial/cypress_m8.h @@ -57,7 +57,7 @@ #define UART_RI 0x10 /* ring indicator - modem - device to host */ #define UART_CD 0x40 /* carrier detect - modem - device to host */ #define CYP_ERROR 0x08 /* received from input report - device to host */ -/* Note - the below has nothing to to with the "feature report" reset */ +/* Note - the below has nothing to do with the "feature report" reset */ #define CONTROL_RESET 0x08 /* sent with output report - host to device */ /* End of RS-232 protocol definitions */ diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index dc0f832657e..b97960ac92f 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -2540,7 +2540,7 @@ static int calc_baud_rate_divisor(int baudrate, int *divisor) /***************************************************************************** * send_cmd_write_uart_register - * this function builds up a uart register message and sends to to the device. + * this function builds up a uart register message and sends to the device. *****************************************************************************/ static int send_cmd_write_uart_register(struct edgeport_port *edge_port, __u8 regNum, __u8 regValue) diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c index a61673133d7..f7373371b13 100644 --- a/drivers/usb/serial/kl5kusb105.c +++ b/drivers/usb/serial/kl5kusb105.c @@ -38,7 +38,7 @@ * 0.3a - implemented pools of write URBs * 0.3 - alpha version for public testing * 0.2 - TIOCMGET works, so autopilot(1) can be used! - * 0.1 - can be used to to pilot-xfer -p /dev/ttyUSB0 -l + * 0.1 - can be used to do pilot-xfer -p /dev/ttyUSB0 -l * * The driver skeleton is mainly based on mct_u232.c and various other * pieces of code shamelessly copied from the drivers/usb/serial/ directory. diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c index 61e7c40b94f..1e58220403d 100644 --- a/drivers/usb/serial/spcp8x5.c +++ b/drivers/usb/serial/spcp8x5.c @@ -544,7 +544,7 @@ static void spcp8x5_set_termios(struct tty_struct *tty, } /* Set Baud Rate */ - baud = tty_get_baud_rate(tty);; + baud = tty_get_baud_rate(tty); switch (baud) { case 300: buf[0] = 0x00; break; case 600: buf[0] = 0x01; break; diff --git a/drivers/usb/wusbcore/wa-hc.h b/drivers/usb/wusbcore/wa-hc.h index 586d350cdb4..d6bea3e0b54 100644 --- a/drivers/usb/wusbcore/wa-hc.h +++ b/drivers/usb/wusbcore/wa-hc.h @@ -47,7 +47,7 @@ * to an endpoint on a WUSB device that is connected to a * HWA RC. * - * xfer Transfer managment -- this is all the code that gets a + * xfer Transfer management -- this is all the code that gets a * buffer and pushes it to a device (or viceversa). * * * Some day a lot of this code will be shared between this driver and diff --git a/drivers/uwb/i1480/i1480u-wlp/netdev.c b/drivers/uwb/i1480/i1480u-wlp/netdev.c index 73055530e60..b236e696994 100644 --- a/drivers/uwb/i1480/i1480u-wlp/netdev.c +++ b/drivers/uwb/i1480/i1480u-wlp/netdev.c @@ -214,7 +214,7 @@ int i1480u_open(struct net_device *net_dev) netif_wake_queue(net_dev); #ifdef i1480u_FLOW_CONTROL - result = usb_submit_urb(i1480u->notif_urb, GFP_KERNEL);; + result = usb_submit_urb(i1480u->notif_urb, GFP_KERNEL); if (result < 0) { dev_err(dev, "Can't submit notification URB: %d\n", result); goto error_notif_urb_submit; diff --git a/drivers/video/cfbcopyarea.c b/drivers/video/cfbcopyarea.c index df03f3776dc..79e5f40e648 100644 --- a/drivers/video/cfbcopyarea.c +++ b/drivers/video/cfbcopyarea.c @@ -114,7 +114,7 @@ bitcpy(struct fb_info *p, unsigned long __iomem *dst, int dst_idx, d0 >>= right; } else if (src_idx+n <= bits) { // Single source word - d0 <<= left;; + d0 <<= left; } else { // 2 source words d1 = FB_READL(src + 1); diff --git a/drivers/video/imxfb.c b/drivers/video/imxfb.c index 30ae3022f63..66358fa825f 100644 --- a/drivers/video/imxfb.c +++ b/drivers/video/imxfb.c @@ -710,7 +710,7 @@ static int __init imxfb_probe(struct platform_device *pdev) fbi->clk = clk_get(&pdev->dev, NULL); if (IS_ERR(fbi->clk)) { - ret = PTR_ERR(fbi->clk);; + ret = PTR_ERR(fbi->clk); dev_err(&pdev->dev, "unable to get clock: %d\n", ret); goto failed_getclock; } diff --git a/drivers/video/omap/lcd_h3.c b/drivers/video/omap/lcd_h3.c index 2486237ebba..417ae5efa8b 100644 --- a/drivers/video/omap/lcd_h3.c +++ b/drivers/video/omap/lcd_h3.c @@ -124,12 +124,12 @@ struct platform_driver h3_panel_driver = { }, }; -static int h3_panel_drv_init(void) +static int __init h3_panel_drv_init(void) { return platform_driver_register(&h3_panel_driver); } -static void h3_panel_drv_cleanup(void) +static void __exit h3_panel_drv_cleanup(void) { platform_driver_unregister(&h3_panel_driver); } diff --git a/drivers/video/omap/lcd_h4.c b/drivers/video/omap/lcd_h4.c index 6ff56430341..0c398bda760 100644 --- a/drivers/video/omap/lcd_h4.c +++ b/drivers/video/omap/lcd_h4.c @@ -102,12 +102,12 @@ static struct platform_driver h4_panel_driver = { }, }; -static int h4_panel_drv_init(void) +static int __init h4_panel_drv_init(void) { return platform_driver_register(&h4_panel_driver); } -static void h4_panel_drv_cleanup(void) +static void __exit h4_panel_drv_cleanup(void) { platform_driver_unregister(&h4_panel_driver); } diff --git a/drivers/video/omap/lcd_inn1510.c b/drivers/video/omap/lcd_inn1510.c index 6953ed4b582..cdbd8bb607b 100644 --- a/drivers/video/omap/lcd_inn1510.c +++ b/drivers/video/omap/lcd_inn1510.c @@ -109,12 +109,12 @@ struct platform_driver innovator1510_panel_driver = { }, }; -static int innovator1510_panel_drv_init(void) +static int __init innovator1510_panel_drv_init(void) { return platform_driver_register(&innovator1510_panel_driver); } -static void innovator1510_panel_drv_cleanup(void) +static void __exit innovator1510_panel_drv_cleanup(void) { platform_driver_unregister(&innovator1510_panel_driver); } diff --git a/drivers/video/omap/lcd_inn1610.c b/drivers/video/omap/lcd_inn1610.c index 4c4f7ee6d73..268f7f808a4 100644 --- a/drivers/video/omap/lcd_inn1610.c +++ b/drivers/video/omap/lcd_inn1610.c @@ -133,12 +133,12 @@ struct platform_driver innovator1610_panel_driver = { }, }; -static int innovator1610_panel_drv_init(void) +static int __init innovator1610_panel_drv_init(void) { return platform_driver_register(&innovator1610_panel_driver); } -static void innovator1610_panel_drv_cleanup(void) +static void __exit innovator1610_panel_drv_cleanup(void) { platform_driver_unregister(&innovator1610_panel_driver); } diff --git a/drivers/video/omap/lcd_osk.c b/drivers/video/omap/lcd_osk.c index 379c96d36da..b3fa88bc626 100644 --- a/drivers/video/omap/lcd_osk.c +++ b/drivers/video/omap/lcd_osk.c @@ -127,12 +127,12 @@ struct platform_driver osk_panel_driver = { }, }; -static int osk_panel_drv_init(void) +static int __init osk_panel_drv_init(void) { return platform_driver_register(&osk_panel_driver); } -static void osk_panel_drv_cleanup(void) +static void __exit osk_panel_drv_cleanup(void) { platform_driver_unregister(&osk_panel_driver); } diff --git a/drivers/video/omap/lcd_palmte.c b/drivers/video/omap/lcd_palmte.c index 218317366e6..4bf3c79f3cc 100644 --- a/drivers/video/omap/lcd_palmte.c +++ b/drivers/video/omap/lcd_palmte.c @@ -108,12 +108,12 @@ struct platform_driver palmte_panel_driver = { }, }; -static int palmte_panel_drv_init(void) +static int __init palmte_panel_drv_init(void) { return platform_driver_register(&palmte_panel_driver); } -static void palmte_panel_drv_cleanup(void) +static void __exit palmte_panel_drv_cleanup(void) { platform_driver_unregister(&palmte_panel_driver); } diff --git a/drivers/video/omap/lcd_palmtt.c b/drivers/video/omap/lcd_palmtt.c index 57b0f6cf6a5..48ea1f9f2cb 100644 --- a/drivers/video/omap/lcd_palmtt.c +++ b/drivers/video/omap/lcd_palmtt.c @@ -113,12 +113,12 @@ struct platform_driver palmtt_panel_driver = { }, }; -static int palmtt_panel_drv_init(void) +static int __init palmtt_panel_drv_init(void) { return platform_driver_register(&palmtt_panel_driver); } -static void palmtt_panel_drv_cleanup(void) +static void __exit palmtt_panel_drv_cleanup(void) { platform_driver_unregister(&palmtt_panel_driver); } diff --git a/drivers/video/omap/lcd_palmz71.c b/drivers/video/omap/lcd_palmz71.c index d33d78b1172..0697d29b4d3 100644 --- a/drivers/video/omap/lcd_palmz71.c +++ b/drivers/video/omap/lcd_palmz71.c @@ -109,12 +109,12 @@ struct platform_driver palmz71_panel_driver = { }, }; -static int palmz71_panel_drv_init(void) +static int __init palmz71_panel_drv_init(void) { return platform_driver_register(&palmz71_panel_driver); } -static void palmz71_panel_drv_cleanup(void) +static void __exit palmz71_panel_drv_cleanup(void) { platform_driver_unregister(&palmz71_panel_driver); } diff --git a/drivers/video/s3c2410fb.c b/drivers/video/s3c2410fb.c index 7da0027e240..5ffca2adc6a 100644 --- a/drivers/video/s3c2410fb.c +++ b/drivers/video/s3c2410fb.c @@ -1119,7 +1119,7 @@ int __init s3c2410fb_init(void) int ret = platform_driver_register(&s3c2410fb_driver); if (ret == 0) - ret = platform_driver_register(&s3c2412fb_driver);; + ret = platform_driver_register(&s3c2412fb_driver); return ret; } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index f5bbd9e8341..d31505b6f7a 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -96,11 +96,7 @@ static struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; -/* VM /proc information for memory */ -extern unsigned long totalram_pages; - #ifdef CONFIG_HIGHMEM -extern unsigned long totalhigh_pages; #define inc_totalhigh_pages() (totalhigh_pages++) #define dec_totalhigh_pages() (totalhigh_pages--) #else @@ -214,7 +210,7 @@ static int increase_reservation(unsigned long nr_pages) page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { BUG_ON(page == NULL); - frame_list[i] = page_to_pfn(page);; + frame_list[i] = page_to_pfn(page); page = balloon_next_page(page); } diff --git a/fs/Kconfig b/fs/Kconfig index 455aa207e67..d4bf8caad8d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -109,6 +109,7 @@ source "fs/sysfs/Kconfig" config TMPFS bool "Virtual memory file system support (former shm fs)" + depends on SHMEM help Tmpfs is a file system which keeps all files in virtual memory. diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 3ff8bdd18fb..0931bc1325e 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl); static struct workqueue_struct *afs_lock_manager; static DEFINE_MUTEX(afs_lock_manager_mutex); -static struct file_lock_operations afs_lock_ops = { +static const struct file_lock_operations afs_lock_ops = { .fl_copy_lock = afs_fl_copy_lock, .fl_release_private = afs_fl_release_private, }; @@ -24,6 +24,7 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/mmu_context.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> @@ -34,7 +35,6 @@ #include <asm/kmap_types.h> #include <asm/uaccess.h> -#include <asm/mmu_context.h> #if DEBUG > 1 #define dprintk printk @@ -595,51 +595,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) } /* - * use_mm - * Makes the calling kernel thread take on the specified - * mm context. - * Called by the retry thread execute retries within the - * iocb issuer's mm context, so that copy_from/to_user - * operations work seamlessly for aio. - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -static void use_mm(struct mm_struct *mm) -{ - struct mm_struct *active_mm; - struct task_struct *tsk = current; - - task_lock(tsk); - active_mm = tsk->active_mm; - atomic_inc(&mm->mm_count); - tsk->mm = mm; - tsk->active_mm = mm; - switch_mm(active_mm, mm, tsk); - task_unlock(tsk); - - mmdrop(active_mm); -} - -/* - * unuse_mm - * Reverses the effect of use_mm, i.e. releases the - * specified mm context which was earlier taken on - * by the calling kernel thread - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -static void unuse_mm(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - tsk->mm = NULL; - /* active_mm is still 'mm' */ - enter_lazy_tlb(mm, tsk); - task_unlock(tsk); -} - -/* * Queue up a kiocb to be retried. Assumes that the kiocb * has already been marked as kicked, and places it on * the retry run list for the corresponding ioctx, if it diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index 2316e944a10..e947915109e 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c @@ -90,7 +90,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(path.dentry) && follow_down(&path)); + while (d_mountpoint(path.dentry) && follow_down(&path)) ; umount_ok = may_umount(path.mnt); path_put(&path); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 615d5496fe0..dd376c124e7 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -842,7 +842,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = BEFS_SUPER_MAGIC; /* Set real blocksize of fs */ sb_set_blocksize(sb, (ulong) befs_sb->block_size); - sb->s_op = (struct super_operations *) &befs_sops; + sb->s_op = &befs_sops; root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); if (IS_ERR(root)) { ret = PTR_ERR(root); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7c1e65d5487..442d94fe255 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file, #define DUMP_WRITE(addr, nr) \ if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(off) \ - if (!dump_seek(file, (off))) \ - goto end_coredump; static void fill_elf_header(struct elfhdr *elf, int segs, u16 machine, u32 flags, u8 osabi) @@ -2016,7 +2013,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un goto end_coredump; /* Align to page */ - DUMP_SEEK(dataoff - foffset); + if (!dump_seek(file, dataoff - foffset)) + goto end_coredump; for (vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma)) { @@ -2027,33 +2025,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { struct page *page; - struct vm_area_struct *tmp_vma; - - if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &tmp_vma) <= 0) { - DUMP_SEEK(PAGE_SIZE); - } else { - if (page == ZERO_PAGE(0)) { - if (!dump_seek(file, PAGE_SIZE)) { - page_cache_release(page); - goto end_coredump; - } - } else { - void *kaddr; - flush_cache_page(tmp_vma, addr, - page_to_pfn(page)); - kaddr = kmap(page); - if ((size += PAGE_SIZE) > limit || - !dump_write(file, kaddr, - PAGE_SIZE)) { - kunmap(page); - page_cache_release(page); - goto end_coredump; - } - kunmap(page); - } + int stop; + + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + stop = ((size += PAGE_SIZE) > limit) || + !dump_write(file, kaddr, PAGE_SIZE); + kunmap(page); page_cache_release(page); - } + } else + stop = !dump_seek(file, PAGE_SIZE); + if (stop) + goto end_coredump; } } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 20fbeced472..76285471073 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1325,9 +1325,6 @@ static int writenote(struct memelfnote *men, struct file *file) #define DUMP_WRITE(addr, nr) \ if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(off) \ - if (!dump_seek(file, (off))) \ - goto end_coredump; static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) { @@ -1518,6 +1515,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, unsigned long *limit, unsigned long mm_flags) { struct vm_area_struct *vma; + int err = 0; for (vma = current->mm->mmap; vma; vma = vma->vm_next) { unsigned long addr; @@ -1525,43 +1523,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, if (!maydump(vma, mm_flags)) continue; - for (addr = vma->vm_start; - addr < vma->vm_end; - addr += PAGE_SIZE - ) { - struct vm_area_struct *vma; - struct page *page; - - if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { - DUMP_SEEK(file->f_pos + PAGE_SIZE); - } - else if (page == ZERO_PAGE(0)) { - page_cache_release(page); - DUMP_SEEK(file->f_pos + PAGE_SIZE); - } - else { - void *kaddr; - - flush_cache_page(vma, addr, page_to_pfn(page)); - kaddr = kmap(page); - if ((*size += PAGE_SIZE) > *limit || - !dump_write(file, kaddr, PAGE_SIZE) - ) { - kunmap(page); - page_cache_release(page); - return -EIO; - } + for (addr = vma->vm_start; addr < vma->vm_end; + addr += PAGE_SIZE) { + struct page *page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + *size += PAGE_SIZE; + if (*size > *limit) + err = -EFBIG; + else if (!dump_write(file, kaddr, PAGE_SIZE)) + err = -EIO; kunmap(page); page_cache_release(page); - } + } else if (!dump_seek(file, file->f_pos + PAGE_SIZE)) + err = -EFBIG; + if (err) + goto out; } } - - return 0; - -end_coredump: - return -EFBIG; +out: + return err; } #endif @@ -1802,7 +1783,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, goto end_coredump; } - DUMP_SEEK(dataoff); + if (!dump_seek(file, dataoff)) + goto end_coredump; if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) goto end_coredump; diff --git a/fs/block_dev.c b/fs/block_dev.c index 71e7e03ac34..5d1ed50bd46 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1114,7 +1114,7 @@ EXPORT_SYMBOL(revalidate_disk); int check_disk_change(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - struct block_device_operations * bdops = disk->fops; + const struct block_device_operations *bdops = disk->fops; if (!bdops->media_changed) return 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8b819279001..6c4173146bb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -772,7 +772,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -static struct address_space_operations btree_aops = { +static const struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, .writepages = btree_writepages, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 59cba180fe8..9096fd0ca3c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -55,13 +55,13 @@ struct btrfs_iget_args { struct btrfs_root *root; }; -static struct inode_operations btrfs_dir_inode_operations; -static struct inode_operations btrfs_symlink_inode_operations; -static struct inode_operations btrfs_dir_ro_inode_operations; -static struct inode_operations btrfs_special_inode_operations; -static struct inode_operations btrfs_file_inode_operations; -static struct address_space_operations btrfs_aops; -static struct address_space_operations btrfs_symlink_aops; +static const struct inode_operations btrfs_dir_inode_operations; +static const struct inode_operations btrfs_symlink_inode_operations; +static const struct inode_operations btrfs_dir_ro_inode_operations; +static const struct inode_operations btrfs_special_inode_operations; +static const struct inode_operations btrfs_file_inode_operations; +static const struct address_space_operations btrfs_aops; +static const struct address_space_operations btrfs_symlink_aops; static struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; @@ -5201,7 +5201,7 @@ static int btrfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask, btrfs_check_acl); } -static struct inode_operations btrfs_dir_inode_operations = { +static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, .create = btrfs_create, @@ -5219,7 +5219,7 @@ static struct inode_operations btrfs_dir_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, }; -static struct inode_operations btrfs_dir_ro_inode_operations = { +static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, }; @@ -5259,7 +5259,7 @@ static struct extent_io_ops btrfs_extent_io_ops = { * * For now we're avoiding this by dropping bmap. */ -static struct address_space_operations btrfs_aops = { +static const struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, @@ -5271,14 +5271,14 @@ static struct address_space_operations btrfs_aops = { .set_page_dirty = btrfs_set_page_dirty, }; -static struct address_space_operations btrfs_symlink_aops = { +static const struct address_space_operations btrfs_symlink_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, }; -static struct inode_operations btrfs_file_inode_operations = { +static const struct inode_operations btrfs_file_inode_operations = { .truncate = btrfs_truncate, .getattr = btrfs_getattr, .setattr = btrfs_setattr, @@ -5290,7 +5290,7 @@ static struct inode_operations btrfs_file_inode_operations = { .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; -static struct inode_operations btrfs_special_inode_operations = { +static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, @@ -5299,7 +5299,7 @@ static struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, }; -static struct inode_operations btrfs_symlink_inode_operations = { +static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6d6d06cb6df..2db17cd66fc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -51,7 +51,7 @@ #include "export.h" #include "compression.h" -static struct super_operations btrfs_super_ops; +static const struct super_operations btrfs_super_ops; static void btrfs_put_super(struct super_block *sb) { @@ -675,7 +675,7 @@ static int btrfs_unfreeze(struct super_block *sb) return 0; } -static struct super_operations btrfs_super_ops = { +static const struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, .put_super = btrfs_put_super, .sync_fs = btrfs_sync_fs, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d91b0de7c50..30c0d45c1b5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2605,7 +2605,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, extent); cs = btrfs_file_extent_offset(src, extent); cl = btrfs_file_extent_num_bytes(src, - extent);; + extent); if (btrfs_file_extent_compression(src, extent)) { cs = 0; diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 606912d8f2a..fea9e898c4b 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -142,7 +142,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc != 0) { cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", - __func__, *devname, rc));; + __func__, *devname, rc)); goto compose_mount_options_err; } /* md_len = strlen(...) + 12 for 'sep+prefixpath=' @@ -385,7 +385,7 @@ out_err: goto out; } -struct inode_operations cifs_dfs_referral_inode_operations = { +const struct inode_operations cifs_dfs_referral_inode_operations = { .follow_link = cifs_dfs_follow_mountpoint, }; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3610e9958b4..d79ce2e95c2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -50,7 +50,7 @@ #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ #ifdef CONFIG_CIFS_QUOTA -static struct quotactl_ops cifs_quotactl_ops; +static const struct quotactl_ops cifs_quotactl_ops; #endif /* QUOTA */ int cifsFYI = 0; @@ -517,7 +517,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats) return rc; } -static struct quotactl_ops cifs_quotactl_ops = { +static const struct quotactl_ops cifs_quotactl_ops = { .set_xquota = cifs_xquota_set, .get_xquota = cifs_xquota_get, .set_xstate = cifs_xstate_set, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 094325e3f71..ac2b24c192f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, struct iattr *); extern const struct inode_operations cifs_file_inode_ops; extern const struct inode_operations cifs_symlink_inode_ops; -extern struct inode_operations cifs_dfs_referral_inode_operations; +extern const struct inode_operations cifs_dfs_referral_inode_operations; /* Functions related to files and directories */ diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 00b30a2d546..542f625312f 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -582,7 +582,7 @@ extern const struct inode_operations ecryptfs_dir_iops; extern const struct inode_operations ecryptfs_symlink_iops; extern const struct super_operations ecryptfs_sops; extern const struct dentry_operations ecryptfs_dops; -extern struct address_space_operations ecryptfs_aops; +extern const struct address_space_operations ecryptfs_aops; extern int ecryptfs_verbosity; extern unsigned int ecryptfs_message_buf_len; extern signed long ecryptfs_message_wait_timeout; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 5c6bab9786e..05772aeaa8f 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -545,7 +545,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) return rc; } -struct address_space_operations ecryptfs_aops = { +const struct address_space_operations ecryptfs_aops = { .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, .write_begin = ecryptfs_write_begin, diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index b72b8588422..c18fbf3e406 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block, void **kaddr, unsigned long *pfn) { struct block_device *bdev = inode->i_sb->s_bdev; - struct block_device_operations *ops = bdev->bd_disk->fops; + const struct block_device_operations *ops = bdev->bd_disk->fops; sector_t sector; sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c index a8d80a7f110..72743d36050 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -720,7 +720,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); -static struct dquot_operations ext3_quota_operations = { +static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, @@ -737,7 +737,7 @@ static struct dquot_operations ext3_quota_operations = { .destroy_dquot = dquot_destroy, }; -static struct quotactl_ops ext3_qctl_operations = { +static const struct quotactl_ops ext3_qctl_operations = { .quota_on = ext3_quota_on, .quota_off = vfs_quota_off, .quota_sync = vfs_quota_sync, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4abd683b963..3a798737e30 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2337,7 +2337,7 @@ static int __mpage_da_writepage(struct page *page, /* * Rest of the page in the page_vec * redirty then and skip then. We will - * try to to write them again after + * try to write them again after * starting a new transaction */ redirty_page_for_writepage(wbc, page); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a6b1ab73472..df539ba2777 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -964,7 +964,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); -static struct dquot_operations ext4_quota_operations = { +static const struct dquot_operations ext4_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, @@ -985,7 +985,7 @@ static struct dquot_operations ext4_quota_operations = { .destroy_dquot = dquot_destroy, }; -static struct quotactl_ops ext4_qctl_operations = { +static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = vfs_quota_off, .quota_sync = vfs_quota_sync, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 28c590b7c9d..8f1cfb02a6c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -179,7 +179,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) * always aligned to a 64 bit boundary. * * The size of the buffer is in bytes, but is it assumed that it is - * always ok to to read a complete multiple of 64 bits at the end + * always ok to read a complete multiple of 64 bits at the end * of the block in case the end is no aligned to a natural boundary. * * Return: the block number (bitmap buffer scope) that was found diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a93b885311d..06b7c2623f9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -507,6 +507,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; INIT_LIST_HEAD(&inode->i_mapping->private_list); info = HUGETLBFS_I(inode); + /* + * The policy is initialized here even if we are creating a + * private inode because initialization simply creates an + * an empty rb tree and calls spin_lock_init(), later when we + * call mpol_free_shared_policy() it will just return because + * the rb tree will still be empty. + */ mpol_shared_policy_init(&info->policy, NULL); switch (mode & S_IFMT) { default: @@ -931,13 +938,19 @@ static struct file_system_type hugetlbfs_fs_type = { static struct vfsmount *hugetlbfs_vfsmount; -static int can_do_hugetlb_shm(void) +static int can_do_hugetlb_shm(int creat_flags) { - return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); + if (creat_flags != HUGETLB_SHMFS_INODE) + return 0; + if (capable(CAP_IPC_LOCK)) + return 1; + if (in_group_p(sysctl_hugetlb_shm_group)) + return 1; + return 0; } struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, - struct user_struct **user) + struct user_struct **user, int creat_flags) { int error = -ENOMEM; struct file *file; @@ -949,7 +962,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); - if (!can_do_hugetlb_shm()) { + if (!can_do_hugetlb_shm(creat_flags)) { *user = current_user(); if (user_shm_lock(size, *user)) { WARN_ONCE(1, diff --git a/fs/inode.c b/fs/inode.c index b2ba83d2c4e..f5ff71cb3e2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -123,7 +123,7 @@ static void wake_up_inode(struct inode *inode) int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct address_space_operations empty_aops; - static struct inode_operations empty_iops; + static const struct inode_operations empty_iops; static const struct file_operations empty_fops; struct address_space *const mapping = &inode->i_data; @@ -695,13 +695,15 @@ void unlock_new_inode(struct inode *inode) } #endif /* - * This is special! We do not need the spinlock - * when clearing I_LOCK, because we're guaranteed - * that nobody else tries to do anything about the - * state of the inode when it is locked, as we - * just created it (so there can be no old holders - * that haven't tested I_LOCK). + * This is special! We do not need the spinlock when clearing I_LOCK, + * because we're guaranteed that nobody else tries to do anything about + * the state of the inode when it is locked, as we just created it (so + * there can be no old holders that haven't tested I_LOCK). + * However we must emit the memory barrier so that other CPUs reliably + * see the clearing of I_LOCK after the other inode initialisation has + * completed. */ + smp_mb(); WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); inode->i_state &= ~(I_LOCK|I_NEW); wake_up_inode(inode); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0035c021395..9a80e8e595d 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -123,7 +123,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child) return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); } -static struct export_operations jffs2_export_ops = { +static const struct export_operations jffs2_export_ops = { .get_parent = jffs2_get_parent, .fh_to_dentry = jffs2_fh_to_dentry, .fh_to_parent = jffs2_fh_to_parent, diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 1f3b0fc0d35..fc9032dc886 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 4336adba952..c81249fef11 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl) nlm_put_lockowner(fl->fl_u.nfs_fl.owner); } -static struct file_lock_operations nlmclnt_lock_ops = { +static const struct file_lock_operations nlmclnt_lock_ops = { .fl_copy_lock = nlmclnt_locks_copy_lock, .fl_release_private = nlmclnt_locks_release_private, }; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 7cb076ac6b4..4600c2037b8 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -111,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) */ chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) + if (!rpc_cmp_addr(nlm_addr(host), ni->sap)) continue; /* See if we have an NSM handle for this client */ @@ -125,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) if (host->h_server != ni->server) continue; if (ni->server && - !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) + !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; /* Move to head of hash chain. */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 30c933188dd..f956651d0f6 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -209,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) struct nsm_handle *nsm; list_for_each_entry(nsm, &nsm_handles, sm_link) - if (nlm_cmp_addr(nsm_addr(nsm), sap)) + if (rpc_cmp_addr(nsm_addr(nsm), sap)) return nsm; return NULL; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e577a78d7ba..d1001790fa9 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; } -struct lock_manager_operations nlmsvc_lock_operations = { +const struct lock_manager_operations nlmsvc_lock_operations = { .fl_compare_owner = nlmsvc_same_owner, .fl_notify = nlmsvc_notify_blocked, .fl_grant = nlmsvc_grant_deferred, diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 9e4d6aab611..ad478da7ca6 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - return nlm_cmp_addr(nlm_srcaddr(host), datap); + return rpc_cmp_addr(nlm_srcaddr(host), datap); } /** diff --git a/fs/locks.c b/fs/locks.c index 19ee18a6829..a8794f233bc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try) return fl->fl_file == try->fl_file; } -static struct lock_manager_operations lease_manager_ops = { +static const struct lock_manager_operations lease_manager_ops = { .fl_break = lease_break_callback, .fl_release_private = lease_release_private_callback, .fl_mylease = lease_mylease_callback, diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index e5a2dac5f71..76b0aa0f73b 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -222,7 +222,7 @@ static unsigned decode_sessionid(struct xdr_stream *xdr, p = read_buf(xdr, len); if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE);; + return htonl(NFS4ERR_RESOURCE); memcpy(sid->data, p, len); return 0; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1434080aefe..2ef4fecf398 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl) nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); } -static struct file_lock_operations nfs4_fl_lock_ops = { +static const struct file_lock_operations nfs4_fl_lock_ops = { .fl_copy_lock = nfs4_fl_copy_lock, .fl_release_private = nfs4_fl_release_lock, }; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index d9462643155..984a5ebcc1d 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1341,6 +1341,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) if (rv) goto out; rv = check_nfsd_access(exp, rqstp); + if (rv) + fh_put(fhp); out: exp_put(exp); return rv; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 01d4ec1c88e..edf926e1062 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -814,17 +814,6 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, return p; } -static __be32 * -encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, - struct svc_fh *fhp) -{ - p = encode_post_op_attr(cd->rqstp, p, fhp); - *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, fhp); - fh_put(fhp); - return p; -} - static int compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, const char *name, int namlen) @@ -836,29 +825,54 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, dparent = cd->fh.fh_dentry; exp = cd->fh.fh_export; - fh_init(fhp, NFS3_FHSIZE); if (isdotent(name, namlen)) { if (namlen == 2) { dchild = dget_parent(dparent); if (dchild == dparent) { /* filesystem root - cannot return filehandle for ".." */ dput(dchild); - return 1; + return -ENOENT; } } else dchild = dget(dparent); } else dchild = lookup_one_len(name, dparent, namlen); if (IS_ERR(dchild)) - return 1; - if (d_mountpoint(dchild) || - fh_compose(fhp, exp, dchild, &cd->fh) != 0 || - !dchild->d_inode) - rv = 1; + return -ENOENT; + rv = -ENOENT; + if (d_mountpoint(dchild)) + goto out; + rv = fh_compose(fhp, exp, dchild, &cd->fh); + if (rv) + goto out; + if (!dchild->d_inode) + goto out; + rv = 0; +out: dput(dchild); return rv; } +__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +{ + struct svc_fh fh; + int err; + + fh_init(&fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, &fh, name, namlen); + if (err) { + *p++ = 0; + *p++ = 0; + goto out; + } + p = encode_post_op_attr(cd->rqstp, p, &fh); + *p++ = xdr_one; /* yes, a file handle follows */ + p = encode_fh(p, &fh); +out: + fh_put(&fh); + return p; +} + /* * Encode a directory entry. This one works for both normal readdir * and readdirplus. @@ -929,16 +943,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p = encode_entry_baggage(cd, p, name, namlen, ino); - /* throw in readdirplus baggage */ - if (plus) { - struct svc_fh fh; - - if (compose_entry_fh(cd, &fh, name, namlen) > 0) { - *p++ = 0; - *p++ = 0; - } else - p = encode_entryplus_baggage(cd, p, &fh); - } + if (plus) + p = encode_entryplus_baggage(cd, p, name, namlen); num_entry_words = p - cd->buffer; } else if (cd->rqstp->rq_respages[pn+1] != NULL) { /* temporarily encode entry into next page, then move back to @@ -951,17 +957,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p1 = encode_entry_baggage(cd, p1, name, namlen, ino); - /* throw in readdirplus baggage */ - if (plus) { - struct svc_fh fh; - - if (compose_entry_fh(cd, &fh, name, namlen) > 0) { - /* zero out the filehandle */ - *p1++ = 0; - *p1++ = 0; - } else - p1 = encode_entryplus_baggage(cd, p1, &fh); - } + if (plus) + p = encode_entryplus_baggage(cd, p1, name, namlen); /* determine entry word length and lengths to go in pages */ num_entry_words = p1 - tmp; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 54b8b4140c8..725d02f210e 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -321,7 +321,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, deny = ~pas.group & pas.other; if (deny) { ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; - ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->flag = eflag; ace->access_mask = deny_mask_from_posix(deny, flags); ace->whotype = NFS4_ACL_WHO_GROUP; ace++; @@ -335,7 +335,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, if (deny) { ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; - ace->access_mask = mask_from_posix(deny, flags); + ace->access_mask = deny_mask_from_posix(deny, flags); ace->whotype = NFS4_ACL_WHO_NAMED; ace->who = pa->e_id; ace++; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3fd23f7acec..24e8d78f8dd 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -43,25 +43,30 @@ #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/svcsock.h> #include <linux/nfsd/nfsd.h> #include <linux/nfsd/state.h> #include <linux/sunrpc/sched.h> #include <linux/nfs4.h> +#include <linux/sunrpc/xprtsock.h> #define NFSDDBG_FACILITY NFSDDBG_PROC #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 +#define NFS4_STATEID_SIZE 16 /* Index of predefined Linux callback client operations */ enum { - NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_NULL = 0, NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, }; enum nfs_cb_opnum4 { OP_CB_RECALL = 4, + OP_CB_SEQUENCE = 11, }; #define NFS4_MAXTAGLEN 20 @@ -70,17 +75,29 @@ enum nfs_cb_opnum4 { #define NFS4_dec_cb_null_sz 0 #define cb_compound_enc_hdr_sz 4 #define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) +#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define cb_sequence_enc_sz (sessionid_sz + 4 + \ + 1 /* no referring calls list yet */) +#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) + #define op_enc_sz 1 #define op_dec_sz 2 #define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) #define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) #define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ 1 + enc_stateid_sz + \ enc_nfs4_fh_sz) #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ op_dec_sz) +struct nfs4_rpc_args { + void *args_op; + struct nfsd4_cb_sequence args_seq; +}; + /* * Generic encode routines from fs/nfs/nfs4xdr.c */ @@ -137,11 +154,13 @@ xdr_error: \ } while (0) struct nfs4_cb_compound_hdr { - int status; - u32 ident; + /* args */ + u32 ident; /* minorversion 0 only */ u32 nops; __be32 *nops_p; u32 minorversion; + /* res */ + int status; u32 taglen; char *tag; }; @@ -238,6 +257,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, hdr->nops++; } +static void +encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + if (hdr->minorversion == 0) + return; + + RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); + + WRITE32(OP_CB_SEQUENCE); + WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(args->cbs_clp->cl_cb_seq_nr); + WRITE32(0); /* slotid, always 0 */ + WRITE32(0); /* highest slotid always 0 */ + WRITE32(0); /* cachethis always 0 */ + WRITE32(0); /* FIXME: support referring_call_lists */ + hdr->nops++; +} + static int nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) { @@ -249,15 +289,19 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) } static int -nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args) +nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, + struct nfs4_rpc_args *rpc_args) { struct xdr_stream xdr; + struct nfs4_delegation *args = rpc_args->args_op; struct nfs4_cb_compound_hdr hdr = { .ident = args->dl_ident, + .minorversion = rpc_args->args_seq.cbs_minorversion, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_cb_compound_hdr(&xdr, &hdr); + encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); encode_cb_recall(&xdr, args, &hdr); encode_cb_nops(&hdr); return 0; @@ -299,6 +343,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) return 0; } +/* + * Our current back channel implmentation supports a single backchannel + * with a single slot. + */ +static int +decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, + struct rpc_rqst *rqstp) +{ + struct nfs4_sessionid id; + int status; + u32 dummy; + __be32 *p; + + if (res->cbs_minorversion == 0) + return 0; + + status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE); + if (status) + return status; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. + */ + status = -ESERVERFAULT; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); + p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); + if (memcmp(id.data, res->cbs_clp->cl_sessionid.data, + NFS4_MAX_SESSIONID_LEN)) { + dprintk("%s Invalid session id\n", __func__); + goto out; + } + READ32(dummy); + if (dummy != res->cbs_clp->cl_cb_seq_nr) { + dprintk("%s Invalid sequence number\n", __func__); + goto out; + } + READ32(dummy); /* slotid must be 0 */ + if (dummy != 0) { + dprintk("%s Invalid slotid\n", __func__); + goto out; + } + /* FIXME: process highest slotid and target highest slotid */ + status = 0; +out: + return status; +} + + static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) { @@ -306,7 +401,8 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) } static int -nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p) +nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, + struct nfsd4_cb_sequence *seq) { struct xdr_stream xdr; struct nfs4_cb_compound_hdr hdr; @@ -316,6 +412,11 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p) status = decode_cb_compound_hdr(&xdr, &hdr); if (status) goto out; + if (seq) { + status = decode_cb_sequence(&xdr, seq, rqstp); + if (status) + goto out; + } status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); out: return status; @@ -377,16 +478,15 @@ static int max_cb_time(void) int setup_callback_client(struct nfs4_client *clp) { - struct sockaddr_in addr; struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_timeout timeparms = { .to_initval = max_cb_time(), .to_retries = 0, }; struct rpc_create_args args = { - .protocol = IPPROTO_TCP, - .address = (struct sockaddr *)&addr, - .addrsize = sizeof(addr), + .protocol = XPRT_TRANSPORT_TCP, + .address = (struct sockaddr *) &cb->cb_addr, + .addrsize = cb->cb_addrlen, .timeout = &timeparms, .program = &cb_program, .prognumber = cb->cb_prog, @@ -399,13 +499,10 @@ int setup_callback_client(struct nfs4_client *clp) if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) return -EINVAL; - - /* Initialize address */ - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_port = htons(cb->cb_port); - addr.sin_addr.s_addr = htonl(cb->cb_addr); - + if (cb->cb_minorversion) { + args.bc_xprt = clp->cl_cb_xprt; + args.protocol = XPRT_TRANSPORT_BC_TCP; + } /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { @@ -439,42 +536,29 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = { .rpc_call_done = nfsd4_cb_probe_done, }; -static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb) -{ - struct auth_cred acred = { - .machine_cred = 1 - }; +static struct rpc_cred *callback_cred; - /* - * Note in the gss case this doesn't actually have to wait for a - * gss upcall (or any calls to the client); this just creates a - * non-uptodate cred which the rpc state machine will fill in with - * a refresh_upcall later. - */ - return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred, - RPCAUTH_LOOKUP_NEW); +int set_callback_cred(void) +{ + callback_cred = rpc_lookup_machine_cred(); + if (!callback_cred) + return -ENOMEM; + return 0; } + void do_probe_callback(struct nfs4_client *clp) { struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, + .rpc_cred = callback_cred }; - struct rpc_cred *cred; int status; - cred = lookup_cb_cred(cb); - if (IS_ERR(cred)) { - status = PTR_ERR(cred); - goto out; - } - cb->cb_cred = cred; - msg.rpc_cred = cb->cb_cred; status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, &nfsd4_cb_probe_ops, (void *)clp); -out: if (status) { warn_no_callback_path(clp, status); put_nfs4_client(clp); @@ -503,11 +587,95 @@ nfsd4_probe_callback(struct nfs4_client *clp) do_probe_callback(clp); } +/* + * There's currently a single callback channel slot. + * If the slot is available, then mark it busy. Otherwise, set the + * thread for sleeping on the callback RPC wait queue. + */ +static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, + struct rpc_task *task) +{ + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 *ptr = (u32 *)clp->cl_sessionid.data; + int status = 0; + + dprintk("%s: %u:%u:%u:%u\n", __func__, + ptr[0], ptr[1], ptr[2], ptr[3]); + + if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); + dprintk("%s slot is busy\n", __func__); + status = -EAGAIN; + goto out; + } + + /* + * We'll need the clp during XDR encoding and decoding, + * and the sequence during decoding to verify the reply + */ + args->args_seq.cbs_clp = clp; + task->tk_msg.rpc_resp = &args->args_seq; + +out: + dprintk("%s status=%d\n", __func__, status); + return status; +} + +/* + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. + */ +static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegation *dp = calldata; + struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; + + args->args_seq.cbs_minorversion = minorversion; + if (minorversion) { + status = nfsd41_cb_setup_sequence(clp, task); + if (status) { + if (status != -EAGAIN) { + /* terminate rpc task */ + task->tk_status = status; + task->tk_action = NULL; + } + return; + } + } + rpc_call_start(task); +} + +static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegation *dp = calldata; + struct nfs4_client *clp = dp->dl_client; + + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + + if (clp->cl_cb_conn.cb_minorversion) { + /* No need for lock, access serialized in nfsd4_cb_prepare */ + ++clp->cl_cb_seq_nr; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_seq_nr); + + /* We're done looking into the sequence information */ + task->tk_msg.rpc_resp = NULL; + } +} + static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfs4_delegation *dp = calldata; struct nfs4_client *clp = dp->dl_client; + nfsd4_cb_done(task, calldata); + switch (task->tk_status) { case -EIO: /* Network partition? */ @@ -520,16 +688,19 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) break; default: /* success, or error we can't handle */ - return; + goto done; } if (dp->dl_retries--) { rpc_delay(task, 2*HZ); task->tk_status = 0; rpc_restart_call(task); + return; } else { atomic_set(&clp->cl_cb_conn.cb_set, 0); warn_no_callback_path(clp, task->tk_status); } +done: + kfree(task->tk_msg.rpc_argp); } static void nfsd4_cb_recall_release(void *calldata) @@ -542,6 +713,7 @@ static void nfsd4_cb_recall_release(void *calldata) } static const struct rpc_call_ops nfsd4_cb_recall_ops = { + .rpc_call_prepare = nfsd4_cb_prepare, .rpc_call_done = nfsd4_cb_recall_done, .rpc_release = nfsd4_cb_recall_release, }; @@ -554,17 +726,24 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_client; struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; + struct nfs4_rpc_args *args; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], - .rpc_argp = dp, - .rpc_cred = clp->cl_cb_conn.cb_cred + .rpc_cred = callback_cred }; - int status; + int status = -ENOMEM; + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) + goto out; + args->args_op = dp; + msg.rpc_argp = args; dp->dl_retries = 1; status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp); +out: if (status) { + kfree(args); put_nfs4_client(clp); nfs4_put_delegation(dp); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 7c8801769a3..bebc0c2e1b0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -68,7 +68,6 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u32 *bmval, u32 *writable) { struct dentry *dentry = cstate->current_fh.fh_dentry; - struct svc_export *exp = cstate->current_fh.fh_export; /* * Check about attributes are supported by the NFSv4 server or not. @@ -80,17 +79,13 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_attrnotsupp; /* - * Check FATTR4_WORD0_ACL & FATTR4_WORD0_FS_LOCATIONS can be supported + * Check FATTR4_WORD0_ACL can be supported * in current environment or not. */ if (bmval[0] & FATTR4_WORD0_ACL) { if (!IS_POSIXACL(dentry->d_inode)) return nfserr_attrnotsupp; } - if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) { - if (exp->ex_fslocs.locations == NULL) - return nfserr_attrnotsupp; - } /* * According to spec, read-only attributes return ERR_INVAL. @@ -123,6 +118,35 @@ nfsd4_check_open_attributes(struct svc_rqst *rqstp, return status; } +static int +is_create_with_attrs(struct nfsd4_open *open) +{ + return open->op_create == NFS4_OPEN_CREATE + && (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED + || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1); +} + +/* + * if error occurs when setting the acl, just clear the acl bit + * in the returned attr bitmap. + */ +static void +do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl, u32 *bmval) +{ + __be32 status; + + status = nfsd4_set_nfs4_acl(rqstp, fhp, acl); + if (status) + /* + * We should probably fail the whole open at this point, + * but we've already created the file, so it's too late; + * So this seems the least of evils: + */ + bmval[0] &= ~FATTR4_WORD0_ACL; +} + static inline void fh_dup2(struct svc_fh *dst, struct svc_fh *src) { @@ -206,6 +230,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o if (status) goto out; + if (is_create_with_attrs(open) && open->op_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval); + set_change_info(&open->op_cinfo, current_fh); fh_dup2(current_fh, &resfh); @@ -536,12 +563,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_badtype; } - if (!status) { - fh_unlock(&cstate->current_fh); - set_change_info(&create->cr_cinfo, &cstate->current_fh); - fh_dup2(&cstate->current_fh, &resfh); - } + if (status) + goto out; + + if (create->cr_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, + create->cr_bmval); + fh_unlock(&cstate->current_fh); + set_change_info(&create->cr_cinfo, &cstate->current_fh); + fh_dup2(&cstate->current_fh, &resfh); +out: fh_put(&resfh); return status; } @@ -947,34 +979,6 @@ static struct nfsd4_operation nfsd4_ops[]; static const char *nfsd4_op_name(unsigned opnum); /* - * This is a replay of a compound for which no cache entry pages - * were used. Encode the sequence operation, and if cachethis is FALSE - * encode the uncache rep error on the next operation. - */ -static __be32 -nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args, - struct nfsd4_compoundres *resp) -{ - struct nfsd4_op *op; - - dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__, - resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis); - - /* Encode the replayed sequence operation */ - BUG_ON(resp->opcnt != 1); - op = &args->ops[resp->opcnt - 1]; - nfsd4_encode_operation(resp, op); - - /*return nfserr_retry_uncached_rep in next operation. */ - if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) { - op = &args->ops[resp->opcnt++]; - op->status = nfserr_retry_uncached_rep; - nfsd4_encode_operation(resp, op); - } - return op->status; -} - -/* * Enforce NFSv4.1 COMPOUND ordering rules. * * TODO: @@ -1083,13 +1087,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, BUG_ON(op->status == nfs_ok); encode_op: - /* Only from SEQUENCE or CREATE_SESSION */ + /* Only from SEQUENCE */ if (resp->cstate.status == nfserr_replay_cache) { dprintk("%s NFS4.1 replay from cache\n", __func__); - if (nfsd4_not_cached(resp)) - status = nfsd4_enc_uncached_replay(args, resp); - else - status = op->status; + status = op->status; goto out; } if (op->status == nfserr_replay_me) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 980a216a48c..2153f9bdbeb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -55,6 +55,7 @@ #include <linux/lockd/bind.h> #include <linux/module.h> #include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/clnt.h> #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -413,36 +414,65 @@ gen_sessionid(struct nfsd4_session *ses) } /* - * Give the client the number of slots it requests bound by - * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. + * The protocol defines ca_maxresponssize_cached to include the size of + * the rpc header, but all we need to cache is the data starting after + * the end of the initial SEQUENCE operation--the rest we regenerate + * each time. Therefore we can advertise a ca_maxresponssize_cached + * value that is the number of bytes in our cache plus a few additional + * bytes. In order to stay on the safe side, and not promise more than + * we can cache, those additional bytes must be the minimum possible: 24 + * bytes of rpc header (xid through accept state, with AUTH_NULL + * verifier), 12 for the compound header (with zero-length tag), and 44 + * for the SEQUENCE op response: + */ +#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) + +/* + * Give the client the number of ca_maxresponsesize_cached slots it + * requests, of size bounded by NFSD_SLOT_CACHE_SIZE, + * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more + * than NFSD_MAX_SLOTS_PER_SESSION. * - * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we - * should (up to a point) re-negotiate active sessions and reduce their - * slot usage to make rooom for new connections. For now we just fail the - * create session. + * If we run out of reserved DRC memory we should (up to a point) + * re-negotiate active sessions and reduce their slot usage to make + * rooom for new connections. For now we just fail the create session. */ -static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) +static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) { - int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; + int mem, size = fchan->maxresp_cached; if (fchan->maxreqs < 1) return nfserr_inval; - else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) - fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; - spin_lock(&nfsd_serv->sv_lock); - if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) - np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; - nfsd_serv->sv_drc_pages_used += np; - spin_unlock(&nfsd_serv->sv_lock); + if (size < NFSD_MIN_HDR_SEQ_SZ) + size = NFSD_MIN_HDR_SEQ_SZ; + size -= NFSD_MIN_HDR_SEQ_SZ; + if (size > NFSD_SLOT_CACHE_SIZE) + size = NFSD_SLOT_CACHE_SIZE; + + /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */ + mem = fchan->maxreqs * size; + if (mem > NFSD_MAX_MEM_PER_SESSION) { + fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size; + if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) + fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; + mem = fchan->maxreqs * size; + } - if (np <= 0) { - status = nfserr_resource; - fchan->maxreqs = 0; - } else - fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; + spin_lock(&nfsd_drc_lock); + /* bound the total session drc memory ussage */ + if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) { + fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size; + mem = fchan->maxreqs * size; + } + nfsd_drc_mem_used += mem; + spin_unlock(&nfsd_drc_lock); - return status; + if (fchan->maxreqs == 0) + return nfserr_serverfault; + + fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; + return 0; } /* @@ -466,36 +496,41 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp, fchan->maxresp_sz = maxcount; session_fchan->maxresp_sz = fchan->maxresp_sz; - /* Set the max response cached size our default which is - * a multiple of PAGE_SIZE and small */ - session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; - fchan->maxresp_cached = session_fchan->maxresp_cached; - /* Use the client's maxops if possible */ if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; session_fchan->maxops = fchan->maxops; - /* try to use the client requested number of slots */ - if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) - fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; - /* FIXME: Error means no more DRC pages so the server should * recover pages from existing sessions. For now fail session * creation. */ - status = set_forechannel_maxreqs(fchan); + status = set_forechannel_drc_size(fchan); + session_fchan->maxresp_cached = fchan->maxresp_cached; session_fchan->maxreqs = fchan->maxreqs; + + dprintk("%s status %d\n", __func__, status); return status; } +static void +free_session_slots(struct nfsd4_session *ses) +{ + int i; + + for (i = 0; i < ses->se_fchannel.maxreqs; i++) + kfree(ses->se_slots[i]); +} + static int alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) { struct nfsd4_session *new, tmp; - int idx, status = nfserr_resource, slotsize; + struct nfsd4_slot *sp; + int idx, slotsize, cachesize, i; + int status; memset(&tmp, 0, sizeof(tmp)); @@ -506,14 +541,27 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, if (status) goto out; - /* allocate struct nfsd4_session and slot table in one piece */ - slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot); + BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) + + sizeof(struct nfsd4_session) > PAGE_SIZE); + + status = nfserr_serverfault; + /* allocate struct nfsd4_session and slot table pointers in one piece */ + slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *); new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); if (!new) goto out; memcpy(new, &tmp, sizeof(*new)); + /* allocate each struct nfsd4_slot and data cache in one piece */ + cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + for (i = 0; i < new->se_fchannel.maxreqs; i++) { + sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); + if (!sp) + goto out_free; + new->se_slots[i] = sp; + } + new->se_client = clp; gen_sessionid(new); idx = hash_sessionid(&new->se_sessionid); @@ -530,6 +578,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, status = nfs_ok; out: return status; +out_free: + free_session_slots(new); + kfree(new); + goto out; } /* caller must hold sessionid_lock */ @@ -572,19 +624,16 @@ release_session(struct nfsd4_session *ses) nfsd4_put_session(ses); } -static void nfsd4_release_respages(struct page **respages, short resused); - void free_session(struct kref *kref) { struct nfsd4_session *ses; - int i; ses = container_of(kref, struct nfsd4_session, se_ref); - for (i = 0; i < ses->se_fchannel.maxreqs; i++) { - struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; - nfsd4_release_respages(e->ce_respages, e->ce_resused); - } + spin_lock(&nfsd_drc_lock); + nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE; + spin_unlock(&nfsd_drc_lock); + free_session_slots(ses); kfree(ses); } @@ -647,18 +696,14 @@ shutdown_callback_client(struct nfs4_client *clp) clp->cl_cb_conn.cb_client = NULL; rpc_shutdown_client(clnt); } - if (clp->cl_cb_conn.cb_cred) { - put_rpccred(clp->cl_cb_conn.cb_cred); - clp->cl_cb_conn.cb_cred = NULL; - } } static inline void free_client(struct nfs4_client *clp) { shutdown_callback_client(clp); - nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, - clp->cl_slot.sl_cache_entry.ce_resused); + if (clp->cl_cb_xprt) + svc_xprt_put(clp->cl_cb_xprt); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_principal); @@ -714,25 +759,6 @@ expire_client(struct nfs4_client *clp) put_nfs4_client(clp); } -static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir) -{ - struct nfs4_client *clp; - - clp = alloc_client(name); - if (clp == NULL) - return NULL; - memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); - atomic_set(&clp->cl_count, 1); - atomic_set(&clp->cl_cb_conn.cb_set, 0); - INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_strhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_sessions); - INIT_LIST_HEAD(&clp->cl_lru); - return clp; -} - static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) { memcpy(target->cl_verifier.data, source->data, @@ -795,6 +821,46 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } +static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, + struct svc_rqst *rqstp, nfs4_verifier *verf) +{ + struct nfs4_client *clp; + struct sockaddr *sa = svc_addr(rqstp); + char *princ; + + clp = alloc_client(name); + if (clp == NULL) + return NULL; + + princ = svc_gss_principal(rqstp); + if (princ) { + clp->cl_principal = kstrdup(princ, GFP_KERNEL); + if (clp->cl_principal == NULL) { + free_client(clp); + return NULL; + } + } + + memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); + atomic_set(&clp->cl_count, 1); + atomic_set(&clp->cl_cb_conn.cb_set, 0); + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + copy_verf(clp, verf); + rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); + clp->cl_flavor = rqstp->rq_flavor; + copy_cred(&clp->cl_cred, &rqstp->rq_cred); + gen_confirm(clp); + + return clp; +} + static int check_name(struct xdr_netobj name) { if (name.len == 0) @@ -902,93 +968,40 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, return NULL; } -/* a helper function for parse_callback */ -static int -parse_octet(unsigned int *lenp, char **addrp) -{ - unsigned int len = *lenp; - char *p = *addrp; - int n = -1; - char c; - - for (;;) { - if (!len) - break; - len--; - c = *p++; - if (c == '.') - break; - if ((c < '0') || (c > '9')) { - n = -1; - break; - } - if (n < 0) - n = 0; - n = (n * 10) + (c - '0'); - if (n > 255) { - n = -1; - break; - } - } - *lenp = len; - *addrp = p; - return n; -} - -/* parse and set the setclientid ipv4 callback address */ -static int -parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp) -{ - int temp = 0; - u32 cbaddr = 0; - u16 cbport = 0; - u32 addrlen = addr_len; - char *addr = addr_val; - int i, shift; - - /* ipaddress */ - shift = 24; - for(i = 4; i > 0 ; i--) { - if ((temp = parse_octet(&addrlen, &addr)) < 0) { - return 0; - } - cbaddr |= (temp << shift); - if (shift > 0) - shift -= 8; - } - *cbaddrp = cbaddr; - - /* port */ - shift = 8; - for(i = 2; i > 0 ; i--) { - if ((temp = parse_octet(&addrlen, &addr)) < 0) { - return 0; - } - cbport |= (temp << shift); - if (shift > 0) - shift -= 8; - } - *cbportp = cbport; - return 1; -} - static void -gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) { struct nfs4_cb_conn *cb = &clp->cl_cb_conn; - - /* Currently, we only support tcp for the callback channel */ - if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) + unsigned short expected_family; + + /* Currently, we only support tcp and tcp6 for the callback channel */ + if (se->se_callback_netid_len == 3 && + !memcmp(se->se_callback_netid_val, "tcp", 3)) + expected_family = AF_INET; + else if (se->se_callback_netid_len == 4 && + !memcmp(se->se_callback_netid_val, "tcp6", 4)) + expected_family = AF_INET6; + else goto out_err; - if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, - &cb->cb_addr, &cb->cb_port))) + cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, + se->se_callback_addr_len, + (struct sockaddr *) &cb->cb_addr, + sizeof(cb->cb_addr)); + + if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family) goto out_err; + + if (cb->cb_addr.ss_family == AF_INET6) + ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid; + cb->cb_minorversion = 0; cb->cb_prog = se->se_callback_prog; cb->cb_ident = se->se_callback_ident; return; out_err: + cb->cb_addr.ss_family = AF_UNSPEC; + cb->cb_addrlen = 0; dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " "will not receive delegations\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -996,175 +1009,87 @@ out_err: return; } -void -nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) -{ - struct nfsd4_compoundres *resp = rqstp->rq_resp; - - resp->cstate.statp = statp; -} - /* - * Dereference the result pages. + * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size. */ -static void -nfsd4_release_respages(struct page **respages, short resused) +void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { - int i; + struct nfsd4_slot *slot = resp->cstate.slot; + unsigned int base; - dprintk("--> %s\n", __func__); - for (i = 0; i < resused; i++) { - if (!respages[i]) - continue; - put_page(respages[i]); - respages[i] = NULL; - } -} + dprintk("--> %s slot %p\n", __func__, slot); -static void -nfsd4_copy_pages(struct page **topages, struct page **frompages, short count) -{ - int i; + slot->sl_opcnt = resp->opcnt; + slot->sl_status = resp->cstate.status; - for (i = 0; i < count; i++) { - topages[i] = frompages[i]; - if (!topages[i]) - continue; - get_page(topages[i]); + if (nfsd4_not_cached(resp)) { + slot->sl_datalen = 0; + return; } + slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap; + base = (char *)resp->cstate.datap - + (char *)resp->xbuf->head[0].iov_base; + if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data, + slot->sl_datalen)) + WARN("%s: sessions DRC could not cache compound\n", __func__); + return; } /* - * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous - * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total - * length of the XDR response is less than se_fmaxresp_cached - * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a - * of the reply (e.g. readdir). + * Encode the replay sequence operation from the slot values. + * If cachethis is FALSE encode the uncached rep error on the next + * operation which sets resp->p and increments resp->opcnt for + * nfs4svc_encode_compoundres. * - * Store the base and length of the rq_req.head[0] page - * of the NFSv4.1 data, just past the rpc header. */ -void -nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +static __be32 +nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) { - struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; - struct svc_rqst *rqstp = resp->rqstp; - struct nfsd4_compoundargs *args = rqstp->rq_argp; - struct nfsd4_op *op = &args->ops[resp->opcnt]; - struct kvec *resv = &rqstp->rq_res.head[0]; - - dprintk("--> %s entry %p\n", __func__, entry); - - /* Don't cache a failed OP_SEQUENCE. */ - if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status) - return; + struct nfsd4_op *op; + struct nfsd4_slot *slot = resp->cstate.slot; - nfsd4_release_respages(entry->ce_respages, entry->ce_resused); - entry->ce_opcnt = resp->opcnt; - entry->ce_status = resp->cstate.status; + dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__, + resp->opcnt, resp->cstate.slot->sl_cachethis); - /* - * Don't need a page to cache just the sequence operation - the slot - * does this for us! - */ + /* Encode the replayed sequence operation */ + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); - if (nfsd4_not_cached(resp)) { - entry->ce_resused = 0; - entry->ce_rpchdrlen = 0; - dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__, - resp->cstate.slot->sl_cache_entry.ce_cachethis); - return; - } - entry->ce_resused = rqstp->rq_resused; - if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1) - entry->ce_resused = NFSD_PAGES_PER_SLOT + 1; - nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages, - entry->ce_resused); - entry->ce_datav.iov_base = resp->cstate.statp; - entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp - - (char *)page_address(rqstp->rq_respages[0])); - /* Current request rpc header length*/ - entry->ce_rpchdrlen = (char *)resp->cstate.statp - - (char *)page_address(rqstp->rq_respages[0]); -} - -/* - * We keep the rpc header, but take the nfs reply from the replycache. - */ -static int -nfsd41_copy_replay_data(struct nfsd4_compoundres *resp, - struct nfsd4_cache_entry *entry) -{ - struct svc_rqst *rqstp = resp->rqstp; - struct kvec *resv = &resp->rqstp->rq_res.head[0]; - int len; - - /* Current request rpc header length*/ - len = (char *)resp->cstate.statp - - (char *)page_address(rqstp->rq_respages[0]); - if (entry->ce_datav.iov_len + len > PAGE_SIZE) { - dprintk("%s v41 cached reply too large (%Zd).\n", __func__, - entry->ce_datav.iov_len); - return 0; + /* Return nfserr_retry_uncached_rep in next operation. */ + if (args->opcnt > 1 && slot->sl_cachethis == 0) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); } - /* copy the cached reply nfsd data past the current rpc header */ - memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base, - entry->ce_datav.iov_len); - resv->iov_len = len + entry->ce_datav.iov_len; - return 1; + return op->status; } /* - * Keep the first page of the replay. Copy the NFSv4.1 data from the first - * cached page. Replace any futher replay pages from the cache. + * The sequence operation is not cached because we can use the slot and + * session values. */ __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { - struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; + struct nfsd4_slot *slot = resp->cstate.slot; __be32 status; - dprintk("--> %s entry %p\n", __func__, entry); - - /* - * If this is just the sequence operation, we did not keep - * a page in the cache entry because we can just use the - * slot info stored in struct nfsd4_sequence that was checked - * against the slot in nfsd4_sequence(). - * - * This occurs when seq->cachethis is FALSE, or when the client - * session inactivity timer fires and a solo sequence operation - * is sent (lease renewal). - */ - if (seq && nfsd4_not_cached(resp)) { - seq->maxslots = resp->cstate.session->se_fchannel.maxreqs; - return nfs_ok; - } - - if (!nfsd41_copy_replay_data(resp, entry)) { - /* - * Not enough room to use the replay rpc header, send the - * cached header. Release all the allocated result pages. - */ - svc_free_res_pages(resp->rqstp); - nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages, - entry->ce_resused); - } else { - /* Release all but the first allocated result page */ + dprintk("--> %s slot %p\n", __func__, slot); - resp->rqstp->rq_resused--; - svc_free_res_pages(resp->rqstp); + /* Either returns 0 or nfserr_retry_uncached */ + status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); + if (status == nfserr_retry_uncached_rep) + return status; - nfsd4_copy_pages(&resp->rqstp->rq_respages[1], - &entry->ce_respages[1], - entry->ce_resused - 1); - } + /* The sequence operation has been encoded, cstate->datap set. */ + memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen); - resp->rqstp->rq_resused = entry->ce_resused; - resp->opcnt = entry->ce_opcnt; - resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen; - status = entry->ce_status; + resp->opcnt = slot->sl_opcnt; + resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen); + status = slot->sl_status; return status; } @@ -1194,13 +1119,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, int status; unsigned int strhashval; char dname[HEXDIR_LEN]; + char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; - u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + struct sockaddr *sa = svc_addr(rqstp); + rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " - " ip_addr=%u flags %x, spa_how %d\n", + "ip_addr=%s flags %x, spa_how %d\n", __func__, rqstp, exid, exid->clname.len, exid->clname.data, - ip_addr, exid->flags, exid->spa_how); + addr_str, exid->flags, exid->spa_how); if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) return nfserr_inval; @@ -1281,28 +1208,23 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, out_new: /* Normal case */ - new = create_client(exid->clname, dname); + new = create_client(exid->clname, dname, rqstp, &verf); if (new == NULL) { - status = nfserr_resource; + status = nfserr_serverfault; goto out; } - copy_verf(new, &verf); - copy_cred(&new->cl_cred, &rqstp->rq_cred); - new->cl_addr = ip_addr; gen_clid(new); - gen_confirm(new); add_to_unconfirmed(new, strhashval); out_copy: exid->clientid.cl_boot = new->cl_clientid.cl_boot; exid->clientid.cl_id = new->cl_clientid.cl_id; - new->cl_slot.sl_seqid = 0; exid->seqid = 1; nfsd4_set_ex_flags(new, exid); dprintk("nfsd4_exchange_id seqid %d flags %x\n", - new->cl_slot.sl_seqid, new->cl_exchange_flags); + new->cl_cs_slot.sl_seqid, new->cl_exchange_flags); status = nfs_ok; out: @@ -1313,40 +1235,60 @@ error: } static int -check_slot_seqid(u32 seqid, struct nfsd4_slot *slot) +check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) { - dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid, - slot->sl_seqid); + dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, + slot_seqid); /* The slot is in use, and no response has been sent. */ - if (slot->sl_inuse) { - if (seqid == slot->sl_seqid) + if (slot_inuse) { + if (seqid == slot_seqid) return nfserr_jukebox; else return nfserr_seq_misordered; } /* Normal */ - if (likely(seqid == slot->sl_seqid + 1)) + if (likely(seqid == slot_seqid + 1)) return nfs_ok; /* Replay */ - if (seqid == slot->sl_seqid) + if (seqid == slot_seqid) return nfserr_replay_cache; /* Wraparound */ - if (seqid == 1 && (slot->sl_seqid + 1) == 0) + if (seqid == 1 && (slot_seqid + 1) == 0) return nfs_ok; /* Misordered replay or misordered new request */ return nfserr_seq_misordered; } +/* + * Cache the create session result into the create session single DRC + * slot cache by saving the xdr structure. sl_seqid has been set. + * Do this for solo or embedded create session operations. + */ +static void +nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot, int nfserr) +{ + slot->sl_status = nfserr; + memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); +} + +static __be32 +nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot) +{ + memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); + return slot->sl_status; +} + __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_create_session *cr_ses) { - u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; - struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; - struct nfsd4_slot *slot = NULL; + struct nfsd4_clid_slot *cs_slot = NULL; int status = 0; nfs4_lock_state(); @@ -1354,40 +1296,38 @@ nfsd4_create_session(struct svc_rqst *rqstp, conf = find_confirmed_client(&cr_ses->clientid); if (conf) { - slot = &conf->cl_slot; - status = check_slot_seqid(cr_ses->seqid, slot); + cs_slot = &conf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status == nfserr_replay_cache) { dprintk("Got a create_session replay! seqid= %d\n", - slot->sl_seqid); - cstate->slot = slot; - cstate->status = status; + cs_slot->sl_seqid); /* Return the cached reply status */ - status = nfsd4_replay_cache_entry(resp, NULL); + status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out; - } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { + } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { status = nfserr_seq_misordered; dprintk("Sequence misordered!\n"); dprintk("Expected seqid= %d but got seqid= %d\n", - slot->sl_seqid, cr_ses->seqid); + cs_slot->sl_seqid, cr_ses->seqid); goto out; } - conf->cl_slot.sl_seqid++; + cs_slot->sl_seqid++; } else if (unconf) { if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || - (ip_addr != unconf->cl_addr)) { + !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { status = nfserr_clid_inuse; goto out; } - slot = &unconf->cl_slot; - status = check_slot_seqid(cr_ses->seqid, slot); + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status) { /* an unconfirmed replay returns misordered */ status = nfserr_seq_misordered; - goto out; + goto out_cache; } - slot->sl_seqid++; /* from 0 to 1 */ + cs_slot->sl_seqid++; /* from 0 to 1 */ move_to_confirmed(unconf); /* @@ -1396,6 +1336,19 @@ nfsd4_create_session(struct svc_rqst *rqstp, cr_ses->flags &= ~SESSION4_PERSIST; cr_ses->flags &= ~SESSION4_RDMA; + if (cr_ses->flags & SESSION4_BACK_CHAN) { + unconf->cl_cb_xprt = rqstp->rq_xprt; + svc_xprt_get(unconf->cl_cb_xprt); + rpc_copy_addr( + (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, + sa); + unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); + unconf->cl_cb_conn.cb_minorversion = + cstate->minorversion; + unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; + unconf->cl_cb_seq_nr = 1; + nfsd4_probe_callback(unconf); + } conf = unconf; } else { status = nfserr_stale_clientid; @@ -1408,12 +1361,11 @@ nfsd4_create_session(struct svc_rqst *rqstp, memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); - cr_ses->seqid = slot->sl_seqid; + cr_ses->seqid = cs_slot->sl_seqid; - slot->sl_inuse = true; - cstate->slot = slot; - /* Ensure a page is used for the cache */ - slot->sl_cache_entry.ce_cachethis = 1; +out_cache: + /* cache solo and embedded create sessions under the state lock */ + nfsd4_cache_create_session(cr_ses, cs_slot, status); out: nfs4_unlock_state(); dprintk("%s returns %d\n", __func__, ntohl(status)); @@ -1478,18 +1430,23 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (seq->slotid >= session->se_fchannel.maxreqs) goto out; - slot = &session->se_slots[seq->slotid]; + slot = session->se_slots[seq->slotid]; dprintk("%s: slotid %d\n", __func__, seq->slotid); - status = check_slot_seqid(seq->seqid, slot); + /* We do not negotiate the number of slots yet, so set the + * maxslots to the session maxreqs which is used to encode + * sr_highest_slotid and the sr_target_slot id to maxslots */ + seq->maxslots = session->se_fchannel.maxreqs; + + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse); if (status == nfserr_replay_cache) { cstate->slot = slot; cstate->session = session; /* Return the cached reply status and set cstate->status - * for nfsd4_svc_encode_compoundres processing */ + * for nfsd4_proc_compound processing */ status = nfsd4_replay_cache_entry(resp, seq); cstate->status = nfserr_replay_cache; - goto replay_cache; + goto out; } if (status) goto out; @@ -1497,23 +1454,23 @@ nfsd4_sequence(struct svc_rqst *rqstp, /* Success! bump slot seqid */ slot->sl_inuse = true; slot->sl_seqid = seq->seqid; - slot->sl_cache_entry.ce_cachethis = seq->cachethis; - /* Always set the cache entry cachethis for solo sequence */ - if (nfsd4_is_solo_sequence(resp)) - slot->sl_cache_entry.ce_cachethis = 1; + slot->sl_cachethis = seq->cachethis; cstate->slot = slot; cstate->session = session; -replay_cache: - /* Renew the clientid on success and on replay. - * Hold a session reference until done processing the compound: + /* Hold a session reference until done processing the compound: * nfsd4_put_session called only if the cstate slot is set. */ - renew_client(session->se_client); nfsd4_get_session(session); out: spin_unlock(&sessionid_lock); + /* Renew the clientid on success and on replay */ + if (cstate->session) { + nfs4_lock_state(); + renew_client(session->se_client); + nfs4_unlock_state(); + } dprintk("%s: return %d\n", __func__, ntohl(status)); return status; } @@ -1522,7 +1479,7 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct sockaddr_in *sin = svc_addr_in(rqstp); + struct sockaddr *sa = svc_addr(rqstp); struct xdr_netobj clname = { .len = setclid->se_namelen, .data = setclid->se_name, @@ -1531,7 +1488,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; - char *princ; char dname[HEXDIR_LEN]; if (!check_name(clname)) @@ -1554,8 +1510,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { - dprintk("NFSD: setclientid: string in use by client" - " at %pI4\n", &conf->cl_addr); + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, + sizeof(addr_str)); + dprintk("NFSD: setclientid: string in use by client " + "at %s\n", addr_str); goto out; } } @@ -1573,7 +1532,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ if (unconf) expire_client(unconf); - new = create_client(clname, dname); + new = create_client(clname, dname, rqstp, &clverifier); if (new == NULL) goto out; gen_clid(new); @@ -1590,7 +1549,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ expire_client(unconf); } - new = create_client(clname, dname); + new = create_client(clname, dname, rqstp, &clverifier); if (new == NULL) goto out; copy_clid(new, conf); @@ -1600,7 +1559,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * probable client reboot; state will be removed if * confirmed. */ - new = create_client(clname, dname); + new = create_client(clname, dname, rqstp, &clverifier); if (new == NULL) goto out; gen_clid(new); @@ -1611,25 +1570,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * confirmed. */ expire_client(unconf); - new = create_client(clname, dname); + new = create_client(clname, dname, rqstp, &clverifier); if (new == NULL) goto out; gen_clid(new); } - copy_verf(new, &clverifier); - new->cl_addr = sin->sin_addr.s_addr; - new->cl_flavor = rqstp->rq_flavor; - princ = svc_gss_principal(rqstp); - if (princ) { - new->cl_principal = kstrdup(princ, GFP_KERNEL); - if (new->cl_principal == NULL) { - free_client(new); - goto out; - } - } - copy_cred(&new->cl_cred, &rqstp->rq_cred); - gen_confirm(new); - gen_callback(new, setclid); + gen_callback(new, setclid, rpc_get_scope_id(sa)); add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; @@ -1651,7 +1597,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid_confirm *setclientid_confirm) { - struct sockaddr_in *sin = svc_addr_in(rqstp); + struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; @@ -1670,9 +1616,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unconf = find_unconfirmed_client(clid); status = nfserr_clid_inuse; - if (conf && conf->cl_addr != sin->sin_addr.s_addr) + if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa)) goto out; - if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) + if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa)) goto out; /* @@ -2163,7 +2109,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) return -EAGAIN; } -static struct lock_manager_operations nfsd_lease_mng_ops = { +static const struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, .fl_release_private = nfsd_release_deleg_cb, .fl_copy_lock = nfsd_copy_lock_deleg_cb, @@ -3368,7 +3314,7 @@ nfs4_transform_lock_offset(struct file_lock *lock) /* Hack!: For now, we're defining this just so we can use a pointer to it * as a unique cookie to identify our (NFSv4's) posix locks. */ -static struct lock_manager_operations nfsd_posix_mng_ops = { +static const struct lock_manager_operations nfsd_posix_mng_ops = { }; static inline void @@ -4072,7 +4018,7 @@ set_max_delegations(void) /* initialization to perform when the nfsd service is started: */ -static void +static int __nfs4_state_start(void) { unsigned long grace_time; @@ -4084,19 +4030,26 @@ __nfs4_state_start(void) printk(KERN_INFO "NFSD: starting %ld-second grace period\n", grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); + if (laundry_wq == NULL) + return -ENOMEM; queue_delayed_work(laundry_wq, &laundromat_work, grace_time); set_max_delegations(); + return set_callback_cred(); } -void +int nfs4_state_start(void) { + int ret; + if (nfs4_init) - return; + return 0; nfsd4_load_reboot_recovery_data(); - __nfs4_state_start(); + ret = __nfs4_state_start(); + if (ret) + return ret; nfs4_init = 1; - return; + return 0; } time_t diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2dcc7feaa6f..0fbd50cee1f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1599,7 +1599,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) { struct svc_fh tmp_fh; - char *path, *rootpath; + char *path = NULL, *rootpath; + size_t rootlen; fh_init(&tmp_fh, NFS4_FHSIZE); *stat = exp_pseudoroot(rqstp, &tmp_fh); @@ -1609,14 +1610,18 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 * path = exp->ex_pathname; - if (strncmp(path, rootpath, strlen(rootpath))) { + rootlen = strlen(rootpath); + if (strncmp(path, rootpath, rootlen)) { dprintk("nfsd: fs_locations failed;" "%s is not contained in %s\n", path, rootpath); *stat = nfserr_notsupp; - return NULL; + path = NULL; + goto out; } - - return path + strlen(rootpath); + path += rootlen; +out: + fh_put(&tmp_fh); + return path; } /* @@ -1793,11 +1798,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_nfserr; } } - if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { - if (exp->ex_fslocs.locations == NULL) { - bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS; - } - } if ((buflen -= 16) < 0) goto out_resource; @@ -1825,8 +1825,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_resource; if (!aclsupport) word0 &= ~FATTR4_WORD0_ACL; - if (!exp->ex_fslocs.locations) - word0 &= ~FATTR4_WORD0_FS_LOCATIONS; if (!word2) { WRITE32(2); WRITE32(word0); @@ -3064,6 +3062,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITE32(0); ADJUST_ARGS(); + resp->cstate.datap = p; /* DRC cache data pointer */ return 0; } @@ -3166,7 +3165,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) return status; session = resp->cstate.session; - if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0) + if (session == NULL || slot->sl_cachethis == 0) return status; if (resp->opcnt >= args->opcnt) @@ -3291,6 +3290,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo /* * All that remains is to write the tag and operation count... */ + struct nfsd4_compound_state *cs = &resp->cstate; struct kvec *iov; p = resp->tagp; *p++ = htonl(resp->taglen); @@ -3304,17 +3304,11 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov = &rqstp->rq_res.head[0]; iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); - if (nfsd4_has_session(&resp->cstate)) { - if (resp->cstate.status == nfserr_replay_cache && - !nfsd4_not_cached(resp)) { - iov->iov_len = resp->cstate.iovlen; - } else { - nfsd4_store_cache_entry(resp); - dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); - resp->cstate.slot->sl_inuse = 0; - } - if (resp->cstate.session) - nfsd4_put_session(resp->cstate.session); + if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + resp->cstate.slot->sl_inuse = false; + nfsd4_put_session(resp->cstate.session); } return 1; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7e906c5b767..00388d2a3c9 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -174,12 +174,13 @@ static const struct file_operations exports_operations = { }; extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); +extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); static struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = nfsd_pool_stats_release, .owner = THIS_MODULE, }; @@ -776,10 +777,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) size -= len; mesg += len; } - - mutex_unlock(&nfsd_mutex); - return (mesg-buf); - + rv = mesg - buf; out_free: kfree(nthreads); mutex_unlock(&nfsd_mutex); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8847f3fbfc1..01965b2f3a7 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -397,44 +397,51 @@ static inline void _fh_update_old(struct dentry *dentry, fh->ofh_dirino = 0; } -__be32 -fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, - struct svc_fh *ref_fh) +static bool is_root_export(struct svc_export *exp) { - /* ref_fh is a reference file handle. - * if it is non-null and for the same filesystem, then we should compose - * a filehandle which is of the same version, where possible. - * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca - * Then create a 32byte filehandle using nfs_fhbase_old - * - */ + return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root; +} - u8 version; - u8 fsid_type = 0; - struct inode * inode = dentry->d_inode; - struct dentry *parent = dentry->d_parent; - __u32 *datap; - dev_t ex_dev = exp->ex_path.dentry->d_inode->i_sb->s_dev; - int root_export = (exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root); +static struct super_block *exp_sb(struct svc_export *exp) +{ + return exp->ex_path.dentry->d_inode->i_sb; +} - dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", - MAJOR(ex_dev), MINOR(ex_dev), - (long) exp->ex_path.dentry->d_inode->i_ino, - parent->d_name.name, dentry->d_name.name, - (inode ? inode->i_ino : 0)); +static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) +{ + switch (fsid_type) { + case FSID_DEV: + if (!old_valid_dev(exp_sb(exp)->s_dev)) + return 0; + /* FALL THROUGH */ + case FSID_MAJOR_MINOR: + case FSID_ENCODE_DEV: + return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; + case FSID_NUM: + return exp->ex_flags & NFSEXP_FSID; + case FSID_UUID8: + case FSID_UUID16: + if (!is_root_export(exp)) + return 0; + /* fall through */ + case FSID_UUID4_INUM: + case FSID_UUID16_INUM: + return exp->ex_uuid != NULL; + } + return 1; +} - /* Choose filehandle version and fsid type based on - * the reference filehandle (if it is in the same export) - * or the export options. - */ - retry: + +static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh) +{ + u8 version; + u8 fsid_type; +retry: version = 1; if (ref_fh && ref_fh->fh_export == exp) { version = ref_fh->fh_handle.fh_version; fsid_type = ref_fh->fh_handle.fh_fsid_type; - if (ref_fh == fhp) - fh_put(ref_fh); ref_fh = NULL; switch (version) { @@ -447,58 +454,66 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, goto retry; } - /* Need to check that this type works for this - * export point. As the fsid -> filesystem mapping - * was guided by user-space, there is no guarantee - * that the filesystem actually supports that fsid - * type. If it doesn't we loop around again without - * ref_fh set. + /* + * As the fsid -> filesystem mapping was guided by + * user-space, there is no guarantee that the filesystem + * actually supports that fsid type. If it doesn't we + * loop around again without ref_fh set. */ - switch(fsid_type) { - case FSID_DEV: - if (!old_valid_dev(ex_dev)) - goto retry; - /* FALL THROUGH */ - case FSID_MAJOR_MINOR: - case FSID_ENCODE_DEV: - if (!(exp->ex_path.dentry->d_inode->i_sb->s_type->fs_flags - & FS_REQUIRES_DEV)) - goto retry; - break; - case FSID_NUM: - if (! (exp->ex_flags & NFSEXP_FSID)) - goto retry; - break; - case FSID_UUID8: - case FSID_UUID16: - if (!root_export) - goto retry; - /* fall through */ - case FSID_UUID4_INUM: - case FSID_UUID16_INUM: - if (exp->ex_uuid == NULL) - goto retry; - break; - } + if (!fsid_type_ok_for_exp(fsid_type, exp)) + goto retry; } else if (exp->ex_flags & NFSEXP_FSID) { fsid_type = FSID_NUM; } else if (exp->ex_uuid) { if (fhp->fh_maxsize >= 64) { - if (root_export) + if (is_root_export(exp)) fsid_type = FSID_UUID16; else fsid_type = FSID_UUID16_INUM; } else { - if (root_export) + if (is_root_export(exp)) fsid_type = FSID_UUID8; else fsid_type = FSID_UUID4_INUM; } - } else if (!old_valid_dev(ex_dev)) + } else if (!old_valid_dev(exp_sb(exp)->s_dev)) /* for newer device numbers, we must use a newer fsid format */ fsid_type = FSID_ENCODE_DEV; else fsid_type = FSID_DEV; + fhp->fh_handle.fh_version = version; + if (version) + fhp->fh_handle.fh_fsid_type = fsid_type; +} + +__be32 +fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, + struct svc_fh *ref_fh) +{ + /* ref_fh is a reference file handle. + * if it is non-null and for the same filesystem, then we should compose + * a filehandle which is of the same version, where possible. + * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca + * Then create a 32byte filehandle using nfs_fhbase_old + * + */ + + struct inode * inode = dentry->d_inode; + struct dentry *parent = dentry->d_parent; + __u32 *datap; + dev_t ex_dev = exp_sb(exp)->s_dev; + + dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", + MAJOR(ex_dev), MINOR(ex_dev), + (long) exp->ex_path.dentry->d_inode->i_ino, + parent->d_name.name, dentry->d_name.name, + (inode ? inode->i_ino : 0)); + + /* Choose filehandle version and fsid type based on + * the reference filehandle (if it is in the same export) + * or the export options. + */ + set_version_and_fsid_type(fhp, exp, ref_fh); if (ref_fh == fhp) fh_put(ref_fh); @@ -516,7 +531,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, fhp->fh_export = exp; cache_get(&exp->h); - if (version == 0xca) { + if (fhp->fh_handle.fh_version == 0xca) { /* old style filehandle please */ memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); fhp->fh_handle.fh_size = NFS_FHSIZE; @@ -530,22 +545,22 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, _fh_update_old(dentry, exp, &fhp->fh_handle); } else { int len; - fhp->fh_handle.fh_version = 1; fhp->fh_handle.fh_auth_type = 0; datap = fhp->fh_handle.fh_auth+0; - fhp->fh_handle.fh_fsid_type = fsid_type; - mk_fsid(fsid_type, datap, ex_dev, + mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev, exp->ex_path.dentry->d_inode->i_ino, exp->ex_fsid, exp->ex_uuid); - len = key_len(fsid_type); + len = key_len(fhp->fh_handle.fh_fsid_type); datap += len/4; fhp->fh_handle.fh_size = 4 + len; if (inode) _fh_update(fhp, exp, dentry); - if (fhp->fh_handle.fh_fileid_type == 255) + if (fhp->fh_handle.fh_fileid_type == 255) { + fh_put(fhp); return nfserr_opnotsupp; + } } return 0; @@ -639,8 +654,7 @@ enum fsid_source fsid_source(struct svc_fh *fhp) case FSID_DEV: case FSID_ENCODE_DEV: case FSID_MAJOR_MINOR: - if (fhp->fh_export->ex_path.dentry->d_inode->i_sb->s_type->fs_flags - & FS_REQUIRES_DEV) + if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV) return FSIDSOURCE_DEV; break; case FSID_NUM: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 24d58adfe5f..67ea83eedd4 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -34,6 +34,7 @@ #include <linux/nfsd/syscall.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> +#include <linux/seq_file.h> #define NFSDDBG_FACILITY NFSDDBG_SVC @@ -66,6 +67,16 @@ struct timeval nfssvc_boot; DEFINE_MUTEX(nfsd_mutex); struct svc_serv *nfsd_serv; +/* + * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. + * nfsd_drc_max_pages limits the total amount of memory available for + * version 4.1 DRC caches. + * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. + */ +spinlock_t nfsd_drc_lock; +unsigned int nfsd_drc_max_mem; +unsigned int nfsd_drc_mem_used; + #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static struct svc_version * nfsd_acl_version[] = { @@ -235,13 +246,12 @@ void nfsd_reset_versions(void) */ static void set_max_drc(void) { - /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ - #define NFSD_DRC_SIZE_SHIFT 7 - nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() - >> NFSD_DRC_SIZE_SHIFT; - nfsd_serv->sv_drc_pages_used = 0; - dprintk("%s svc_drc_max_pages %u\n", __func__, - nfsd_serv->sv_drc_max_pages); + #define NFSD_DRC_SIZE_SHIFT 10 + nfsd_drc_max_mem = (nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; + nfsd_drc_mem_used = 0; + spin_lock_init(&nfsd_drc_lock); + dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); } int nfsd_create_serv(void) @@ -401,7 +411,9 @@ nfsd_svc(unsigned short port, int nrservs) error = nfsd_racache_init(2*nrservs); if (error<0) goto out; - nfs4_state_start(); + error = nfs4_state_start(); + if (error) + goto out; nfsd_reset_versions(); @@ -569,10 +581,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) + rqstp->rq_res.head[0].iov_len; rqstp->rq_res.head[0].iov_len += sizeof(__be32); - /* NFSv4.1 DRC requires statp */ - if (rqstp->rq_vers == 4) - nfsd4_set_statp(rqstp, statp); - /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); @@ -607,7 +615,25 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) int nfsd_pool_stats_open(struct inode *inode, struct file *file) { - if (nfsd_serv == NULL) + int ret; + mutex_lock(&nfsd_mutex); + if (nfsd_serv == NULL) { + mutex_unlock(&nfsd_mutex); return -ENODEV; - return svc_pool_stats_open(nfsd_serv, file); + } + /* bump up the psudo refcount while traversing */ + svc_get(nfsd_serv); + ret = svc_pool_stats_open(nfsd_serv, file); + mutex_unlock(&nfsd_mutex); + return ret; +} + +int nfsd_pool_stats_release(struct inode *inode, struct file *file) +{ + int ret = seq_release(inode, file); + mutex_lock(&nfsd_mutex); + /* this function really, really should have been called svc_put() */ + svc_destroy(nfsd_serv); + mutex_unlock(&nfsd_mutex); + return ret; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8fa09bfbcba..a293f027326 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -89,6 +89,12 @@ struct raparm_hbucket { #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; +static inline int +nfsd_v4client(struct svc_rqst *rq) +{ + return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} + /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. @@ -115,7 +121,8 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, path_put(&path); goto out; } - if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { + if (nfsd_v4client(rqstp) || + (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ /* * This is subtle: path.dentry is *not* on path.mnt diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index c668bca579c..6a2711f4c32 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -46,7 +46,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc) INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); } -static struct address_space_operations def_btnode_aops = { +static const struct address_space_operations def_btnode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 6bd84a0d823..fc8278c77cd 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -151,7 +151,7 @@ struct file_operations nilfs_file_operations = { .splice_read = generic_file_splice_read, }; -struct inode_operations nilfs_file_inode_operations = { +const struct inode_operations nilfs_file_inode_operations = { .truncate = nilfs_truncate, .setattr = nilfs_setattr, .permission = nilfs_permission, diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 1b3c2bb20da..e6de0a27ab5 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -52,7 +52,7 @@ #include "dat.h" #include "ifile.h" -static struct address_space_operations def_gcinode_aops = { +static const struct address_space_operations def_gcinode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 807e584b163..2d2c501deb5 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -238,7 +238,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return size; } -struct address_space_operations nilfs_aops = { +const struct address_space_operations nilfs_aops = { .writepage = nilfs_writepage, .readpage = nilfs_readpage, .sync_page = block_sync_page, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 156bf6091a9..b18c4998f8d 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -427,12 +427,12 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) } -static struct address_space_operations def_mdt_aops = { +static const struct address_space_operations def_mdt_aops = { .writepage = nilfs_mdt_write_page, .sync_page = block_sync_page, }; -static struct inode_operations def_mdt_iops; +static const struct inode_operations def_mdt_iops; static struct file_operations def_mdt_fops; /* diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index df70dadb336..ed02e886fa7 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -448,7 +448,7 @@ out: return err; } -struct inode_operations nilfs_dir_inode_operations = { +const struct inode_operations nilfs_dir_inode_operations = { .create = nilfs_create, .lookup = nilfs_lookup, .link = nilfs_link, @@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_operations = { .permission = nilfs_permission, }; -struct inode_operations nilfs_special_inode_operations = { +const struct inode_operations nilfs_special_inode_operations = { .setattr = nilfs_setattr, .permission = nilfs_permission, }; -struct inode_operations nilfs_symlink_inode_operations = { +const struct inode_operations nilfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 724c63766e8..bad7368782d 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -295,12 +295,12 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *); * Inodes and files operations */ extern struct file_operations nilfs_dir_operations; -extern struct inode_operations nilfs_file_inode_operations; +extern const struct inode_operations nilfs_file_inode_operations; extern struct file_operations nilfs_file_operations; -extern struct address_space_operations nilfs_aops; -extern struct inode_operations nilfs_dir_inode_operations; -extern struct inode_operations nilfs_special_inode_operations; -extern struct inode_operations nilfs_symlink_inode_operations; +extern const struct address_space_operations nilfs_aops; +extern const struct inode_operations nilfs_dir_inode_operations; +extern const struct inode_operations nilfs_special_inode_operations; +extern const struct inode_operations nilfs_symlink_inode_operations; /* * filesystem type diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 55f3d6b6073..644e66727dd 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -504,7 +504,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) return 0; } -static struct super_operations nilfs_sops = { +static const struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, .destroy_inode = nilfs_destroy_inode, .dirty_inode = nilfs_dirty_inode, @@ -560,7 +560,7 @@ nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, nilfs_nfs_get_inode); } -static struct export_operations nilfs_export_ops = { +static const struct export_operations nilfs_export_ops = { .fh_to_dentry = nilfs_fh_to_dentry, .fh_to_parent = nilfs_fh_to_parent, .get_parent = nilfs_get_parent, diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 50931b1ce4b..8b2549f672b 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -829,7 +829,7 @@ enum { /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask - is used to to obtain all flags that are valid for setting. */ + is used to obtain all flags that are valid for setting. */ /* * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index cd0be3f5c3c..a44b14cbcee 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); /* return (void *)__get_free_page(gfp_mask); */ } - if (likely(size >> PAGE_SHIFT < num_physpages)) + if (likely((size >> PAGE_SHIFT) < totalram_pages)) return __vmalloc(size, gfp_mask, PAGE_KERNEL); return NULL; } diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 3fb96fcd4c8..e5df9d170b0 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); int ocfs2_read_quota_block(struct inode *inode, u64 v_block, struct buffer_head **bh); -extern struct dquot_operations ocfs2_quota_operations; +extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; int ocfs2_quota_setup(void); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 44f2a5e1d04..3cf0ec0acdd 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -154,7 +154,7 @@ static int ocfs2_get_quota_block(struct inode *inode, int block, err = -EIO; mlog_errno(err); } - return err;; + return err; } /* Read data from global quotafile - avoid pagecache and such because we cannot @@ -849,7 +849,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) kmem_cache_free(ocfs2_dquot_cachep, dquot); } -struct dquot_operations ocfs2_quota_operations = { +const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a3f8871d21f..faca4720aa4 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -965,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount) return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); } -static struct quotactl_ops ocfs2_quotactl_ops = { +static const struct quotactl_ops ocfs2_quotactl_ops = { .quota_on = ocfs2_quota_on, .quota_off = ocfs2_quota_off, .quota_sync = vfs_quota_sync, diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index c7275cfbdcf..3680bae335b 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -489,7 +489,7 @@ out: return ret; } -struct inode_operations omfs_dir_inops = { +const struct inode_operations omfs_dir_inops = { .lookup = omfs_lookup, .mkdir = omfs_mkdir, .rename = omfs_rename, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d17e774eaf4..4845fbb18e6 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -333,11 +333,11 @@ struct file_operations omfs_file_operations = { .splice_read = generic_file_splice_read, }; -struct inode_operations omfs_file_inops = { +const struct inode_operations omfs_file_inops = { .truncate = omfs_truncate }; -struct address_space_operations omfs_aops = { +const struct address_space_operations omfs_aops = { .readpage = omfs_readpage, .readpages = omfs_readpages, .writepage = omfs_writepage, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 379ae5fb441..f3b7c1541f3 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct super_operations omfs_sops = { +static const struct super_operations omfs_sops = { .write_inode = omfs_write_inode, .delete_inode = omfs_delete_inode, .put_super = omfs_put_super, diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h index 2bc0f067040..df71039945a 100644 --- a/fs/omfs/omfs.h +++ b/fs/omfs/omfs.h @@ -45,15 +45,15 @@ extern int omfs_clear_range(struct super_block *sb, u64 block, int count); /* dir.c */ extern struct file_operations omfs_dir_operations; -extern struct inode_operations omfs_dir_inops; +extern const struct inode_operations omfs_dir_inops; extern int omfs_make_empty(struct inode *inode, struct super_block *sb); extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, u64 fsblock); /* file.c */ extern struct file_operations omfs_file_operations; -extern struct inode_operations omfs_file_inops; -extern struct address_space_operations omfs_aops; +extern const struct inode_operations omfs_file_inops; +extern const struct address_space_operations omfs_aops; extern void omfs_make_empty_table(struct buffer_head *bh, int offset); extern int omfs_shrink_inode(struct inode *inode); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index fbeaddf595d..7b685e10cba 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -581,7 +581,7 @@ try_scan: } if (from + size > get_capacity(disk)) { - struct block_device_operations *bdops = disk->fops; + const struct block_device_operations *bdops = disk->fops; unsigned long long capacity; printk(KERN_WARNING diff --git a/fs/proc/base.c b/fs/proc/base.c index 6f742f6658a..55c4c805a75 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer) do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); - points = badness(task, uptime.tv_sec); + points = badness(task->group_leader, uptime.tv_sec); read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } @@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust; + int oom_adjust = OOM_DISABLE; + unsigned long flags; if (!task) return -ESRCH; - oom_adjust = task->oomkilladj; + + if (lock_task_sighand(task, &flags)) { + oom_adjust = task->signal->oom_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1015,32 +1021,44 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF], *end; - int oom_adjust; + char buffer[PROC_NUMBUF]; + long oom_adjust; + unsigned long flags; + int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - oom_adjust = simple_strtol(buffer, &end, 0); + + err = strict_strtol(strstrip(buffer), 0, &oom_adjust); + if (err) + return -EINVAL; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && oom_adjust != OOM_DISABLE) return -EINVAL; - if (*end == '\n') - end++; + task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { + if (!lock_task_sighand(task, &flags)) { + put_task_struct(task); + return -ESRCH; + } + + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { + unlock_task_sighand(task, &flags); put_task_struct(task); return -EACCES; } - task->oomkilladj = oom_adjust; + + task->signal->oom_adj = oom_adjust; + + unlock_task_sighand(task, &flags); put_task_struct(task); - if (end - buffer == 0) - return -EIO; - return end - buffer; + + return count; } static const struct file_operations proc_oom_adjust_operations = { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 59b43a06887..f06f45b4218 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -328,43 +328,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) return -EFAULT; } else if (is_vmalloc_addr((void *)start)) { char * elf_buf; - struct vm_struct *m; - unsigned long curstart = start; - unsigned long cursize = tsz; elf_buf = kzalloc(tsz, GFP_KERNEL); if (!elf_buf) return -ENOMEM; - - read_lock(&vmlist_lock); - for (m=vmlist; m && cursize; m=m->next) { - unsigned long vmstart; - unsigned long vmsize; - unsigned long msize = m->size - PAGE_SIZE; - - if (((unsigned long)m->addr + msize) < - curstart) - continue; - if ((unsigned long)m->addr > (curstart + - cursize)) - break; - vmstart = (curstart < (unsigned long)m->addr ? - (unsigned long)m->addr : curstart); - if (((unsigned long)m->addr + msize) > - (curstart + cursize)) - vmsize = curstart + cursize - vmstart; - else - vmsize = (unsigned long)m->addr + - msize - vmstart; - curstart = vmstart + vmsize; - cursize -= vmsize; - /* don't dump ioremap'd stuff! (TA) */ - if (m->flags & VM_IOREMAP) - continue; - memcpy(elf_buf + (vmstart - start), - (char *)vmstart, vmsize); - } - read_unlock(&vmlist_lock); + vread(elf_buf, (char *)start, tsz); + /* we have to zero-fill user buffer even if no read */ if (copy_to_user(buffer, elf_buf, tsz)) { kfree(elf_buf); return -EFAULT; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d5c410d47fa..171e052c07b 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -81,9 +81,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Writeback: %8lu kB\n" "AnonPages: %8lu kB\n" "Mapped: %8lu kB\n" + "Shmem: %8lu kB\n" "Slab: %8lu kB\n" "SReclaimable: %8lu kB\n" "SUnreclaim: %8lu kB\n" + "KernelStack: %8lu kB\n" "PageTables: %8lu kB\n" #ifdef CONFIG_QUICKLIST "Quicklists: %8lu kB\n" @@ -124,10 +126,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), + K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, K(global_page_state(NR_PAGETABLE)), #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), diff --git a/fs/proc/page.c b/fs/proc/page.c index 2707c6c7a20..2281c2cbfe2 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -2,6 +2,7 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/init.h> +#include <linux/ksm.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/proc_fs.h> @@ -95,6 +96,8 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_UNEVICTABLE 18 #define KPF_NOPAGE 20 +#define KPF_KSM 21 + /* kernel hacking assistances * WARNING: subject to change, never rely on them! */ @@ -137,6 +140,8 @@ static u64 get_uflags(struct page *page) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; + if (PageKsm(page)) + u |= 1 << KPF_KSM; /* * compound pages: export both head/tail info diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9bd8be1d235..59e98fea34a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -465,6 +465,10 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } +#define CLEAR_REFS_ALL 1 +#define CLEAR_REFS_ANON 2 +#define CLEAR_REFS_MAPPED 3 + static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -472,13 +476,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF], *end; struct mm_struct *mm; struct vm_area_struct *vma; + int type; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - if (!simple_strtol(buffer, &end, 0)) + type = simple_strtol(buffer, &end, 0); + if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) return -EINVAL; if (*end == '\n') end++; @@ -494,9 +500,23 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { clear_refs_walk.private = vma; - if (!is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_start, vma->vm_end, - &clear_refs_walk); + if (is_vm_hugetlb_page(vma)) + continue; + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * + * Writing 2 to /proc/pid/clear_refs only affects + * Anonymous pages. + * + * Writing 3 to /proc/pid/clear_refs only affects file + * mapped pages. + */ + if (type == CLEAR_REFS_ANON && vma->vm_file) + continue; + if (type == CLEAR_REFS_MAPPED && !vma->vm_file) + continue; + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); } flush_tlb_mm(mm); up_read(&mm->mmap_sem); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 38f7bd559f3..39b49c42a7e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1839,7 +1839,7 @@ EXPORT_SYMBOL(dquot_commit_info); /* * Definitions of diskquota operations. */ -struct dquot_operations dquot_operations = { +const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, @@ -2461,7 +2461,7 @@ out: } EXPORT_SYMBOL(vfs_set_dqinfo); -struct quotactl_ops vfs_quotactl_ops = { +const struct quotactl_ops vfs_quotactl_ops = { .quota_on = vfs_quota_on, .quota_off = vfs_quota_off, .quota_sync = vfs_quota_sync, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7adea74d6a8..f0ad05f3802 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -612,7 +612,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); static int reiserfs_quota_on(struct super_block *, int, int, char *, int); -static struct dquot_operations reiserfs_quota_operations = { +static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, @@ -629,7 +629,7 @@ static struct dquot_operations reiserfs_quota_operations = { .destroy_dquot = dquot_destroy, }; -static struct quotactl_ops reiserfs_qctl_operations = { +static const struct quotactl_ops reiserfs_qctl_operations = { .quota_on = reiserfs_quota_on, .quota_off = vfs_quota_off, .quota_sync = vfs_quota_sync, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 4ab3c03d8f9..47f132df0c3 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -284,7 +284,7 @@ static const struct file_operations romfs_dir_operations = { .readdir = romfs_readdir, }; -static struct inode_operations romfs_dir_inode_operations = { +static const struct inode_operations romfs_dir_inode_operations = { .lookup = romfs_lookup, }; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index cb5fc57e370..6c197ef53ad 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -44,7 +44,7 @@ #include "squashfs.h" static struct file_system_type squashfs_fs_type; -static struct super_operations squashfs_super_ops; +static const struct super_operations squashfs_super_ops; static int supported_squashfs_filesystem(short major, short minor, short comp) { @@ -444,7 +444,7 @@ static struct file_system_type squashfs_fs_type = { .fs_flags = FS_REQUIRES_DEV }; -static struct super_operations squashfs_super_ops = { +static const struct super_operations squashfs_super_ops = { .alloc_inode = squashfs_alloc_inode, .destroy_inode = squashfs_destroy_inode, .statfs = squashfs_statfs, diff --git a/fs/super.c b/fs/super.c index b03fea8fbfb..0e7207b9815 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ DEFINE_SPINLOCK(sb_lock); static struct super_block *alloc_super(struct file_system_type *type) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); - static struct super_operations default_op; + static const struct super_operations default_op; if (s) { if (security_sb_alloc(s)) { diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 7998cc37825..195830f4756 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -79,7 +79,7 @@ enum { }; static const struct inode_operations none_inode_operations; -static struct address_space_operations none_address_operations; +static const struct address_space_operations none_address_operations; static const struct file_operations none_file_operations; /** diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index cb6e2cca214..9e41f91aa26 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -150,7 +150,7 @@ xfs_fs_set_xquota( return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); } -struct quotactl_ops xfs_quotactl_operations = { +const struct quotactl_ops xfs_quotactl_operations = { .quota_sync = xfs_fs_quota_sync, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 5d7c60ac77b..bdd41c8c342 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -67,7 +67,7 @@ #include <linux/freezer.h> #include <linux/parser.h> -static struct super_operations xfs_super_operations; +static const struct super_operations xfs_super_operations; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; @@ -1536,7 +1536,7 @@ xfs_fs_get_sb( mnt); } -static struct super_operations xfs_super_operations = { +static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .write_inode = xfs_fs_write_inode, diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 5a2ea3a2178..18175ebd58e 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; extern struct xattr_handler *xfs_xattr_handlers[]; -extern struct quotactl_ops xfs_quotactl_operations; +extern const struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c4ea51b55dc..f52ac276277 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -117,7 +117,7 @@ struct getbmapx { #define BMV_IF_VALID \ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) -/* bmv_oflags values - returned for for each non-header segment */ +/* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ #define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ #define BMV_OF_LAST 0x4 /* segment is the last in the file */ diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 37ba576d06e..8052236d1a3 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -288,7 +288,7 @@ typedef u32 acpi_physical_address; /* * Some compilers complain about unused variables. Sometimes we don't want to * use all the variables (for example, _acpi_module_name). This allows us - * to to tell the compiler in a per-variable manner that a variable + * to tell the compiler in a per-variable manner that a variable * is unused */ #ifndef ACPI_UNUSED_VAR diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h index 935c5d7fc86..6aadbf84ae7 100644 --- a/include/acpi/platform/acgcc.h +++ b/include/acpi/platform/acgcc.h @@ -57,7 +57,7 @@ /* * Some compilers complain about unused variables. Sometimes we don't want to * use all the variables (for example, _acpi_module_name). This allows us - * to to tell the compiler warning in a per-variable manner that a variable + * to tell the compiler warning in a per-variable manner that a variable * is unused. */ #define ACPI_UNUSED_VAR __attribute__ ((unused)) diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index 3b69ad34189..dd63bd38864 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -35,6 +35,9 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ +#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h index 7cab4de2bca..32c8bd6a196 100644 --- a/include/asm-generic/mman.h +++ b/include/asm-generic/mman.h @@ -11,6 +11,7 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index bc3ab707369..dd97fb8408a 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -132,9 +132,6 @@ static inline void *alloc_remap(int nid, unsigned long size) } #endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */ -extern unsigned long __meminitdata nr_kernel_pages; -extern unsigned long __meminitdata nr_all_pages; - extern void *alloc_large_system_hash(const char *tablename, unsigned long bucketsize, unsigned long numentries, @@ -145,6 +142,8 @@ extern void *alloc_large_system_hash(const char *tablename, unsigned long limit); #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ +#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min + * shift passed via *_hash_shift */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. diff --git a/include/linux/capability.h b/include/linux/capability.h index c3021105edc..c8f2a5f70ed 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -7,7 +7,7 @@ * * See here for the libcap library ("POSIX draft" compliance): * - * ftp://linux.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ + * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h index 45ff1849151..1d747f72298 100644 --- a/include/linux/flex_array.h +++ b/include/linux/flex_array.h @@ -31,10 +31,32 @@ struct flex_array { }; }; -#define FLEX_ARRAY_INIT(size, total) { { {\ - .element_size = (size), \ - .total_nr_elements = (total), \ -} } } +/* Number of bytes left in base struct flex_array, excluding metadata */ +#define FLEX_ARRAY_BASE_BYTES_LEFT \ + (FLEX_ARRAY_BASE_SIZE - offsetof(struct flex_array, parts)) + +/* Number of pointers in base to struct flex_array_part pages */ +#define FLEX_ARRAY_NR_BASE_PTRS \ + (FLEX_ARRAY_BASE_BYTES_LEFT / sizeof(struct flex_array_part *)) + +/* Number of elements of size that fit in struct flex_array_part */ +#define FLEX_ARRAY_ELEMENTS_PER_PART(size) \ + (FLEX_ARRAY_PART_SIZE / size) + +/* + * Defines a statically allocated flex array and ensures its parameters are + * valid. + */ +#define DEFINE_FLEX_ARRAY(__arrayname, __element_size, __total) \ + struct flex_array __arrayname = { { { \ + .element_size = (__element_size), \ + .total_nr_elements = (__total), \ + } } }; \ + static inline void __arrayname##_invalid_parameter(void) \ + { \ + BUILD_BUG_ON((__total) > FLEX_ARRAY_NR_BASE_PTRS * \ + FLEX_ARRAY_ELEMENTS_PER_PART(__element_size)); \ + } struct flex_array *flex_array_alloc(int element_size, unsigned int total, gfp_t flags); @@ -44,6 +66,8 @@ void flex_array_free(struct flex_array *fa); void flex_array_free_parts(struct flex_array *fa); int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, gfp_t flags); +int flex_array_clear(struct flex_array *fa, unsigned int element_nr); void *flex_array_get(struct flex_array *fa, unsigned int element_nr); +int flex_array_shrink(struct flex_array *fa); #endif /* _FLEX_ARRAY_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 90162fb3bf0..51803528b09 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1066,8 +1066,8 @@ struct file_lock { struct fasync_struct * fl_fasync; /* for lease break notifications */ unsigned long fl_break_time; /* for nonblocking lease breaks */ - struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ - struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ + const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ + const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; @@ -1318,8 +1318,8 @@ struct super_block { unsigned long long s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; - struct dquot_operations *dq_op; - struct quotactl_ops *s_qcop; + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_magic; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 109d179adb9..297df45ffd0 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -151,7 +151,7 @@ struct gendisk { struct disk_part_tbl *part_tbl; struct hd_struct part0; - struct block_device_operations *fops; + const struct block_device_operations *fops; struct request_queue *queue; void *private_data; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7c777a0da17..f53e9b868c2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -326,7 +326,6 @@ void free_pages_exact(void *virt, size_t size); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_page(struct page *page); -extern void free_cold_page(struct page *page); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) @@ -336,18 +335,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); -extern bool oom_killer_disabled; - -static inline void oom_killer_disable(void) -{ - oom_killer_disabled = true; -} - -static inline void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} - extern gfp_t gfp_allowed_mask; static inline void set_gfp_allowed_mask(gfp_t mask) diff --git a/include/linux/hid.h b/include/linux/hid.h index a0ebdace7ba..10f62841674 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -494,6 +494,7 @@ struct hid_device { /* device report descriptor */ /* hiddev event handler */ int (*hiddev_connect)(struct hid_device *, unsigned int); + void (*hiddev_disconnect)(struct hid_device *); void (*hiddev_hid_event) (struct hid_device *, struct hid_field *field, struct hid_usage *, __s32); void (*hiddev_report_event) (struct hid_device *, struct hid_report *); @@ -691,6 +692,7 @@ struct hid_device *hid_allocate_device(void); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); +void hid_disconnect(struct hid_device *hid); /** * hid_map_usage - map usage input bits @@ -800,6 +802,7 @@ static inline int __must_check hid_hw_start(struct hid_device *hdev, */ static inline void hid_hw_stop(struct hid_device *hdev) { + hid_disconnect(hdev); hdev->ll_driver->stop(hdev); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5cbc620bdfe..176e7ee73ef 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -24,7 +24,9 @@ int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user * int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); -int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int, int); +int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, + struct page **, struct vm_area_struct **, + unsigned long *, int *, int, unsigned int flags); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *); void __unmap_hugepage_range(struct vm_area_struct *, @@ -110,6 +112,21 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #endif /* !CONFIG_HUGETLB_PAGE */ +#define HUGETLB_ANON_FILE "anon_hugepage" + +enum { + /* + * The file will be used as an shm file so shmfs accounting rules + * apply + */ + HUGETLB_SHMFS_INODE = 1, + /* + * The file is being created on the internal vfs mount and shmfs + * accounting rules do not apply + */ + HUGETLB_ANONHUGE_INODE = 2, +}; + #ifdef CONFIG_HUGETLBFS struct hugetlbfs_config { uid_t uid; @@ -148,7 +165,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, int acct, - struct user_struct **user); + struct user_struct **user, int creat_flags); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); @@ -170,7 +187,7 @@ static inline void set_file_hugepages(struct file *file) #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() -#define hugetlb_file_setup(name,size,acct,user) ERR_PTR(-ENOSYS) +#define hugetlb_file_setup(name,size,acct,user,creat) ERR_PTR(-ENOSYS) #endif /* !CONFIG_HUGETLBFS */ @@ -185,7 +202,8 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { - int hugetlb_next_nid; + int next_nid_to_alloc; + int next_nid_to_free; unsigned int order; unsigned long mask; unsigned long max_huge_pages; diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index dc2fd545db0..c8006607f94 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -144,7 +144,10 @@ static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) int name##_end[0]; #define kmemcheck_annotate_bitfield(ptr, name) \ - do if (ptr) { \ + do { \ + if (!ptr) \ + break; \ + \ int _n = (long) &((ptr)->name##_end) \ - (long) &((ptr)->name##_begin); \ BUILD_BUG_ON(_n < 0); \ diff --git a/include/linux/ksm.h b/include/linux/ksm.h new file mode 100644 index 00000000000..a485c14ecd5 --- /dev/null +++ b/include/linux/ksm.h @@ -0,0 +1,79 @@ +#ifndef __LINUX_KSM_H +#define __LINUX_KSM_H +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork(). + */ + +#include <linux/bitops.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/vmstat.h> + +#ifdef CONFIG_KSM +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); +int __ksm_enter(struct mm_struct *mm); +void __ksm_exit(struct mm_struct *mm); + +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + return __ksm_enter(mm); + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + __ksm_exit(mm); +} + +/* + * A KSM page is one of those write-protected "shared pages" or "merged pages" + * which KSM maps into multiple mms, wherever identical anonymous page content + * is found in VM_MERGEABLE vmas. It's a PageAnon page, with NULL anon_vma. + */ +static inline int PageKsm(struct page *page) +{ + return ((unsigned long)page->mapping == PAGE_MAPPING_ANON); +} + +/* + * But we have to avoid the checking which page_add_anon_rmap() performs. + */ +static inline void page_add_ksm_rmap(struct page *page) +{ + if (atomic_inc_and_test(&page->_mapcount)) { + page->mapping = (void *) PAGE_MAPPING_ANON; + __inc_zone_page_state(page, NR_ANON_PAGES); + } +} +#else /* !CONFIG_KSM */ + +static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + return 0; +} + +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ +} + +static inline int PageKsm(struct page *page) +{ + return 0; +} + +/* No stub required for page_add_ksm_rmap(page) */ +#endif /* !CONFIG_KSM */ + +#endif diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h index ad651f4e45a..3cc2f2c53e4 100644 --- a/include/linux/lis3lv02d.h +++ b/include/linux/lis3lv02d.h @@ -32,8 +32,17 @@ struct lis3lv02d_platform_data { #define LIS3_IRQ2_DATA_READY (4 << 3) #define LIS3_IRQ2_CLICK (7 << 3) #define LIS3_IRQ_OPEN_DRAIN (1 << 6) -#define LIS3_IRQ_ACTIVE_HIGH (1 << 7) +#define LIS3_IRQ_ACTIVE_LOW (1 << 7) unsigned char irq_cfg; + +#define LIS3_WAKEUP_X_LO (1 << 0) +#define LIS3_WAKEUP_X_HI (1 << 1) +#define LIS3_WAKEUP_Y_LO (1 << 2) +#define LIS3_WAKEUP_Y_HI (1 << 3) +#define LIS3_WAKEUP_Z_LO (1 << 4) +#define LIS3_WAKEUP_Z_HI (1 << 5) + unsigned char wakeup_flags; + unsigned char wakeup_thresh; }; #endif /* __LIS3LV02D_H_ */ diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index c325b187966..a34dea46b62 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -338,49 +338,6 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) } } -static inline int __nlm_cmp_addr4(const struct sockaddr *sap1, - const struct sockaddr *sap2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; - return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, - const struct sockaddr *sap2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); -} -#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ -static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, - const struct sockaddr *sap2) -{ - return 0; -} -#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ - -/* - * Compare two host addresses - * - * Return TRUE if the addresses are the same; otherwise FALSE. - */ -static inline int nlm_cmp_addr(const struct sockaddr *sap1, - const struct sockaddr *sap2) -{ - if (sap1->sa_family == sap2->sa_family) { - switch (sap1->sa_family) { - case AF_INET: - return __nlm_cmp_addr4(sap1, sap2); - case AF_INET6: - return __nlm_cmp_addr6(sap1, sap2); - } - } - return 0; -} - /* * Compare two NLM locks. * When the second lock is of type F_UNLCK, this acts like a wildcard. @@ -395,7 +352,7 @@ static inline int nlm_compare_locks(const struct file_lock *fl1, &&(fl1->fl_type == fl2->fl_type || fl2->fl_type == F_UNLCK); } -extern struct lock_manager_operations nlmsvc_lock_operations; +extern const struct lock_manager_operations nlmsvc_lock_operations; #endif /* __KERNEL__ */ diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index 536ca12442c..78c3bed1c3f 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -104,7 +104,7 @@ #define DL_UNITDATA_IND 0x3108 #define DL_INFORMATION_IND 0x0008 -/* intern layer 2 managment */ +/* intern layer 2 management */ #define MDL_ASSIGN_REQ 0x1804 #define MDL_ASSIGN_IND 0x1904 #define MDL_REMOVE_REQ 0x1A04 diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 9be484d1128..7c08052e332 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -47,22 +47,16 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) } /* - * 2 mempool_alloc_t's and a mempool_free_t to kmalloc/kzalloc and kfree - * the amount of memory specified by pool_data + * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the + * amount of memory specified by pool_data */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data); -void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data); void mempool_kfree(void *element, void *pool_data); static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) { return mempool_create(min_nr, mempool_kmalloc, mempool_kfree, (void *) size); } -static inline mempool_t *mempool_create_kzalloc_pool(int min_nr, size_t size) -{ - return mempool_create(min_nr, mempool_kzalloc, mempool_kfree, - (void *) size); -} /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a72cc78e6b..5946e2ff9fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -25,6 +25,7 @@ extern unsigned long max_mapnr; #endif extern unsigned long num_physpages; +extern unsigned long totalram_pages; extern void * high_memory; extern int page_cluster; @@ -103,6 +104,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ +#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -700,17 +702,8 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(void); -#ifdef CONFIG_SHMEM -extern int shmem_lock(struct file *file, int lock, struct user_struct *user); -#else -static inline int shmem_lock(struct file *file, int lock, - struct user_struct *user) -{ - return 0; -} -#endif +int shmem_lock(struct file *file, int lock, struct user_struct *user); struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); - int shmem_zero_setup(struct vm_area_struct *); #ifndef CONFIG_MMU @@ -815,6 +808,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page **pages, struct vm_area_struct **vmas); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +struct page *get_dump_page(unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); @@ -1058,6 +1052,8 @@ extern void setup_per_cpu_pageset(void); static inline void setup_per_cpu_pageset(void) {} #endif +extern void zone_pcp_update(struct zone *zone); + /* nommu.c */ extern atomic_long_t mmap_pages_allocated; @@ -1226,7 +1222,8 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ #define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ +#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ +#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7fbb9726755..8835b877b8d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -5,7 +5,7 @@ * page_is_file_cache - should the page be on a file LRU or anon LRU? * @page: the page to test * - * Returns LRU_FILE if @page is page cache page backed by a regular filesystem, + * Returns 1 if @page is page cache page backed by a regular filesystem, * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed. * Used by functions that manipulate the LRU lists, to sort a page * onto the right LRU list. @@ -16,11 +16,7 @@ */ static inline int page_is_file_cache(struct page *page) { - if (PageSwapBacked(page)) - return 0; - - /* The page is page cache backed by a normal filesystem. */ - return LRU_FILE; + return !PageSwapBacked(page); } static inline void @@ -39,21 +35,36 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) mem_cgroup_del_lru_list(page, l); } +/** + * page_lru_base_type - which LRU list type should a page be on? + * @page: the page to test + * + * Used for LRU list index arithmetic. + * + * Returns the base LRU type - file or anon - @page should be on. + */ +static inline enum lru_list page_lru_base_type(struct page *page) +{ + if (page_is_file_cache(page)) + return LRU_INACTIVE_FILE; + return LRU_INACTIVE_ANON; +} + static inline void del_page_from_lru(struct zone *zone, struct page *page) { - enum lru_list l = LRU_BASE; + enum lru_list l; list_del(&page->lru); if (PageUnevictable(page)) { __ClearPageUnevictable(page); l = LRU_UNEVICTABLE; } else { + l = page_lru_base_type(page); if (PageActive(page)) { __ClearPageActive(page); l += LRU_ACTIVE; } - l += page_is_file_cache(page); } __dec_zone_state(zone, NR_LRU_BASE + l); mem_cgroup_del_lru_list(page, l); @@ -68,14 +79,14 @@ del_page_from_lru(struct zone *zone, struct page *page) */ static inline enum lru_list page_lru(struct page *page) { - enum lru_list lru = LRU_BASE; + enum lru_list lru; if (PageUnevictable(page)) lru = LRU_UNEVICTABLE; else { + lru = page_lru_base_type(page); if (PageActive(page)) lru += LRU_ACTIVE; - lru += page_is_file_cache(page); } return lru; diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h new file mode 100644 index 00000000000..70fffeba749 --- /dev/null +++ b/include/linux/mmu_context.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_MMU_CONTEXT_H +#define _LINUX_MMU_CONTEXT_H + +struct mm_struct; + +void use_mm(struct mm_struct *mm); +void unuse_mm(struct mm_struct *mm); + +#endif diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b77486d152c..4e02ee2b071 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -62,6 +62,15 @@ struct mmu_notifier_ops { unsigned long address); /* + * change_pte is called in cases that pte mapping to page is changed: + * for example, when ksm remaps pte to point to a new shared page. + */ + void (*change_pte)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte); + + /* * Before this is invoked any secondary MMU is still ok to * read/write to the page previously pointed to by the Linux * pte because the page hasn't been freed yet and it won't be @@ -154,6 +163,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address); +extern void __mmu_notifier_change_pte(struct mm_struct *mm, + unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, @@ -175,6 +186,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline void mmu_notifier_change_pte(struct mm_struct *mm, + unsigned long address, pte_t pte) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_change_pte(mm, address, pte); +} + static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address) { @@ -236,6 +254,16 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __young; \ }) +#define set_pte_at_notify(__mm, __address, __ptep, __pte) \ +({ \ + struct mm_struct *___mm = __mm; \ + unsigned long ___address = __address; \ + pte_t ___pte = __pte; \ + \ + set_pte_at(___mm, ___address, __ptep, ___pte); \ + mmu_notifier_change_pte(___mm, ___address, ___pte); \ +}) + #else /* CONFIG_MMU_NOTIFIER */ static inline void mmu_notifier_release(struct mm_struct *mm) @@ -248,6 +276,11 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline void mmu_notifier_change_pte(struct mm_struct *mm, + unsigned long address, pte_t pte) +{ +} + static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address) { @@ -273,6 +306,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_young_notify ptep_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush +#define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 88959853737..652ef01be58 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -38,6 +38,7 @@ #define MIGRATE_UNMOVABLE 0 #define MIGRATE_RECLAIMABLE 1 #define MIGRATE_MOVABLE 2 +#define MIGRATE_PCPTYPES 3 /* the number of types on the pcp lists */ #define MIGRATE_RESERVE 3 #define MIGRATE_ISOLATE 4 /* can't allocate from here */ #define MIGRATE_TYPES 5 @@ -94,11 +95,15 @@ enum zone_stat_item { NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ + NR_KERNEL_STACK, + /* Second 128 byte cacheline */ NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_BOUNCE, NR_VMSCAN_WRITE, - /* Second 128 byte cacheline */ NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ + NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ + NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ + NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ @@ -165,7 +170,9 @@ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ - struct list_head list; /* the list of pages */ + + /* Lists of pages, one per migrate type stored on the pcp-lists */ + struct list_head lists[MIGRATE_PCPTYPES]; }; struct per_cpu_pageset { @@ -269,6 +276,11 @@ struct zone_reclaim_stat { */ unsigned long recent_rotated[2]; unsigned long recent_scanned[2]; + + /* + * accumulated for batching + */ + unsigned long nr_saved_scan[NR_LRU_LISTS]; }; struct zone { @@ -323,7 +335,6 @@ struct zone { spinlock_t lru_lock; struct zone_lru { struct list_head list; - unsigned long nr_saved_scan; /* accumulated for batching */ } lru[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; diff --git a/include/linux/namei.h b/include/linux/namei.h index d870ae2faed..ec0f607b364 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -40,7 +40,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - follow links at the end * - require a directory * - ending slashes ok even for nonexistent files - * - internal "there are more path compnents" flag + * - internal "there are more path components" flag * - locked when lookup done with dcache_lock held * - dentry cache is untrusted; force a real lookup */ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 33b283601f6..c4c06020810 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -234,7 +234,7 @@ enum nfs_opnum4 { Needs to be updated if more operations are defined in future.*/ #define FIRST_NFS4_OP OP_ACCESS -#define LAST_NFS4_OP OP_RELEASE_LOCKOWNER +#define LAST_NFS4_OP OP_RECLAIM_COMPLETE enum nfsstat4 { NFS4_OK = 0, diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 2b49d676d0c..03bbe903910 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -56,6 +56,9 @@ extern struct svc_version nfsd_version2, nfsd_version3, extern u32 nfsd_supported_minorversion; extern struct mutex nfsd_mutex; extern struct svc_serv *nfsd_serv; +extern spinlock_t nfsd_drc_lock; +extern unsigned int nfsd_drc_max_mem; +extern unsigned int nfsd_drc_mem_used; extern struct seq_operations nfs_exports_op; @@ -163,7 +166,7 @@ extern int nfsd_max_blksize; extern unsigned int max_delegations; int nfs4_state_init(void); void nfsd4_free_slabs(void); -void nfs4_state_start(void); +int nfs4_state_start(void); void nfs4_state_shutdown(void); time_t nfs4_lease_time(void); void nfs4_reset_lease(time_t leasetime); @@ -171,7 +174,7 @@ int nfs4_reset_recoverydir(char *recdir); #else static inline int nfs4_state_init(void) { return 0; } static inline void nfsd4_free_slabs(void) { } -static inline void nfs4_state_start(void) { } +static inline int nfs4_state_start(void) { return 0; } static inline void nfs4_state_shutdown(void) { } static inline time_t nfs4_lease_time(void) { return 0; } static inline void nfs4_reset_lease(time_t leasetime) { } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 57ab2ed0845..b38d1132418 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -60,6 +60,12 @@ typedef struct { #define si_stateownerid si_opaque.so_stateownerid #define si_fileid si_opaque.so_fileid +struct nfsd4_cb_sequence { + /* args/res */ + u32 cbs_minorversion; + struct nfs4_client *cbs_clp; +}; + struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; @@ -81,38 +87,35 @@ struct nfs4_delegation { /* client delegation callback info */ struct nfs4_cb_conn { /* SETCLIENTID info */ - u32 cb_addr; - unsigned short cb_port; + struct sockaddr_storage cb_addr; + size_t cb_addrlen; u32 cb_prog; u32 cb_minorversion; u32 cb_ident; /* minorversion 0 only */ /* RPC client info */ atomic_t cb_set; /* successful CB_NULL call */ struct rpc_clnt * cb_client; - struct rpc_cred * cb_cred; }; -/* Maximum number of slots per session. 128 is useful for long haul TCP */ -#define NFSD_MAX_SLOTS_PER_SESSION 128 -/* Maximum number of pages per slot cache entry */ -#define NFSD_PAGES_PER_SLOT 1 +/* Maximum number of slots per session. 160 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 160 /* Maximum number of operations per session compound */ #define NFSD_MAX_OPS_PER_COMPOUND 16 - -struct nfsd4_cache_entry { - __be32 ce_status; - struct kvec ce_datav; /* encoded NFSv4.1 data in rq_res.head[0] */ - struct page *ce_respages[NFSD_PAGES_PER_SLOT + 1]; - int ce_cachethis; - short ce_resused; - int ce_opcnt; - int ce_rpchdrlen; -}; +/* Maximum session per slot cache size */ +#define NFSD_SLOT_CACHE_SIZE 1024 +/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ +#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 +#define NFSD_MAX_MEM_PER_SESSION \ + (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) struct nfsd4_slot { - bool sl_inuse; - u32 sl_seqid; - struct nfsd4_cache_entry sl_cache_entry; + bool sl_inuse; + bool sl_cachethis; + u16 sl_opcnt; + u32 sl_seqid; + __be32 sl_status; + u32 sl_datalen; + char sl_data[]; }; struct nfsd4_channel_attrs { @@ -126,6 +129,25 @@ struct nfsd4_channel_attrs { u32 rdma_attrs; }; +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + u32 uid; + u32 gid; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + struct nfsd4_session { struct kref se_ref; struct list_head se_hash; /* hash by sessionid */ @@ -135,7 +157,7 @@ struct nfsd4_session { struct nfs4_sessionid se_sessionid; struct nfsd4_channel_attrs se_fchannel; struct nfsd4_channel_attrs se_bchannel; - struct nfsd4_slot se_slots[]; /* forward channel slots */ + struct nfsd4_slot *se_slots[]; /* forward channel slots */ }; static inline void @@ -180,7 +202,7 @@ struct nfs4_client { char cl_recdir[HEXDIR_LEN]; /* recovery dir */ nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ - __be32 cl_addr; /* client ipaddress */ + struct sockaddr_storage cl_addr; /* client ipaddress */ u32 cl_flavor; /* setclientid pseudoflavor */ char *cl_principal; /* setclientid principal name */ struct svc_cred cl_cred; /* setclientid principal */ @@ -192,9 +214,17 @@ struct nfs4_client { /* for nfs41 */ struct list_head cl_sessions; - struct nfsd4_slot cl_slot; /* create_session slot */ + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ u32 cl_exchange_flags; struct nfs4_sessionid cl_sessionid; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + u32 cl_cb_seq_nr; + struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */ + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ }; /* struct nfs4_client_reset @@ -345,6 +375,7 @@ extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); extern void put_nfs4_client(struct nfs4_client *clp); extern void nfs4_free_stateowner(struct kref *kref); +extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_cb_recall(struct nfs4_delegation *dp); extern void nfs4_put_delegation(struct nfs4_delegation *dp); diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 2bacf753506..73164c2b3d2 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -51,7 +51,7 @@ struct nfsd4_compound_state { /* For sessions DRC */ struct nfsd4_session *session; struct nfsd4_slot *slot; - __be32 *statp; + __be32 *datap; size_t iovlen; u32 minorversion; u32 status; @@ -366,18 +366,6 @@ struct nfsd4_exchange_id { int spa_how; }; -struct nfsd4_create_session { - clientid_t clientid; - struct nfs4_sessionid sessionid; - u32 seqid; - u32 flags; - struct nfsd4_channel_attrs fore_channel; - struct nfsd4_channel_attrs back_channel; - u32 callback_prog; - u32 uid; - u32 gid; -}; - struct nfsd4_sequence { struct nfs4_sessionid sessionid; /* request/response */ u32 seqid; /* request/response */ @@ -479,13 +467,12 @@ struct nfsd4_compoundres { static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) { struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; - return args->opcnt == 1; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; } static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) { - return !resp->cstate.slot->sl_cache_entry.ce_cachethis || - nfsd4_is_solo_sequence(resp); + return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp); } #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) diff --git a/include/linux/oom.h b/include/linux/oom.h index a7979baf1e3..6aac5fe4f6f 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -30,5 +30,16 @@ extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); +extern bool oom_killer_disabled; + +static inline void oom_killer_disable(void) +{ + oom_killer_disabled = true; +} + +static inline void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} #endif /* __KERNEL__*/ #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2b87acfc5f8..13de789f0a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -158,6 +158,9 @@ static inline int TestSetPage##uname(struct page *page) \ static inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &page->flags); } +#define __TESTCLEARFLAG(uname, lname) \ +static inline int __TestClearPage##uname(struct page *page) \ + { return __test_and_clear_bit(PG_##lname, &page->flags); } #define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname) @@ -184,6 +187,9 @@ static inline void __ClearPage##uname(struct page *page) { } #define TESTCLEARFLAG_FALSE(uname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } +#define __TESTCLEARFLAG_FALSE(uname) \ +static inline int __TestClearPage##uname(struct page *page) { return 0; } + struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) @@ -250,11 +256,11 @@ PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) - TESTSCFLAG(Mlocked, mlocked) + TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) #else #define MLOCK_PAGES 0 -PAGEFLAG_FALSE(Mlocked) - SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) +PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) + TESTCLEARFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED @@ -396,8 +402,8 @@ static inline void __ClearPageTail(struct page *page) */ #define PAGE_FLAGS_CHECK_AT_PREP ((1 << NR_PAGEFLAGS) - 1) -#endif /* !__GENERATING_BOUNDS_H */ - +#define PAGE_FLAGS_PRIVATE \ + (1 << PG_private | 1 << PG_private_2) /** * page_has_private - Determine if page has private stuff * @page: The page to be checked @@ -405,8 +411,11 @@ static inline void __ClearPageTail(struct page *page) * Determine if a page has private stuff, indicating that release routines * should be invoked upon it. */ -#define page_has_private(page) \ - ((page)->flags & ((1 << PG_private) | \ - (1 << PG_private_2))) +static inline int page_has_private(struct page *page) +{ + return !!(page->flags & PAGE_FLAGS_PRIVATE); +} + +#endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/include/linux/pnp.h b/include/linux/pnp.h index b063c7328ba..fddfafaed02 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -360,6 +360,7 @@ struct pnp_driver { unsigned int flags; int (*probe) (struct pnp_dev *dev, const struct pnp_device_id *dev_id); void (*remove) (struct pnp_dev *dev); + void (*shutdown) (struct pnp_dev *dev); int (*suspend) (struct pnp_dev *dev, pm_message_t state); int (*resume) (struct pnp_dev *dev); struct device_driver driver; diff --git a/include/linux/poison.h b/include/linux/poison.h index 6729f7dcd60..7fc194aef8c 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -65,6 +65,9 @@ #define MUTEX_DEBUG_INIT 0x11 #define MUTEX_DEBUG_FREE 0x22 +/********** lib/flex_array.c **********/ +#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */ + /********** security/ **********/ #define KEY_DESTROY 0xbd diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 26361c4c037..3ebb2315364 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -135,8 +135,8 @@ static inline int sb_any_quota_active(struct super_block *sb) /* * Operations supported for diskquotas. */ -extern struct dquot_operations dquot_operations; -extern struct quotactl_ops vfs_quotactl_ops; +extern const struct dquot_operations dquot_operations; +extern const struct quotactl_ops vfs_quotactl_ops; #define sb_dquot_ops (&dquot_operations) #define sb_quotactl_ops (&vfs_quotactl_ops) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf116d0dbf2..477841d29fc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -71,14 +71,10 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned lon void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *); -#ifdef CONFIG_DEBUG_VM -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address); -#else -static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) +static inline void page_dup_rmap(struct page *page) { atomic_inc(&page->_mapcount); } -#endif /* * Called from mm/vmscan.c to handle paging out diff --git a/include/linux/sched.h b/include/linux/sched.h index 8fe351c3914..97b10da0a3e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -140,6 +140,10 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); +extern unsigned long nr_iowait_cpu(void); +extern unsigned long this_cpu_load(void); + + extern void calc_global_load(void); extern u64 cpu_nr_migrations(int cpu); @@ -434,7 +438,9 @@ extern int get_dumpable(struct mm_struct *mm); /* dumpable bits */ #define MMF_DUMPABLE 0 /* core dump is permitted */ #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ + #define MMF_DUMPABLE_BITS 2 +#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 @@ -444,6 +450,7 @@ extern int get_dumpable(struct mm_struct *mm); #define MMF_DUMP_ELF_HEADERS 6 #define MMF_DUMP_HUGETLB_PRIVATE 7 #define MMF_DUMP_HUGETLB_SHARED 8 + #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 7 #define MMF_DUMP_FILTER_MASK \ @@ -457,6 +464,10 @@ extern int get_dumpable(struct mm_struct *mm); #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif + /* leave room for more dump flags */ +#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ + +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) struct sighand_struct { atomic_t count; @@ -632,6 +643,8 @@ struct signal_struct { unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif + + int oom_adj; /* OOM kill score adjustment (bit shift) */ }; /* Context switch must be unlocked if interrupts are to be enabled */ @@ -1214,7 +1227,6 @@ struct task_struct { * a short time */ unsigned char fpu_counter; - s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif @@ -1713,7 +1725,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_SWAPOFF 0x00080000 /* I am in swapoff */ +#define PF_OOM_ORIGIN 0x00080000 /* Allocating much memory to others */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 3f632182d8e..996df4dac7d 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -111,7 +111,7 @@ struct rpc_credops { void (*crdestroy)(struct rpc_cred *); int (*crmatch)(struct auth_cred *, struct rpc_cred *, int); - void (*crbind)(struct rpc_task *, struct rpc_cred *); + void (*crbind)(struct rpc_task *, struct rpc_cred *, int); __be32 * (*crmarshal)(struct rpc_task *, __be32 *); int (*crrefresh)(struct rpc_task *); __be32 * (*crvalidate)(struct rpc_task *, __be32 *); @@ -140,7 +140,7 @@ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred * void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); void rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int); -void rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *); +void rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int); void put_rpccred(struct rpc_cred *); void rpcauth_unbindcred(struct rpc_task *); __be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index ab3f6e90caa..8ed9642a5a7 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -22,6 +22,7 @@ #include <linux/sunrpc/timer.h> #include <asm/signal.h> #include <linux/path.h> +#include <net/ipv6.h> struct rpc_inode; @@ -113,6 +114,7 @@ struct rpc_create_args { rpc_authflavor_t authflavor; unsigned long flags; char *client_name; + struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ }; /* Values for "flags" field */ @@ -188,5 +190,117 @@ static inline void rpc_set_port(struct sockaddr *sap, #define IPV6_SCOPE_DELIMITER '%' #define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") +static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +static inline bool __rpc_copy_addr4(struct sockaddr *dst, + const struct sockaddr *src) +{ + const struct sockaddr_in *ssin = (struct sockaddr_in *) src; + struct sockaddr_in *dsin = (struct sockaddr_in *) dst; + + dsin->sin_family = ssin->sin_family; + dsin->sin_addr.s_addr = ssin->sin_addr.s_addr; + return true; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); +} + +static inline bool __rpc_copy_addr6(struct sockaddr *dst, + const struct sockaddr *src) +{ + const struct sockaddr_in6 *ssin6 = (const struct sockaddr_in6 *) src; + struct sockaddr_in6 *dsin6 = (struct sockaddr_in6 *) dst; + + dsin6->sin6_family = ssin6->sin6_family; + ipv6_addr_copy(&dsin6->sin6_addr, &ssin6->sin6_addr); + return true; +} +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + return false; +} + +static inline bool __rpc_copy_addr6(struct sockaddr *dst, + const struct sockaddr *src) +{ + return false; +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ + +/** + * rpc_cmp_addr - compare the address portion of two sockaddrs. + * @sap1: first sockaddr + * @sap2: second sockaddr + * + * Just compares the family and address portion. Ignores port, scope, etc. + * Returns true if the addrs are equal, false if they aren't. + */ +static inline bool rpc_cmp_addr(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + if (sap1->sa_family == sap2->sa_family) { + switch (sap1->sa_family) { + case AF_INET: + return __rpc_cmp_addr4(sap1, sap2); + case AF_INET6: + return __rpc_cmp_addr6(sap1, sap2); + } + } + return false; +} + +/** + * rpc_copy_addr - copy the address portion of one sockaddr to another + * @dst: destination sockaddr + * @src: source sockaddr + * + * Just copies the address portion and family. Ignores port, scope, etc. + * Caller is responsible for making certain that dst is large enough to hold + * the address in src. Returns true if address family is supported. Returns + * false otherwise. + */ +static inline bool rpc_copy_addr(struct sockaddr *dst, + const struct sockaddr *src) +{ + switch (src->sa_family) { + case AF_INET: + return __rpc_copy_addr4(dst, src); + case AF_INET6: + return __rpc_copy_addr6(dst, src); + } + return false; +} + +/** + * rpc_get_scope_id - return scopeid for a given sockaddr + * @sa: sockaddr to get scopeid from + * + * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if + * not an AF_INET6 address. + */ +static inline u32 rpc_get_scope_id(const struct sockaddr *sa) +{ + if (sa->sa_family != AF_INET6) + return 0; + + return ((struct sockaddr_in6 *) sa)->sin6_scope_id; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ea8009695c6..52e8cb0a756 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -94,8 +94,6 @@ struct svc_serv { struct module * sv_module; /* optional module to count when * adding threads */ svc_thread_fn sv_function; /* main function for threads */ - unsigned int sv_drc_max_pages; /* Total pages for DRC */ - unsigned int sv_drc_pages_used;/* DRC pages used */ #if defined(CONFIG_NFS_V4_1) struct list_head sv_cb_list; /* queue for callback requests * that arrive over the same diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 2223ae0b5ed..5f4e18b3ce7 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -65,6 +65,7 @@ struct svc_xprt { size_t xpt_locallen; /* length of address */ struct sockaddr_storage xpt_remote; /* remote peer's address */ size_t xpt_remotelen; /* length of address */ + struct rpc_wait_queue xpt_bc_pending; /* backchannel wait queue */ }; int svc_reg_xprt_class(struct svc_xprt_class *); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 04dba23c59f..1b353a76c30 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -28,6 +28,7 @@ struct svc_sock { /* private TCP part */ u32 sk_reclen; /* length of record */ u32 sk_tcplen; /* current read length */ + struct rpc_xprt *sk_bc_xprt; /* NFSv4.1 backchannel xprt */ }; /* diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c090df44257..6f9457a75b8 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -124,6 +124,23 @@ struct rpc_xprt_ops { void (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq); }; +/* + * RPC transport identifiers + * + * To preserve compatibility with the historical use of raw IP protocol + * id's for transport selection, UDP and TCP identifiers are specified + * with the previous values. No such restriction exists for new transports, + * except that they may not collide with these values (17 and 6, + * respectively). + */ +#define XPRT_TRANSPORT_BC (1 << 31) +enum xprt_transports { + XPRT_TRANSPORT_UDP = IPPROTO_UDP, + XPRT_TRANSPORT_TCP = IPPROTO_TCP, + XPRT_TRANSPORT_BC_TCP = IPPROTO_TCP | XPRT_TRANSPORT_BC, + XPRT_TRANSPORT_RDMA = 256 +}; + struct rpc_xprt { struct kref kref; /* Reference count */ struct rpc_xprt_ops * ops; /* transport methods */ @@ -179,6 +196,7 @@ struct rpc_xprt { spinlock_t reserve_lock; /* lock slot table */ u32 xid; /* Next XID value to use */ struct rpc_task * snd_task; /* Task blocked in send */ + struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ #if defined(CONFIG_NFS_V4_1) struct svc_serv *bc_serv; /* The RPC service which will */ /* process the callback */ @@ -231,6 +249,7 @@ struct xprt_create { struct sockaddr * srcaddr; /* optional local address */ struct sockaddr * dstaddr; /* remote peer address */ size_t addrlen; + struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ }; struct xprt_class { diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 54a379c9e8e..c2f04e1ae15 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -41,11 +41,6 @@ #define _LINUX_SUNRPC_XPRTRDMA_H /* - * RPC transport identifier for RDMA - */ -#define XPRT_TRANSPORT_RDMA 256 - -/* * rpcbind (v3+) RDMA netid. */ #define RPCBIND_NETID_RDMA "rdma" diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index c2a46c45c8f..3f14a02e9cc 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -13,17 +13,6 @@ int init_socket_xprt(void); void cleanup_socket_xprt(void); /* - * RPC transport identifiers for UDP, TCP - * - * To preserve compatibility with the historical use of raw IP protocol - * id's for transport selection, these are specified with the previous - * values. No such restriction exists for new transports, except that - * they may not collide with these values (17 and 6, respectively). - */ -#define XPRT_TRANSPORT_UDP IPPROTO_UDP -#define XPRT_TRANSPORT_TCP IPPROTO_TCP - -/* * RPC slot table sizes for UDP, TCP transports */ extern unsigned int xprt_udp_slot_table_entries; diff --git a/include/linux/swap.h b/include/linux/swap.h index 7c15334f3ff..6c990e658f4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -419,10 +419,22 @@ static inline swp_entry_t get_swap_page(void) } /* linux/mm/thrash.c */ -#define put_swap_token(mm) do { } while (0) -#define grab_swap_token(mm) do { } while (0) -#define has_swap_token(mm) 0 -#define disable_swap_token() do { } while (0) +static inline void put_swap_token(struct mm_struct *mm) +{ +} + +static inline void grab_swap_token(struct mm_struct *mm) +{ +} + +static inline int has_swap_token(struct mm_struct *mm) +{ + return 0; +} + +static inline void disable_swap_token(void) +{ +} static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) diff --git a/include/linux/usb.h b/include/linux/usb.h index a8fe05f224e..19fabc487be 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1071,7 +1071,7 @@ typedef void (*usb_complete_t)(struct urb *); * @start_frame: Returns the initial frame for isochronous transfers. * @number_of_packets: Lists the number of ISO transfer buffers. * @interval: Specifies the polling interval for interrupt or isochronous - * transfers. The units are frames (milliseconds) for for full and low + * transfers. The units are frames (milliseconds) for full and low * speed devices, and microframes (1/8 millisecond) for highspeed ones. * @error_count: Returns the number of ISO transfers that reported errors. * @context: For use in completion functions. This normally points to diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 81a97cf8f0a..2d0f222388a 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -166,15 +166,8 @@ static inline unsigned long zone_page_state(struct zone *zone, return x; } -extern unsigned long global_lru_pages(void); - -static inline unsigned long zone_lru_pages(struct zone *zone) -{ - return (zone_page_state(zone, NR_ACTIVE_ANON) - + zone_page_state(zone, NR_ACTIVE_FILE) - + zone_page_state(zone, NR_INACTIVE_ANON) - + zone_page_state(zone, NR_INACTIVE_FILE)); -} +extern unsigned long global_reclaimable_pages(void); +extern unsigned long zone_reclaimable_pages(struct zone *zone); #ifdef CONFIG_NUMA /* @@ -210,11 +203,6 @@ extern void zone_statistics(struct zone *, struct zone *); #endif /* CONFIG_NUMA */ -#define __add_zone_page_state(__z, __i, __d) \ - __mod_zone_page_state(__z, __i, __d) -#define __sub_zone_page_state(__z, __i, __d) \ - __mod_zone_page_state(__z, __i,-(__d)) - #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 6273fa97b52..7ef0c7b94f3 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -94,7 +94,7 @@ struct execute_work { /* * initialize all of a work item in one go * - * NOTE! No point in using "atomic_long_set()": useing a direct + * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. */ diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index 93885830430..c8f94e8db69 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -482,7 +482,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, * message. * @cm_id: Connection identifier associated with the connection message. * @service_timeout: The lower 5-bits specify the maximum time required for - * the sender to reply to to the connection message. The upper 3-bits + * the sender to reply to the connection message. The upper 3-bits * specify additional control flags. * @private_data: Optional user-defined private data sent with the * message receipt acknowledgement. diff --git a/include/scsi/fc/fc_fc2.h b/include/scsi/fc/fc_fc2.h index cff8a8c22f5..f87777d0d5b 100644 --- a/include/scsi/fc/fc_fc2.h +++ b/include/scsi/fc/fc_fc2.h @@ -92,8 +92,7 @@ struct fc_esb { __u8 _esb_resvd[4]; __u8 esb_service_params[112]; /* TBD */ __u8 esb_seq_status[8]; /* sequence statuses, 8 bytes each */ -} __attribute__((packed));; - +} __attribute__((packed)); /* * Define expected size for ASSERTs. diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 1493c541f9c..eaf46bdd18a 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -225,6 +225,169 @@ TRACE_EVENT(kmem_cache_free, TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) ); + +TRACE_EVENT(mm_page_free_direct, + + TP_PROTO(struct page *page, unsigned int order), + + TP_ARGS(page, order), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( unsigned int, order ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->order = order; + ), + + TP_printk("page=%p pfn=%lu order=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->order) +); + +TRACE_EVENT(mm_pagevec_free, + + TP_PROTO(struct page *page, int cold), + + TP_ARGS(page, cold), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( int, cold ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->cold = cold; + ), + + TP_printk("page=%p pfn=%lu order=0 cold=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->cold) +); + +TRACE_EVENT(mm_page_alloc, + + TP_PROTO(struct page *page, unsigned int order, + gfp_t gfp_flags, int migratetype), + + TP_ARGS(page, order, gfp_flags, migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( unsigned int, order ) + __field( gfp_t, gfp_flags ) + __field( int, migratetype ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->order = order; + __entry->gfp_flags = gfp_flags; + __entry->migratetype = migratetype; + ), + + TP_printk("page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s", + __entry->page, + page_to_pfn(__entry->page), + __entry->order, + __entry->migratetype, + show_gfp_flags(__entry->gfp_flags)) +); + +TRACE_EVENT(mm_page_alloc_zone_locked, + + TP_PROTO(struct page *page, unsigned int order, int migratetype), + + TP_ARGS(page, order, migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( unsigned int, order ) + __field( int, migratetype ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->order = order; + __entry->migratetype = migratetype; + ), + + TP_printk("page=%p pfn=%lu order=%u migratetype=%d percpu_refill=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->order, + __entry->migratetype, + __entry->order == 0) +); + +TRACE_EVENT(mm_page_pcpu_drain, + + TP_PROTO(struct page *page, int order, int migratetype), + + TP_ARGS(page, order, migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( int, order ) + __field( int, migratetype ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->order = order; + __entry->migratetype = migratetype; + ), + + TP_printk("page=%p pfn=%lu order=%d migratetype=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->order, + __entry->migratetype) +); + +TRACE_EVENT(mm_page_alloc_extfrag, + + TP_PROTO(struct page *page, + int alloc_order, int fallback_order, + int alloc_migratetype, int fallback_migratetype), + + TP_ARGS(page, + alloc_order, fallback_order, + alloc_migratetype, fallback_migratetype), + + TP_STRUCT__entry( + __field( struct page *, page ) + __field( int, alloc_order ) + __field( int, fallback_order ) + __field( int, alloc_migratetype ) + __field( int, fallback_migratetype ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->alloc_order = alloc_order; + __entry->fallback_order = fallback_order; + __entry->alloc_migratetype = alloc_migratetype; + __entry->fallback_migratetype = fallback_migratetype; + ), + + TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", + __entry->page, + page_to_pfn(__entry->page), + __entry->alloc_order, + __entry->fallback_order, + pageblock_order, + __entry->alloc_migratetype, + __entry->fallback_migratetype, + __entry->fallback_order < pageblock_order, + __entry->alloc_migratetype == __entry->fallback_migratetype) +); + #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/init/main.c b/init/main.c index 34971becbd3..2c48c315316 100644 --- a/init/main.c +++ b/init/main.c @@ -668,12 +668,12 @@ asmlinkage void __init start_kernel(void) #endif thread_info_cache_init(); cred_init(); - fork_init(num_physpages); + fork_init(totalram_pages); proc_caches_init(); buffer_init(); key_init(); security_init(); - vfs_caches_init(num_physpages); + vfs_caches_init(totalram_pages); radix_tree_init(); signals_init(); /* rootfs populating might need page-writeback */ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c5e68adc673..ee9d69707c0 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -77,7 +77,7 @@ struct mqueue_inode_info { static const struct inode_operations mqueue_dir_inode_operations; static const struct file_operations mqueue_file_operations; -static struct super_operations mqueue_super_ops; +static const struct super_operations mqueue_super_ops; static void remove_notification(struct mqueue_inode_info *info); static struct kmem_cache *mqueue_inode_cachep; @@ -1224,7 +1224,7 @@ static const struct file_operations mqueue_file_operations = { .read = mqueue_read_file, }; -static struct super_operations mqueue_super_ops = { +static const struct super_operations mqueue_super_ops = { .alloc_inode = mqueue_alloc_inode, .destroy_inode = mqueue_destroy_inode, .statfs = simple_statfs, diff --git a/ipc/shm.c b/ipc/shm.c index 30162a59621..9eb1488b543 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -370,7 +370,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; file = hugetlb_file_setup(name, size, acctflag, - &shp->mlock_user); + &shp->mlock_user, HUGETLB_SHMFS_INODE); } else { /* * Do not allow no accounting for OVERCOMMIT_NEVER, even diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c7ece8f027f..213b7f92fcd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -596,7 +596,7 @@ void cgroup_unlock(void) static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); -static struct inode_operations cgroup_dir_inode_operations; +static const struct inode_operations cgroup_dir_inode_operations; static struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { @@ -961,7 +961,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) return ret; } -static struct super_operations cgroup_ops = { +static const struct super_operations cgroup_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = cgroup_show_options, @@ -1711,7 +1711,7 @@ static struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; -static struct inode_operations cgroup_dir_inode_operations = { +static const struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, diff --git a/kernel/fork.c b/kernel/fork.c index 2cebfb23b0b..1020977b57c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -49,6 +49,7 @@ #include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> +#include <linux/ksm.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> @@ -136,9 +137,17 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +static void account_kernel_stack(struct thread_info *ti, int account) +{ + struct zone *zone = page_zone(virt_to_page(ti)); + + mod_zone_page_state(zone, NR_KERNEL_STACK, account); +} + void free_task(struct task_struct *tsk) { prop_local_destroy_single(&tsk->dirties); + account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -253,6 +262,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + + account_kernel_stack(ti, 1); + return tsk; out: @@ -288,6 +300,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -424,7 +439,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->flags = (current->mm) ? + (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); @@ -485,6 +501,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); + ksm_exit(mm); exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -863,6 +880,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); + sig->oom_adj = current->signal->oom_adj; + return 0; } diff --git a/kernel/module.c b/kernel/module.c index b6ee424245d..e6bc4b28aa6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,6 +47,7 @@ #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/mmu_context.h> #include <linux/license.h> #include <asm/sections.h> #include <linux/tracepoint.h> @@ -1535,6 +1536,10 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_free(mod, mod->module_core); + +#ifdef CONFIG_MPU + update_protections(current->mm); +#endif } void *__symbol_get(const char *symbol) diff --git a/kernel/panic.c b/kernel/panic.c index 512ab73b0ca..bcdef26e333 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -177,7 +177,7 @@ static const struct tnt tnts[] = { * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. * - * The string is overwritten by the next call to print_taint(). + * The string is overwritten by the next call to print_tainted(). */ const char *print_tainted(void) { diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5d3f5..d3f722d20f9 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -40,7 +40,7 @@ #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) static struct hlist_head *pid_hash; -static int pidhash_shift; +static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) void __init pidhash_init(void) { int i, pidhash_size; - unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); - pidhash_shift = max(4, fls(megabytes * 4)); - pidhash_shift = min(12, pidhash_shift); + pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, + HASH_EARLY | HASH_SMALL, + &pidhash_shift, NULL, 4096); pidhash_size = 1 << pidhash_shift; - printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", - pidhash_size, pidhash_shift, - pidhash_size * sizeof(struct hlist_head)); - - pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); - if (!pid_hash) - panic("Could not alloc pidhash!\n"); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } diff --git a/kernel/power/process.c b/kernel/power/process.c index da2072d7381..cc2e55373b6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -9,6 +9,7 @@ #undef DEBUG #include <linux/interrupt.h> +#include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 97955b0e44f..36cb168e433 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + region = alloc_bootmem(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/sched.c b/kernel/sched.c index 91843ba7f23..0ac9053c21d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2904,6 +2904,19 @@ unsigned long nr_iowait(void) return sum; } +unsigned long nr_iowait_cpu(void) +{ + struct rq *this = this_rq(); + return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e7163460440..b416512ad17 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options options. +# hidding of the automatic options. config TRACING bool diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ca7d7c4d0c2..23b63859130 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_PARTIAL_LINE; } return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9489a0a9b1b..cc89be5bc0f 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : - * It is used to to delay the free of multiple probes array until a quiescent + * It is used to delay the free of multiple probes array until a quiescent * state is reached. * Tracepoint entries modifications are protected by the tracepoints_mutex. */ diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck index 603c81b6654..846e039a86b 100644 --- a/lib/Kconfig.kmemcheck +++ b/lib/Kconfig.kmemcheck @@ -1,6 +1,8 @@ config HAVE_ARCH_KMEMCHECK bool +if HAVE_ARCH_KMEMCHECK + menuconfig KMEMCHECK bool "kmemcheck: trap use of uninitialized memory" depends on DEBUG_KERNEL @@ -89,3 +91,4 @@ config KMEMCHECK_BITOPS_OK accesses where not all the bits are initialized at the same time. This may also hide some real bugs. +endif diff --git a/lib/flex_array.c b/lib/flex_array.c index 7baed2fc3bc..66eef2e4483 100644 --- a/lib/flex_array.c +++ b/lib/flex_array.c @@ -28,23 +28,6 @@ struct flex_array_part { char elements[FLEX_ARRAY_PART_SIZE]; }; -static inline int __elements_per_part(int element_size) -{ - return FLEX_ARRAY_PART_SIZE / element_size; -} - -static inline int bytes_left_in_base(void) -{ - int element_offset = offsetof(struct flex_array, parts); - int bytes_left = FLEX_ARRAY_BASE_SIZE - element_offset; - return bytes_left; -} - -static inline int nr_base_part_ptrs(void) -{ - return bytes_left_in_base() / sizeof(struct flex_array_part *); -} - /* * If a user requests an allocation which is small * enough, we may simply use the space in the @@ -54,7 +37,7 @@ static inline int nr_base_part_ptrs(void) static inline int elements_fit_in_base(struct flex_array *fa) { int data_size = fa->element_size * fa->total_nr_elements; - if (data_size <= bytes_left_in_base()) + if (data_size <= FLEX_ARRAY_BASE_BYTES_LEFT) return 1; return 0; } @@ -63,6 +46,7 @@ static inline int elements_fit_in_base(struct flex_array *fa) * flex_array_alloc - allocate a new flexible array * @element_size: the size of individual elements in the array * @total: total number of elements that this should hold + * @flags: page allocation flags to use for base array * * Note: all locking must be provided by the caller. * @@ -103,7 +87,8 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total, gfp_t flags) { struct flex_array *ret; - int max_size = nr_base_part_ptrs() * __elements_per_part(element_size); + int max_size = FLEX_ARRAY_NR_BASE_PTRS * + FLEX_ARRAY_ELEMENTS_PER_PART(element_size); /* max_size will end up 0 if element_size > PAGE_SIZE */ if (total > max_size) @@ -113,17 +98,21 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total, return NULL; ret->element_size = element_size; ret->total_nr_elements = total; + if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO)) + memset(ret->parts[0], FLEX_ARRAY_FREE, + FLEX_ARRAY_BASE_BYTES_LEFT); return ret; } static int fa_element_to_part_nr(struct flex_array *fa, unsigned int element_nr) { - return element_nr / __elements_per_part(fa->element_size); + return element_nr / FLEX_ARRAY_ELEMENTS_PER_PART(fa->element_size); } /** * flex_array_free_parts - just free the second-level pages + * @fa: the flex array from which to free parts * * This is to be used in cases where the base 'struct flex_array' * has been statically allocated and should not be free. @@ -131,11 +120,10 @@ static int fa_element_to_part_nr(struct flex_array *fa, void flex_array_free_parts(struct flex_array *fa) { int part_nr; - int max_part = nr_base_part_ptrs(); if (elements_fit_in_base(fa)) return; - for (part_nr = 0; part_nr < max_part; part_nr++) + for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) kfree(fa->parts[part_nr]); } @@ -150,7 +138,8 @@ static unsigned int index_inside_part(struct flex_array *fa, { unsigned int part_offset; - part_offset = element_nr % __elements_per_part(fa->element_size); + part_offset = element_nr % + FLEX_ARRAY_ELEMENTS_PER_PART(fa->element_size); return part_offset * fa->element_size; } @@ -159,15 +148,12 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) { struct flex_array_part *part = fa->parts[part_nr]; if (!part) { - /* - * This leaves the part pages uninitialized - * and with potentially random data, just - * as if the user had kmalloc()'d the whole. - * __GFP_ZERO can be used to zero it. - */ - part = kmalloc(FLEX_ARRAY_PART_SIZE, flags); + part = kmalloc(sizeof(struct flex_array_part), flags); if (!part) return NULL; + if (!(flags & __GFP_ZERO)) + memset(part, FLEX_ARRAY_FREE, + sizeof(struct flex_array_part)); fa->parts[part_nr] = part; } return part; @@ -175,9 +161,12 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) /** * flex_array_put - copy data into the array at @element_nr - * @src: address of data to copy into the array + * @fa: the flex array to copy data into * @element_nr: index of the position in which to insert * the new element. + * @src: address of data to copy into the array + * @flags: page allocation flags to use for array expansion + * * * Note that this *copies* the contents of @src into * the array. If you are trying to store an array of @@ -207,9 +196,38 @@ int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, } /** + * flex_array_clear - clear element in array at @element_nr + * @fa: the flex array of the element. + * @element_nr: index of the position to clear. + * + * Locking must be provided by the caller. + */ +int flex_array_clear(struct flex_array *fa, unsigned int element_nr) +{ + int part_nr = fa_element_to_part_nr(fa, element_nr); + struct flex_array_part *part; + void *dst; + + if (element_nr >= fa->total_nr_elements) + return -ENOSPC; + if (elements_fit_in_base(fa)) + part = (struct flex_array_part *)&fa->parts[0]; + else { + part = fa->parts[part_nr]; + if (!part) + return -EINVAL; + } + dst = &part->elements[index_inside_part(fa, element_nr)]; + memset(dst, FLEX_ARRAY_FREE, fa->element_size); + return 0; +} + +/** * flex_array_prealloc - guarantee that array space exists + * @fa: the flex array for which to preallocate parts * @start: index of first array element for which space is allocated * @end: index of last (inclusive) element for which space is allocated + * @flags: page allocation flags * * This will guarantee that no future calls to flex_array_put() * will allocate memory. It can be used if you are expecting to @@ -242,6 +260,7 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start, /** * flex_array_get - pull data back out of the array + * @fa: the flex array from which to extract data * @element_nr: index of the element to fetch from the array * * Returns a pointer to the data at index @element_nr. Note @@ -266,3 +285,43 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr) } return &part->elements[index_inside_part(fa, element_nr)]; } + +static int part_is_free(struct flex_array_part *part) +{ + int i; + + for (i = 0; i < sizeof(struct flex_array_part); i++) + if (part->elements[i] != FLEX_ARRAY_FREE) + return 0; + return 1; +} + +/** + * flex_array_shrink - free unused second-level pages + * @fa: the flex array to shrink + * + * Frees all second-level pages that consist solely of unused + * elements. Returns the number of pages freed. + * + * Locking must be provided by the caller. + */ +int flex_array_shrink(struct flex_array *fa) +{ + struct flex_array_part *part; + int part_nr; + int ret = 0; + + if (elements_fit_in_base(fa)) + return ret; + for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) { + part = fa->parts[part_nr]; + if (!part) + continue; + if (part_is_free(part)) { + fa->parts[part_nr] = NULL; + kfree(part); + ret++; + } + } + return ret; +} diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d320c1816a7..73a14b8c6d1 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1092,13 +1092,8 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) /* Reject out-of-range values early. Large positive sizes are used for unknown buffer sizes. */ - if (unlikely((int) size < 0)) { - /* There can be only one.. */ - static char warn = 1; - WARN_ON(warn); - warn = 0; + if (WARN_ON_ONCE((int) size < 0)) return 0; - } str = buf; end = buf + size; @@ -1544,13 +1539,8 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) struct printf_spec spec = {0}; - if (unlikely((int) size < 0)) { - /* There can be only one.. */ - static char warn = 1; - WARN_ON(warn); - warn = 0; + if (WARN_ON_ONCE((int) size < 0)) return 0; - } str = buf; end = buf + size; diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c index c3e4a2baf83..46a31e5f49c 100644 --- a/lib/zlib_deflate/deflate.c +++ b/lib/zlib_deflate/deflate.c @@ -135,7 +135,7 @@ static const config configuration_table[10] = { /* =========================================================================== * Update a hash value with the given input byte - * IN assertion: all calls to to UPDATE_HASH are made with consecutive + * IN assertion: all calls to UPDATE_HASH are made with consecutive * input characters, so that a running hash key can be computed from the * previous key instead of complete recalculation each time. */ @@ -146,7 +146,7 @@ static const config configuration_table[10] = { * Insert string str in the dictionary and set match_head to the previous head * of the hash chain (the most recent string with same hash key). Return * the previous length of the hash chain. - * IN assertion: all calls to to INSERT_STRING are made with consecutive + * IN assertion: all calls to INSERT_STRING are made with consecutive * input characters and the first MIN_MATCH bytes of str are valid * (except for the last MIN_MATCH-1 bytes of the input file). */ diff --git a/mm/Kconfig b/mm/Kconfig index 3aa519f52e1..71eb0b4cce8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -214,6 +214,18 @@ config HAVE_MLOCKED_PAGE_BIT config MMU_NOTIFIER bool +config KSM + bool "Enable KSM for page merging" + depends on MMU + help + Enable Kernel Samepage Merging: KSM periodically scans those areas + of an application's address space that an app has advised may be + mergeable. When it finds pages of identical content, it replaces + the many instances by a single resident page with that content, so + saving memory until one or another app needs to modify the content. + Recommended for use with KVM, or with other duplicative applications. + See Documentation/vm/ksm.txt for more information. + config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" default 4096 diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index aa99fd1f710..af7cfb43d2f 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types - of memory corruptions. + of memory corruption. config WANT_PAGE_DEBUG_FLAGS bool @@ -17,11 +17,11 @@ config PAGE_POISONING depends on !HIBERNATION select DEBUG_PAGEALLOC select WANT_PAGE_DEBUG_FLAGS - help + ---help--- Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages(). This results in a large slowdown, - but helps to find certain types of memory corruptions. + but helps to find certain types of memory corruption. - This option cannot enalbe with hibernation. Otherwise, it will get - wrong messages for memory corruption because the free pages are not - saved to the suspend image. + This option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index ea4b18bd396..728a9fde49d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o $(mmu-y) + page_isolation.o mm_init.o mmu_context.o $(mmu-y) obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o @@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o diff --git a/mm/filemap.c b/mm/filemap.c index dd51c68e2b8..bcc7372aebb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -119,6 +119,8 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); /* @@ -431,6 +433,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b16d6363477..815dbd4a6dc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(struct hstate *h) -{ - int nid; - struct page *page = NULL; - - for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - break; - } - } - return page; -} - static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) @@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) /* * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: + * copy it back to next_nid_to_alloc afterwards: * otherwise there's a window in which a racer might * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really @@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ -static int hstate_next_node(struct hstate *h) +static int hstate_next_node_to_alloc(struct hstate *h) { int next_nid; - next_nid = next_node(h->hugetlb_next_nid, node_online_map); + next_nid = next_node(h->next_nid_to_alloc, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; + h->next_nid_to_alloc = next_nid; return next_nid; } @@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h) int next_nid; int ret = 0; - start_nid = h->hugetlb_next_nid; + start_nid = h->next_nid_to_alloc; + next_nid = start_nid; do { - page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, next_nid); if (page) ret = 1; - next_nid = hstate_next_node(h); - } while (!page && h->hugetlb_next_nid != start_nid); + next_nid = hstate_next_node_to_alloc(h); + } while (!page && next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h) return ret; } +/* + * helper for free_pool_huge_page() - find next node + * from which to free a huge page + */ +static int hstate_next_node_to_free(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->next_nid_to_free, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->next_nid_to_free = next_nid; + return next_nid; +} + +/* + * Free huge page from pool from next node to free. + * Attempt to keep persistent huge pages more or less + * balanced over allowed nodes. + * Called with hugetlb_lock locked. + */ +static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +{ + int start_nid; + int next_nid; + int ret = 0; + + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + /* + * If we're returning unused surplus pages, only examine + * nodes with surplus pages. + */ + if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && + !list_empty(&h->hugepage_freelists[next_nid])) { + struct page *page = + list_entry(h->hugepage_freelists[next_nid].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[next_nid]--; + if (acct_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[next_nid]--; + } + update_and_free_page(h, page); + ret = 1; + } + next_nid = hstate_next_node_to_free(h); + } while (!ret && next_nid != start_nid); + + return ret; +} + static struct page *alloc_buddy_huge_page(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { @@ -855,22 +893,13 @@ free: * When releasing a hugetlb pool reservation, any surplus pages that were * allocated to satisfy the reservation must be explicitly freed if they were * never used. + * Called with hugetlb_lock held. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { - static int nid = -1; - struct page *page; unsigned long nr_pages; - /* - * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - */ - unsigned long remaining_iterations = nr_online_nodes; - /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h, nr_pages = min(unused_resv_pages, h->surplus_huge_pages); - while (remaining_iterations-- && nr_pages) { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - if (!h->surplus_huge_pages_node[nid]) - continue; - - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; - nr_pages--; - remaining_iterations = nr_online_nodes; - } + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes. Iterate across all nodes until we + * can no longer free unreserved surplus pages. This occurs when + * the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the frees across the + * on-line nodes for us and will handle the hstate accounting. + */ + while (nr_pages--) { + if (!free_pool_huge_page(h, 1)) + break; } } @@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(h->hugetlb_next_nid), + NODE_DATA(h->next_nid_to_alloc), huge_page_size(h), huge_page_size(h), 0); + hstate_next_node_to_alloc(h); if (addr) { /* * Use the beginning of the huge page to store the @@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - hstate_next_node(h); nr_nodes--; } return 0; @@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) */ static int adjust_pool_surplus(struct hstate *h, int delta) { - static int prev_nid; - int nid = prev_nid; + int start_nid, next_nid; int ret = 0; VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= + if (delta < 0) + start_nid = h->next_nid_to_alloc; + else + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + int nid = next_nid; + if (delta < 0) { + next_nid = hstate_next_node_to_alloc(h); + /* + * To shrink on this node, there must be a surplus page + */ + if (!h->surplus_huge_pages_node[nid]) + continue; + } + if (delta > 0) { + next_nid = hstate_next_node_to_free(h); + /* + * Surplus cannot exceed the total number of pages + */ + if (h->surplus_huge_pages_node[nid] >= h->nr_huge_pages_node[nid]) - continue; + continue; + } h->surplus_huge_pages += delta; h->surplus_huge_pages_node[nid] += delta; ret = 1; break; - } while (nid != prev_nid); + } while (next_nid != start_nid); - prev_nid = nid; return ret; } @@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) min_count = max(count, min_count); try_to_free_low(h, min_count); while (min_count < persistent_huge_pages(h)) { - struct page *page = dequeue_huge_page(h); - if (!page) + if (!free_pool_huge_page(h, 0)) break; - update_and_free_page(h, page); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, 1)) @@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->hugetlb_next_nid = first_node(node_online_map); + h->next_nid_to_alloc = first_node(node_online_map); + h->next_nid_to_free = first_node(node_online_map); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, return find_lock_page(mapping, idx); } +/* + * Return whether there is a pagecache page to back given address within VMA. + * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. + */ +static bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + struct page *page; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + page = find_get_page(mapping, idx); + if (page) + put_page(page); + return page != NULL; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags) { @@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } -static int huge_zeropage_ok(pte_t *ptep, int write, int shared) -{ - if (!ptep || write || shared) - return 0; - else - return huge_pte_none(huge_ptep_get(ptep)); -} - int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, - int write) + unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); - int zeropage_ok = 0; - int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + int absent; struct page *page; /* * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make * sure we get the + * each hugepage. We have to make sure we get the * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); - if (huge_zeropage_ok(pte, write, shared)) - zeropage_ok = 1; + absent = !pte || huge_pte_none(huge_ptep_get(pte)); + + /* + * When coredumping, it suits get_dump_page if we just return + * an error where there's an empty slot with no huge pagecache + * to back it. This way, we avoid allocating a hugepage, and + * the sparse dumpfile avoids allocating disk blocks, but its + * huge holes still show up with zeroes where they need to be. + */ + if (absent && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, vaddr)) { + remainder = 0; + break; + } - if (!pte || - (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || - (write && !pte_write(huge_ptep_get(pte)))) { + if (absent || + ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); - ret = hugetlb_fault(mm, vma, vaddr, write); + ret = hugetlb_fault(mm, vma, vaddr, + (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; - if (!i) - i = -EFAULT; break; } @@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - if (zeropage_ok) - pages[i] = ZERO_PAGE(0); - else - pages[i] = mem_map_offset(page, pfn_offset); + pages[i] = mem_map_offset(page, pfn_offset); get_page(pages[i]); } @@ -2262,7 +2311,7 @@ same_page: *length = remainder; *position = vaddr; - return i; + return i ? i : -EFAULT; } void hugetlb_change_protection(struct vm_area_struct *vma, diff --git a/mm/internal.h b/mm/internal.h index f290c4db528..22ec8d2b0fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,8 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +extern unsigned long highest_memmap_pfn; + /* * in mm/vmscan.c: */ @@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); @@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -#define GUP_FLAGS_WRITE 0x1 -#define GUP_FLAGS_FORCE 0x2 -#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 -#define GUP_FLAGS_IGNORE_SIGKILL 0x8 - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, + unsigned long start, int len, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas); #define ZONE_RECLAIM_NOSCAN -2 diff --git a/mm/ksm.c b/mm/ksm.c new file mode 100644 index 00000000000..37cc3732509 --- /dev/null +++ b/mm/ksm.c @@ -0,0 +1,1703 @@ +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork() + * + * Copyright (C) 2008-2009 Red Hat, Inc. + * Authors: + * Izik Eidus + * Andrea Arcangeli + * Chris Wright + * Hugh Dickins + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/spinlock.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/wait.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/mmu_notifier.h> +#include <linux/ksm.h> + +#include <asm/tlbflush.h> + +/* + * A few notes about the KSM scanning process, + * to make it easier to understand the data structures below: + * + * In order to reduce excessive scanning, KSM sorts the memory pages by their + * contents into a data structure that holds pointers to the pages' locations. + * + * Since the contents of the pages may change at any moment, KSM cannot just + * insert the pages into a normal sorted tree and expect it to find anything. + * Therefore KSM uses two data structures - the stable and the unstable tree. + * + * The stable tree holds pointers to all the merged pages (ksm pages), sorted + * by their contents. Because each such page is write-protected, searching on + * this tree is fully assured to be working (except when pages are unmapped), + * and therefore this tree is called the stable tree. + * + * In addition to the stable tree, KSM uses a second data structure called the + * unstable tree: this tree holds pointers to pages which have been found to + * be "unchanged for a period of time". The unstable tree sorts these pages + * by their contents, but since they are not write-protected, KSM cannot rely + * upon the unstable tree to work correctly - the unstable tree is liable to + * be corrupted as its contents are modified, and so it is called unstable. + * + * KSM solves this problem by several techniques: + * + * 1) The unstable tree is flushed every time KSM completes scanning all + * memory areas, and then the tree is rebuilt again from the beginning. + * 2) KSM will only insert into the unstable tree, pages whose hash value + * has not changed since the previous scan of all memory areas. + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the + * colors of the nodes and not on their contents, assuring that even when + * the tree gets "corrupted" it won't get out of balance, so scanning time + * remains the same (also, searching and inserting nodes in an rbtree uses + * the same algorithm, so we have no overhead when we flush and rebuild). + * 4) KSM never flushes the stable tree, which means that even if it were to + * take 10 attempts to find a page in the unstable tree, once it is found, + * it is secured in the stable tree. (When we scan a new page, we first + * compare it against the stable tree, and then against the unstable tree.) + */ + +/** + * struct mm_slot - ksm information per mm that is being scanned + * @link: link to the mm_slots hash list + * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * @rmap_list: head for this mm_slot's list of rmap_items + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node link; + struct list_head mm_list; + struct list_head rmap_list; + struct mm_struct *mm; +}; + +/** + * struct ksm_scan - cursor for scanning + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * @rmap_item: the current rmap that we are scanning inside the rmap_list + * @seqnr: count of completed full scans (needed when removing unstable node) + * + * There is only the one ksm_scan instance of this cursor structure. + */ +struct ksm_scan { + struct mm_slot *mm_slot; + unsigned long address; + struct rmap_item *rmap_item; + unsigned long seqnr; +}; + +/** + * struct rmap_item - reverse mapping item for virtual addresses + * @link: link into mm_slot's rmap_list (rmap_list is per mm) + * @mm: the memory structure this rmap_item is pointing into + * @address: the virtual address this rmap_item tracks (+ flags in low bits) + * @oldchecksum: previous checksum of the page at that virtual address + * @node: rb_node of this rmap_item in either unstable or stable tree + * @next: next rmap_item hanging off the same node of the stable tree + * @prev: previous rmap_item hanging off the same node of the stable tree + */ +struct rmap_item { + struct list_head link; + struct mm_struct *mm; + unsigned long address; /* + low bits used for flags below */ + union { + unsigned int oldchecksum; /* when unstable */ + struct rmap_item *next; /* when stable */ + }; + union { + struct rb_node node; /* when tree node */ + struct rmap_item *prev; /* in stable list */ + }; +}; + +#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ +#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ +#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ + +/* The stable and unstable tree heads */ +static struct rb_root root_stable_tree = RB_ROOT; +static struct rb_root root_unstable_tree = RB_ROOT; + +#define MM_SLOTS_HASH_HEADS 1024 +static struct hlist_head *mm_slots_hash; + +static struct mm_slot ksm_mm_head = { + .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), +}; +static struct ksm_scan ksm_scan = { + .mm_slot = &ksm_mm_head, +}; + +static struct kmem_cache *rmap_item_cache; +static struct kmem_cache *mm_slot_cache; + +/* The number of nodes in the stable tree */ +static unsigned long ksm_pages_shared; + +/* The number of page slots additionally sharing those nodes */ +static unsigned long ksm_pages_sharing; + +/* The number of nodes in the unstable tree */ +static unsigned long ksm_pages_unshared; + +/* The number of rmap_items in use: to calculate pages_volatile */ +static unsigned long ksm_rmap_items; + +/* Limit on the number of unswappable pages used */ +static unsigned long ksm_max_kernel_pages = 2000; + +/* Number of pages ksmd should scan in one batch */ +static unsigned int ksm_thread_pages_to_scan = 200; + +/* Milliseconds ksmd should sleep between batches */ +static unsigned int ksm_thread_sleep_millisecs = 20; + +#define KSM_RUN_STOP 0 +#define KSM_RUN_MERGE 1 +#define KSM_RUN_UNMERGE 2 +static unsigned int ksm_run = KSM_RUN_MERGE; + +static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); +static DEFINE_MUTEX(ksm_thread_mutex); +static DEFINE_SPINLOCK(ksm_mmlist_lock); + +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ + sizeof(struct __struct), __alignof__(struct __struct),\ + (__flags), NULL) + +static int __init ksm_slab_init(void) +{ + rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); + if (!rmap_item_cache) + goto out; + + mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); + if (!mm_slot_cache) + goto out_free; + + return 0; + +out_free: + kmem_cache_destroy(rmap_item_cache); +out: + return -ENOMEM; +} + +static void __init ksm_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + kmem_cache_destroy(rmap_item_cache); + mm_slot_cache = NULL; +} + +static inline struct rmap_item *alloc_rmap_item(void) +{ + struct rmap_item *rmap_item; + + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + if (rmap_item) + ksm_rmap_items++; + return rmap_item; +} + +static inline void free_rmap_item(struct rmap_item *rmap_item) +{ + ksm_rmap_items--; + rmap_item->mm = NULL; /* debug safety */ + kmem_cache_free(rmap_item_cache, rmap_item); +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); +} + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, link) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + INIT_LIST_HEAD(&mm_slot->rmap_list); + hlist_add_head(&mm_slot->link, bucket); +} + +static inline int in_stable_tree(struct rmap_item *rmap_item) +{ + return rmap_item->address & STABLE_FLAG; +} + +/* + * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's + * page tables after it has passed through ksm_exit() - which, if necessary, + * takes mmap_sem briefly to serialize against them. ksm_exit() does not set + * a special flag: they can just back out as soon as mm_users goes to zero. + * ksm_test_exit() is used throughout to make this test for exit: in some + * places for correctness, in some places just to avoid unnecessary work. + */ +static inline bool ksm_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +/* + * We use break_ksm to break COW on a ksm page: it's a stripped down + * + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * put_page(page); + * + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * in case the application has unmapped and remapped mm,addr meanwhile. + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + */ +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + int ret = 0; + + do { + cond_resched(); + page = follow_page(vma, addr, FOLL_GET); + if (!page) + break; + if (PageKsm(page)) + ret = handle_mm_fault(vma->vm_mm, vma, addr, + FAULT_FLAG_WRITE); + else + ret = VM_FAULT_WRITE; + put_page(page); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); + /* + * We must loop because handle_mm_fault() may back out if there's + * any difficulty e.g. if pte accessed bit gets updated concurrently. + * + * VM_FAULT_WRITE is what we have been hoping for: it indicates that + * COW has been broken, even if the vma does not permit VM_WRITE; + * but note that a concurrent fault might break PageKsm for us. + * + * VM_FAULT_SIGBUS could occur if we race with truncation of the + * backing file, which also invalidates anonymous pages: that's + * okay, that truncation will have unmapped the PageKsm for us. + * + * VM_FAULT_OOM: at the time of writing (late July 2009), setting + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the + * current task has TIF_MEMDIE set, and will be OOM killed on return + * to user; and ksmd, having no mm, would never be chosen for that. + * + * But if the mm is in a limited mem_cgroup, then the fault may fail + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and + * even ksmd can fail in this way - though it's usually breaking ksm + * just to undo a merge it made a moment before, so unlikely to oom. + * + * That's a pity: we might therefore have more kernel pages allocated + * than we're counting as nodes in the stable tree; but ksm_do_scan + * will retry to break_cow on each pass, so should recover the page + * in due course. The important thing is to not let VM_MERGEABLE + * be cleared while any such pages might remain in the area. + */ + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; +} + +static void break_cow(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + goto out; + break_ksm(vma, addr); +out: + up_read(&mm->mmap_sem); +} + +static struct page *get_mergeable_page(struct rmap_item *rmap_item) +{ + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; + struct vm_area_struct *vma; + struct page *page; + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + goto out; + + page = follow_page(vma, addr, FOLL_GET); + if (!page) + goto out; + if (PageAnon(page)) { + flush_anon_page(vma, page, addr); + flush_dcache_page(page); + } else { + put_page(page); +out: page = NULL; + } + up_read(&mm->mmap_sem); + return page; +} + +/* + * get_ksm_page: checks if the page at the virtual address in rmap_item + * is still PageKsm, in which case we can trust the content of the page, + * and it returns the gotten page; but NULL if the page has been zapped. + */ +static struct page *get_ksm_page(struct rmap_item *rmap_item) +{ + struct page *page; + + page = get_mergeable_page(rmap_item); + if (page && !PageKsm(page)) { + put_page(page); + page = NULL; + } + return page; +} + +/* + * Removing rmap_item from stable or unstable tree. + * This function will clean the information from the stable/unstable tree. + */ +static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +{ + if (in_stable_tree(rmap_item)) { + struct rmap_item *next_item = rmap_item->next; + + if (rmap_item->address & NODE_FLAG) { + if (next_item) { + rb_replace_node(&rmap_item->node, + &next_item->node, + &root_stable_tree); + next_item->address |= NODE_FLAG; + ksm_pages_sharing--; + } else { + rb_erase(&rmap_item->node, &root_stable_tree); + ksm_pages_shared--; + } + } else { + struct rmap_item *prev_item = rmap_item->prev; + + BUG_ON(prev_item->next != rmap_item); + prev_item->next = next_item; + if (next_item) { + BUG_ON(next_item->prev != rmap_item); + next_item->prev = rmap_item->prev; + } + ksm_pages_sharing--; + } + + rmap_item->next = NULL; + + } else if (rmap_item->address & NODE_FLAG) { + unsigned char age; + /* + * Usually ksmd can and must skip the rb_erase, because + * root_unstable_tree was already reset to RB_ROOT. + * But be careful when an mm is exiting: do the rb_erase + * if this rmap_item was inserted by this scan, rather + * than left over from before. + */ + age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); + BUG_ON(age > 1); + if (!age) + rb_erase(&rmap_item->node, &root_unstable_tree); + ksm_pages_unshared--; + } + + rmap_item->address &= PAGE_MASK; + + cond_resched(); /* we're called from many long loops */ +} + +static void remove_trailing_rmap_items(struct mm_slot *mm_slot, + struct list_head *cur) +{ + struct rmap_item *rmap_item; + + while (cur != &mm_slot->rmap_list) { + rmap_item = list_entry(cur, struct rmap_item, link); + cur = cur->next; + remove_rmap_item_from_tree(rmap_item); + list_del(&rmap_item->link); + free_rmap_item(rmap_item); + } +} + +/* + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. + */ +static int unmerge_ksm_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long addr; + int err = 0; + + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { + if (ksm_test_exit(vma->vm_mm)) + break; + if (signal_pending(current)) + err = -ERESTARTSYS; + else + err = break_ksm(vma, addr); + } + return err; +} + +#ifdef CONFIG_SYSFS +/* + * Only called through the sysfs control interface: + */ +static int unmerge_and_remove_all_rmap_items(void) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int err = 0; + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, + struct mm_slot, mm_list); + spin_unlock(&ksm_mmlist_lock); + + for (mm_slot = ksm_scan.mm_slot; + mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (ksm_test_exit(mm)) + break; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + continue; + err = unmerge_ksm_pages(vma, + vma->vm_start, vma->vm_end); + if (err) + goto error; + } + + remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_test_exit(mm)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } + } + + ksm_scan.seqnr = 0; + return 0; + +error: + up_read(&mm->mmap_sem); + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = &ksm_mm_head; + spin_unlock(&ksm_mmlist_lock); + return err; +} +#endif /* CONFIG_SYSFS */ + +static u32 calc_checksum(struct page *page) +{ + u32 checksum; + void *addr = kmap_atomic(page, KM_USER0); + checksum = jhash2(addr, PAGE_SIZE / 4, 17); + kunmap_atomic(addr, KM_USER0); + return checksum; +} + +static int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1, KM_USER0); + addr2 = kmap_atomic(page2, KM_USER1); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2, KM_USER1); + kunmap_atomic(addr1, KM_USER0); + return ret; +} + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + +static int write_protect_page(struct vm_area_struct *vma, struct page *page, + pte_t *orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr; + pte_t *ptep; + spinlock_t *ptl; + int swapped; + int err = -EFAULT; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + ptep = page_check_address(page, mm, addr, &ptl, 0); + if (!ptep) + goto out; + + if (pte_write(*ptep)) { + pte_t entry; + + swapped = PageSwapCache(page); + flush_cache_page(vma, addr, page_to_pfn(page)); + /* + * Ok this is tricky, when get_user_pages_fast() run it doesnt + * take any lock, therefore the check that we are going to make + * with the pagecount against the mapcount is racey and + * O_DIRECT can happen right after the check. + * So we clear the pte and flush the tlb before the check + * this assure us that no O_DIRECT can happen after the check + * or in the middle of the check. + */ + entry = ptep_clear_flush(vma, addr, ptep); + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { + set_pte_at_notify(mm, addr, ptep, entry); + goto out_unlock; + } + entry = pte_wrprotect(entry); + set_pte_at_notify(mm, addr, ptep, entry); + } + *orig_pte = *ptep; + err = 0; + +out_unlock: + pte_unmap_unlock(ptep, ptl); +out: + return err; +} + +/** + * replace_page - replace page in vma by new ksm page + * @vma: vma that holds the pte pointing to oldpage + * @oldpage: the page we are replacing by newpage + * @newpage: the ksm page we replace oldpage by + * @orig_pte: the original value of the pte + * + * Returns 0 on success, -EFAULT on failure. + */ +static int replace_page(struct vm_area_struct *vma, struct page *oldpage, + struct page *newpage, pte_t orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + unsigned long addr; + pgprot_t prot; + int err = -EFAULT; + + prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); + + addr = page_address_in_vma(oldpage, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_same(*ptep, orig_pte)) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + + get_page(newpage); + page_add_ksm_rmap(newpage); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); + + page_remove_rmap(oldpage); + put_page(oldpage); + + pte_unmap_unlock(ptep, ptl); + err = 0; +out: + return err; +} + +/* + * try_to_merge_one_page - take two pages and merge them into one + * @vma: the vma that hold the pte pointing into oldpage + * @oldpage: the page that we want to replace with newpage + * @newpage: the page that we want to map instead of oldpage + * + * Note: + * oldpage should be a PageAnon page, while newpage should be a PageKsm page, + * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_one_page(struct vm_area_struct *vma, + struct page *oldpage, + struct page *newpage) +{ + pte_t orig_pte = __pte(0); + int err = -EFAULT; + + if (!(vma->vm_flags & VM_MERGEABLE)) + goto out; + + if (!PageAnon(oldpage)) + goto out; + + get_page(newpage); + get_page(oldpage); + + /* + * We need the page lock to read a stable PageSwapCache in + * write_protect_page(). We use trylock_page() instead of + * lock_page() because we don't want to wait here - we + * prefer to continue scanning and merging different pages, + * then come back to this page when it is unlocked. + */ + if (!trylock_page(oldpage)) + goto out_putpage; + /* + * If this anonymous page is mapped only here, its pte may need + * to be write-protected. If it's mapped elsewhere, all of its + * ptes are necessarily already write-protected. But in either + * case, we need to lock and check page_count is not raised. + */ + if (write_protect_page(vma, oldpage, &orig_pte)) { + unlock_page(oldpage); + goto out_putpage; + } + unlock_page(oldpage); + + if (pages_identical(oldpage, newpage)) + err = replace_page(vma, oldpage, newpage, orig_pte); + +out_putpage: + put_page(oldpage); + put_page(newpage); +out: + return err; +} + +/* + * try_to_merge_with_ksm_page - like try_to_merge_two_pages, + * but no new kernel page is allocated: kpage must already be a ksm page. + */ +static int try_to_merge_with_ksm_page(struct mm_struct *mm1, + unsigned long addr1, + struct page *page1, + struct page *kpage) +{ + struct vm_area_struct *vma; + int err = -EFAULT; + + down_read(&mm1->mmap_sem); + if (ksm_test_exit(mm1)) + goto out; + + vma = find_vma(mm1, addr1); + if (!vma || vma->vm_start > addr1) + goto out; + + err = try_to_merge_one_page(vma, page1, kpage); +out: + up_read(&mm1->mmap_sem); + return err; +} + +/* + * try_to_merge_two_pages - take two identical pages and prepare them + * to be merged into one page. + * + * This function returns 0 if we successfully mapped two identical pages + * into one page, -EFAULT otherwise. + * + * Note that this function allocates a new kernel page: if one of the pages + * is already a ksm page, try_to_merge_with_ksm_page should be used. + */ +static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, + struct page *page1, struct mm_struct *mm2, + unsigned long addr2, struct page *page2) +{ + struct vm_area_struct *vma; + struct page *kpage; + int err = -EFAULT; + + /* + * The number of nodes in the stable tree + * is the number of kernel pages that we hold. + */ + if (ksm_max_kernel_pages && + ksm_max_kernel_pages <= ksm_pages_shared) + return err; + + kpage = alloc_page(GFP_HIGHUSER); + if (!kpage) + return err; + + down_read(&mm1->mmap_sem); + if (ksm_test_exit(mm1)) { + up_read(&mm1->mmap_sem); + goto out; + } + vma = find_vma(mm1, addr1); + if (!vma || vma->vm_start > addr1) { + up_read(&mm1->mmap_sem); + goto out; + } + + copy_user_highpage(kpage, page1, addr1, vma); + err = try_to_merge_one_page(vma, page1, kpage); + up_read(&mm1->mmap_sem); + + if (!err) { + err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); + /* + * If that fails, we have a ksm page with only one pte + * pointing to it: so break it. + */ + if (err) + break_cow(mm1, addr1); + } +out: + put_page(kpage); + return err; +} + +/* + * stable_tree_search - search page inside the stable tree + * @page: the page that we are searching identical pages to. + * @page2: pointer into identical page that we are holding inside the stable + * tree that we have found. + * @rmap_item: the reverse mapping item + * + * This function checks if there is a page inside the stable tree + * with identical content to the page that we are scanning right now. + * + * This function return rmap_item pointer to the identical item if found, + * NULL otherwise. + */ +static struct rmap_item *stable_tree_search(struct page *page, + struct page **page2, + struct rmap_item *rmap_item) +{ + struct rb_node *node = root_stable_tree.rb_node; + + while (node) { + struct rmap_item *tree_rmap_item, *next_rmap_item; + int ret; + + tree_rmap_item = rb_entry(node, struct rmap_item, node); + while (tree_rmap_item) { + BUG_ON(!in_stable_tree(tree_rmap_item)); + cond_resched(); + page2[0] = get_ksm_page(tree_rmap_item); + if (page2[0]) + break; + next_rmap_item = tree_rmap_item->next; + remove_rmap_item_from_tree(tree_rmap_item); + tree_rmap_item = next_rmap_item; + } + if (!tree_rmap_item) + return NULL; + + ret = memcmp_pages(page, page2[0]); + + if (ret < 0) { + put_page(page2[0]); + node = node->rb_left; + } else if (ret > 0) { + put_page(page2[0]); + node = node->rb_right; + } else { + return tree_rmap_item; + } + } + + return NULL; +} + +/* + * stable_tree_insert - insert rmap_item pointing to new ksm page + * into the stable tree. + * + * @page: the page that we are searching identical page to inside the stable + * tree. + * @rmap_item: pointer to the reverse mapping item. + * + * This function returns rmap_item if success, NULL otherwise. + */ +static struct rmap_item *stable_tree_insert(struct page *page, + struct rmap_item *rmap_item) +{ + struct rb_node **new = &root_stable_tree.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct rmap_item *tree_rmap_item, *next_rmap_item; + struct page *tree_page; + int ret; + + tree_rmap_item = rb_entry(*new, struct rmap_item, node); + while (tree_rmap_item) { + BUG_ON(!in_stable_tree(tree_rmap_item)); + cond_resched(); + tree_page = get_ksm_page(tree_rmap_item); + if (tree_page) + break; + next_rmap_item = tree_rmap_item->next; + remove_rmap_item_from_tree(tree_rmap_item); + tree_rmap_item = next_rmap_item; + } + if (!tree_rmap_item) + return NULL; + + ret = memcmp_pages(page, tree_page); + put_page(tree_page); + + parent = *new; + if (ret < 0) + new = &parent->rb_left; + else if (ret > 0) + new = &parent->rb_right; + else { + /* + * It is not a bug that stable_tree_search() didn't + * find this node: because at that time our page was + * not yet write-protected, so may have changed since. + */ + return NULL; + } + } + + rmap_item->address |= NODE_FLAG | STABLE_FLAG; + rmap_item->next = NULL; + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &root_stable_tree); + + ksm_pages_shared++; + return rmap_item; +} + +/* + * unstable_tree_search_insert - search and insert items into the unstable tree. + * + * @page: the page that we are going to search for identical page or to insert + * into the unstable tree + * @page2: pointer into identical page that was found inside the unstable tree + * @rmap_item: the reverse mapping item of page + * + * This function searches for a page in the unstable tree identical to the + * page currently being scanned; and if no identical page is found in the + * tree, we insert rmap_item as a new object into the unstable tree. + * + * This function returns pointer to rmap_item found to be identical + * to the currently scanned page, NULL otherwise. + * + * This function does both searching and inserting, because they share + * the same walking algorithm in an rbtree. + */ +static struct rmap_item *unstable_tree_search_insert(struct page *page, + struct page **page2, + struct rmap_item *rmap_item) +{ + struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct rmap_item *tree_rmap_item; + int ret; + + tree_rmap_item = rb_entry(*new, struct rmap_item, node); + page2[0] = get_mergeable_page(tree_rmap_item); + if (!page2[0]) + return NULL; + + /* + * Don't substitute an unswappable ksm page + * just for one good swappable forked page. + */ + if (page == page2[0]) { + put_page(page2[0]); + return NULL; + } + + ret = memcmp_pages(page, page2[0]); + + parent = *new; + if (ret < 0) { + put_page(page2[0]); + new = &parent->rb_left; + } else if (ret > 0) { + put_page(page2[0]); + new = &parent->rb_right; + } else { + return tree_rmap_item; + } + } + + rmap_item->address |= NODE_FLAG; + rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &root_unstable_tree); + + ksm_pages_unshared++; + return NULL; +} + +/* + * stable_tree_append - add another rmap_item to the linked list of + * rmap_items hanging off a given node of the stable tree, all sharing + * the same ksm page. + */ +static void stable_tree_append(struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item) +{ + rmap_item->next = tree_rmap_item->next; + rmap_item->prev = tree_rmap_item; + + if (tree_rmap_item->next) + tree_rmap_item->next->prev = rmap_item; + + tree_rmap_item->next = rmap_item; + rmap_item->address |= STABLE_FLAG; + + ksm_pages_sharing++; +} + +/* + * cmp_and_merge_page - first see if page can be merged into the stable tree; + * if not, compare checksum to previous and if it's the same, see if page can + * be inserted into the unstable tree, or merged with a page already there and + * both transferred to the stable tree. + * + * @page: the page that we are searching identical page to. + * @rmap_item: the reverse mapping into the virtual address of this page + */ +static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) +{ + struct page *page2[1]; + struct rmap_item *tree_rmap_item; + unsigned int checksum; + int err; + + if (in_stable_tree(rmap_item)) + remove_rmap_item_from_tree(rmap_item); + + /* We first start with searching the page inside the stable tree */ + tree_rmap_item = stable_tree_search(page, page2, rmap_item); + if (tree_rmap_item) { + if (page == page2[0]) /* forked */ + err = 0; + else + err = try_to_merge_with_ksm_page(rmap_item->mm, + rmap_item->address, + page, page2[0]); + put_page(page2[0]); + + if (!err) { + /* + * The page was successfully merged: + * add its rmap_item to the stable tree. + */ + stable_tree_append(rmap_item, tree_rmap_item); + } + return; + } + + /* + * A ksm page might have got here by fork, but its other + * references have already been removed from the stable tree. + * Or it might be left over from a break_ksm which failed + * when the mem_cgroup had reached its limit: try again now. + */ + if (PageKsm(page)) + break_cow(rmap_item->mm, rmap_item->address); + + /* + * In case the hash value of the page was changed from the last time we + * have calculated it, this page to be changed frequely, therefore we + * don't want to insert it to the unstable tree, and we don't want to + * waste our time to search if there is something identical to it there. + */ + checksum = calc_checksum(page); + if (rmap_item->oldchecksum != checksum) { + rmap_item->oldchecksum = checksum; + return; + } + + tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); + if (tree_rmap_item) { + err = try_to_merge_two_pages(rmap_item->mm, + rmap_item->address, page, + tree_rmap_item->mm, + tree_rmap_item->address, page2[0]); + /* + * As soon as we merge this page, we want to remove the + * rmap_item of the page we have merged with from the unstable + * tree, and insert it instead as new node in the stable tree. + */ + if (!err) { + rb_erase(&tree_rmap_item->node, &root_unstable_tree); + tree_rmap_item->address &= ~NODE_FLAG; + ksm_pages_unshared--; + + /* + * If we fail to insert the page into the stable tree, + * we will have 2 virtual addresses that are pointing + * to a ksm page left outside the stable tree, + * in which case we need to break_cow on both. + */ + if (stable_tree_insert(page2[0], tree_rmap_item)) + stable_tree_append(rmap_item, tree_rmap_item); + else { + break_cow(tree_rmap_item->mm, + tree_rmap_item->address); + break_cow(rmap_item->mm, rmap_item->address); + } + } + + put_page(page2[0]); + } +} + +static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, + struct list_head *cur, + unsigned long addr) +{ + struct rmap_item *rmap_item; + + while (cur != &mm_slot->rmap_list) { + rmap_item = list_entry(cur, struct rmap_item, link); + if ((rmap_item->address & PAGE_MASK) == addr) { + if (!in_stable_tree(rmap_item)) + remove_rmap_item_from_tree(rmap_item); + return rmap_item; + } + if (rmap_item->address > addr) + break; + cur = cur->next; + remove_rmap_item_from_tree(rmap_item); + list_del(&rmap_item->link); + free_rmap_item(rmap_item); + } + + rmap_item = alloc_rmap_item(); + if (rmap_item) { + /* It has already been zeroed */ + rmap_item->mm = mm_slot->mm; + rmap_item->address = addr; + list_add_tail(&rmap_item->link, cur); + } + return rmap_item; +} + +static struct rmap_item *scan_get_next_rmap_item(struct page **page) +{ + struct mm_struct *mm; + struct mm_slot *slot; + struct vm_area_struct *vma; + struct rmap_item *rmap_item; + + if (list_empty(&ksm_mm_head.mm_list)) + return NULL; + + slot = ksm_scan.mm_slot; + if (slot == &ksm_mm_head) { + root_unstable_tree = RB_ROOT; + + spin_lock(&ksm_mmlist_lock); + slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); + ksm_scan.mm_slot = slot; + spin_unlock(&ksm_mmlist_lock); +next_mm: + ksm_scan.address = 0; + ksm_scan.rmap_item = list_entry(&slot->rmap_list, + struct rmap_item, link); + } + + mm = slot->mm; + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + vma = NULL; + else + vma = find_vma(mm, ksm_scan.address); + + for (; vma; vma = vma->vm_next) { + if (!(vma->vm_flags & VM_MERGEABLE)) + continue; + if (ksm_scan.address < vma->vm_start) + ksm_scan.address = vma->vm_start; + if (!vma->anon_vma) + ksm_scan.address = vma->vm_end; + + while (ksm_scan.address < vma->vm_end) { + if (ksm_test_exit(mm)) + break; + *page = follow_page(vma, ksm_scan.address, FOLL_GET); + if (*page && PageAnon(*page)) { + flush_anon_page(vma, *page, ksm_scan.address); + flush_dcache_page(*page); + rmap_item = get_next_rmap_item(slot, + ksm_scan.rmap_item->link.next, + ksm_scan.address); + if (rmap_item) { + ksm_scan.rmap_item = rmap_item; + ksm_scan.address += PAGE_SIZE; + } else + put_page(*page); + up_read(&mm->mmap_sem); + return rmap_item; + } + if (*page) + put_page(*page); + ksm_scan.address += PAGE_SIZE; + cond_resched(); + } + } + + if (ksm_test_exit(mm)) { + ksm_scan.address = 0; + ksm_scan.rmap_item = list_entry(&slot->rmap_list, + struct rmap_item, link); + } + /* + * Nuke all the rmap_items that are above this current rmap: + * because there were no VM_MERGEABLE vmas with such addresses. + */ + remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_scan.address == 0) { + /* + * We've completed a full scan of all vmas, holding mmap_sem + * throughout, and found no VM_MERGEABLE: so do the same as + * __ksm_exit does to remove this mm from all our lists now. + * This applies either when cleaning up after __ksm_exit + * (but beware: we can reach here even before __ksm_exit), + * or when all VM_MERGEABLE areas have been unmapped (and + * mmap_sem then protects against race with MADV_MERGEABLE). + */ + hlist_del(&slot->link); + list_del(&slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } + + /* Repeat until we've completed scanning the whole list */ + slot = ksm_scan.mm_slot; + if (slot != &ksm_mm_head) + goto next_mm; + + ksm_scan.seqnr++; + return NULL; +} + +/** + * ksm_do_scan - the ksm scanner main worker function. + * @scan_npages - number of pages we want to scan before we return. + */ +static void ksm_do_scan(unsigned int scan_npages) +{ + struct rmap_item *rmap_item; + struct page *page; + + while (scan_npages--) { + cond_resched(); + rmap_item = scan_get_next_rmap_item(&page); + if (!rmap_item) + return; + if (!PageKsm(page) || !in_stable_tree(rmap_item)) + cmp_and_merge_page(page, rmap_item); + else if (page_mapcount(page) == 1) { + /* + * Replace now-unshared ksm page by ordinary page. + */ + break_cow(rmap_item->mm, rmap_item->address); + remove_rmap_item_from_tree(rmap_item); + rmap_item->oldchecksum = calc_checksum(page); + } + put_page(page); + } +} + +static int ksmd_should_run(void) +{ + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); +} + +static int ksm_scan_thread(void *nothing) +{ + set_user_nice(current, 5); + + while (!kthread_should_stop()) { + mutex_lock(&ksm_thread_mutex); + if (ksmd_should_run()) + ksm_do_scan(ksm_thread_pages_to_scan); + mutex_unlock(&ksm_thread_mutex); + + if (ksmd_should_run()) { + schedule_timeout_interruptible( + msecs_to_jiffies(ksm_thread_sleep_millisecs)); + } else { + wait_event_interruptible(ksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); + } + } + return 0; +} + +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + struct mm_struct *mm = vma->vm_mm; + int err; + + switch (advice) { + case MADV_MERGEABLE: + /* + * Be somewhat over-protective for now! + */ + if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return 0; /* just ignore the advice */ + + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } + + *vm_flags |= VM_MERGEABLE; + break; + + case MADV_UNMERGEABLE: + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_ksm_pages(vma, start, end); + if (err) + return err; + } + + *vm_flags &= ~VM_MERGEABLE; + break; + } + + return 0; +} + +int __ksm_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int needs_wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* Check ksm_run too? Would need tighter locking */ + needs_wakeup = list_empty(&ksm_mm_head.mm_list); + + spin_lock(&ksm_mmlist_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little; when fork is followed by immediate exec, we don't + * want ksmd to waste time setting up and tearing down an rmap_list. + */ + list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + set_bit(MMF_VM_MERGEABLE, &mm->flags); + atomic_inc(&mm->mm_count); + + if (needs_wakeup) + wake_up_interruptible(&ksm_thread_wait); + + return 0; +} + +void __ksm_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int easy_to_free = 0; + + /* + * This process is exiting: if it's straightforward (as is the + * case when ksmd was never running), free mm_slot immediately. + * But if it's at the cursor or has rmap_items linked to it, use + * mmap_sem to synchronize with any break_cows before pagetables + * are freed, and leave the mm_slot on the list for ksmd to free. + * Beware: ksm may already have noticed it exiting and freed the slot. + */ + + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && ksm_scan.mm_slot != mm_slot) { + if (list_empty(&mm_slot->rmap_list)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + easy_to_free = 1; + } else { + list_move(&mm_slot->mm_list, + &ksm_scan.mm_slot->mm_list); + } + } + spin_unlock(&ksm_mmlist_lock); + + if (easy_to_free) { + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mmdrop(mm); + } else if (mm_slot) { + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } +} + +#ifdef CONFIG_SYSFS +/* + * This all compiles without CONFIG_SYSFS, but is a waste of space. + */ + +#define KSM_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define KSM_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); +} + +static ssize_t sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_thread_sleep_millisecs = msecs; + + return count; +} +KSM_ATTR(sleep_millisecs); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); +} + +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long nr_pages; + + err = strict_strtoul(buf, 10, &nr_pages); + if (err || nr_pages > UINT_MAX) + return -EINVAL; + + ksm_thread_pages_to_scan = nr_pages; + + return count; +} +KSM_ATTR(pages_to_scan); + +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_run); +} + +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long flags; + + err = strict_strtoul(buf, 10, &flags); + if (err || flags > UINT_MAX) + return -EINVAL; + if (flags > KSM_RUN_UNMERGE) + return -EINVAL; + + /* + * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. + * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, + * breaking COW to free the unswappable pages_shared (but leaves + * mm_slots on the list for when ksmd may be set running again). + */ + + mutex_lock(&ksm_thread_mutex); + if (ksm_run != flags) { + ksm_run = flags; + if (flags & KSM_RUN_UNMERGE) { + current->flags |= PF_OOM_ORIGIN; + err = unmerge_and_remove_all_rmap_items(); + current->flags &= ~PF_OOM_ORIGIN; + if (err) { + ksm_run = KSM_RUN_STOP; + count = err; + } + } + } + mutex_unlock(&ksm_thread_mutex); + + if (flags & KSM_RUN_MERGE) + wake_up_interruptible(&ksm_thread_wait); + + return count; +} +KSM_ATTR(run); + +static ssize_t max_kernel_pages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long nr_pages; + + err = strict_strtoul(buf, 10, &nr_pages); + if (err) + return -EINVAL; + + ksm_max_kernel_pages = nr_pages; + + return count; +} + +static ssize_t max_kernel_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_max_kernel_pages); +} +KSM_ATTR(max_kernel_pages); + +static ssize_t pages_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_shared); +} +KSM_ATTR_RO(pages_shared); + +static ssize_t pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_sharing); +} +KSM_ATTR_RO(pages_sharing); + +static ssize_t pages_unshared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_unshared); +} +KSM_ATTR_RO(pages_unshared); + +static ssize_t pages_volatile_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + long ksm_pages_volatile; + + ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared + - ksm_pages_sharing - ksm_pages_unshared; + /* + * It was not worth any locking to calculate that statistic, + * but it might therefore sometimes be negative: conceal that. + */ + if (ksm_pages_volatile < 0) + ksm_pages_volatile = 0; + return sprintf(buf, "%ld\n", ksm_pages_volatile); +} +KSM_ATTR_RO(pages_volatile); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_scan.seqnr); +} +KSM_ATTR_RO(full_scans); + +static struct attribute *ksm_attrs[] = { + &sleep_millisecs_attr.attr, + &pages_to_scan_attr.attr, + &run_attr.attr, + &max_kernel_pages_attr.attr, + &pages_shared_attr.attr, + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, + &full_scans_attr.attr, + NULL, +}; + +static struct attribute_group ksm_attr_group = { + .attrs = ksm_attrs, + .name = "ksm", +}; +#endif /* CONFIG_SYSFS */ + +static int __init ksm_init(void) +{ + struct task_struct *ksm_thread; + int err; + + err = ksm_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) + goto out_free1; + + ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); + if (IS_ERR(ksm_thread)) { + printk(KERN_ERR "ksm: creating kthread failed\n"); + err = PTR_ERR(ksm_thread); + goto out_free2; + } + +#ifdef CONFIG_SYSFS + err = sysfs_create_group(mm_kobj, &ksm_attr_group); + if (err) { + printk(KERN_ERR "ksm: register sysfs failed\n"); + kthread_stop(ksm_thread); + goto out_free2; + } +#endif /* CONFIG_SYSFS */ + + return 0; + +out_free2: + mm_slots_hash_free(); +out_free1: + ksm_slab_free(); +out: + return err; +} +module_init(ksm_init) diff --git a/mm/madvise.c b/mm/madvise.c index 76eb4193acd..d9ae2067952 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,6 +11,7 @@ #include <linux/mempolicy.h> #include <linux/hugetlb.h> #include <linux/sched.h> +#include <linux/ksm.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma, struct mm_struct * mm = vma->vm_mm; int error = 0; pgoff_t pgoff; - int new_flags = vma->vm_flags; + unsigned long new_flags = vma->vm_flags; switch (behavior) { case MADV_NORMAL: @@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } new_flags &= ~VM_DONTCOPY; break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -211,37 +222,16 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - long error; - switch (behavior) { - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - break; - } - case MADV_DONTFORK: - case MADV_NORMAL: - case MADV_SEQUENTIAL: - case MADV_RANDOM: - error = madvise_behavior(vma, prev, start, end, behavior); - break; case MADV_REMOVE: - error = madvise_remove(vma, prev, start, end); - break; - + return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: - error = madvise_willneed(vma, prev, start, end); - break; - + return madvise_willneed(vma, prev, start, end); case MADV_DONTNEED: - error = madvise_dontneed(vma, prev, start, end); - break; - + return madvise_dontneed(vma, prev, start, end); default: - BUG(); - break; + return madvise_behavior(vma, prev, start, end, behavior); } - return error; } static int @@ -256,12 +246,17 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif return 1; default: return 0; } } + /* * The madvise(2) system call. * @@ -286,6 +281,12 @@ madvise_behavior_valid(int behavior) * so the kernel can free resources associated with it. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * * return values: * zero - success diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fd4529d86de..9b10d875378 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -648,7 +648,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int nid = z->zone_pgdat->node_id; int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * !!file + !!active; + int lru = LRU_FILE * file + active; int ret; BUG_ON(!mem_cont); diff --git a/mm/memory.c b/mm/memory.c index e8f63d9961e..b1443ac07c0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -45,6 +45,7 @@ #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/module.h> #include <linux/delayacct.h> @@ -107,6 +108,18 @@ static int __init disable_randmaps(char *s) } __setup("norandmaps", disable_randmaps); +unsigned long zero_pfn __read_mostly; +unsigned long highest_memmap_pfn __read_mostly; + +/* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +static int __init init_zero_pfn(void) +{ + zero_pfn = page_to_pfn(ZERO_PAGE(0)); + return 0; +} +core_initcall(init_zero_pfn); /* * If a p?d_bad entry is found while walking page tables, report @@ -443,6 +456,20 @@ static inline int is_cow_mapping(unsigned int flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +#ifndef is_zero_pfn +static inline int is_zero_pfn(unsigned long pfn) +{ + return pfn == zero_pfn; +} +#endif + +#ifndef my_zero_pfn +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return zero_pfn; +} +#endif + /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -498,7 +525,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; - if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (!is_zero_pfn(pfn)) print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -520,6 +549,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } + if (is_zero_pfn(pfn)) + return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -597,8 +628,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page, vma, addr); - rss[!!PageAnon(page)]++; + page_dup_rmap(page); + rss[PageAnon(page)]++; } out_set_pte: @@ -1143,9 +1174,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; + page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) - goto bad_page; + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } if (flags & FOLL_GET) get_page(page); @@ -1173,65 +1209,46 @@ no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return page; - /* Fall through to ZERO_PAGE handling */ + no_page_table: /* * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate page tables. + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. */ - if (flags & FOLL_ANON) { - page = ZERO_PAGE(0); - if (flags & FOLL_GET) - get_page(page); - BUG_ON(flags & FOLL_WRITE); - } + if ((flags & FOLL_DUMP) && + (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); return page; } -/* Can we do the FOLL_ANON optimization? */ -static inline int use_zero_page(struct vm_area_struct *vma) -{ - /* - * We don't want to optimize FOLL_ANON for make_pages_present() - * when it tries to page in a VM_LOCKED region. As to VM_SHARED, - * we want to get the page from the page tables to make sure - * that we serialize and update with any other user of that - * mapping. - */ - if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) - return 0; - /* - * And if we have a fault routine, it's not an anonymous region. - */ - return !vma->vm_ops || !vma->vm_ops->fault; -} - - - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int flags, + unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags = 0; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); - int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); + unsigned long vm_flags; if (nr_pages <= 0) return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + /* * Require read or write permissions. - * If 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (gup_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (gup_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { struct vm_area_struct *vma; - unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -1243,7 +1260,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pte_t *pte; /* user gate pages are read-only */ - if (!ignore && write) + if (gup_flags & FOLL_WRITE) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1277,38 +1294,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - (!ignore && !(vm_flags & vma->vm_flags))) + !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, write); + &start, &nr_pages, i, gup_flags); continue; } - foll_flags = FOLL_TOUCH; - if (pages) - foll_flags |= FOLL_GET; - if (!write && use_zero_page(vma)) - foll_flags |= FOLL_ANON; - do { struct page *page; + unsigned int foll_flags = gup_flags; /* * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory, unless - * current is handling munlock--e.g., on exit. In - * that case, we are not allocating memory. Rather, - * we're only unlocking already resident/mapped pages. + * pages and potentially allocating memory. */ - if (unlikely(!ignore_sigkill && - fatal_signal_pending(current))) + if (unlikely(fatal_signal_pending(current))) return i ? i : -ERESTARTSYS; - if (write) - foll_flags |= FOLL_WRITE; - cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; @@ -1419,18 +1424,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - int flags = 0; + int flags = FOLL_TOUCH; + if (pages) + flags |= FOLL_GET; if (write) - flags |= GUP_FLAGS_WRITE; + flags |= FOLL_WRITE; if (force) - flags |= GUP_FLAGS_FORCE; + flags |= FOLL_FORCE; return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } - EXPORT_SYMBOL(get_user_pages); +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ + pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { @@ -1608,7 +1642,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather - * than insert_pfn). + * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP + * without pte special, it would there be refcounted as a normal page. */ if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { struct page *page; @@ -1974,7 +2009,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page)) { + if (PageAnon(old_page) && !PageKsm(old_page)) { if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); @@ -2075,10 +2110,19 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; - VM_BUG_ON(old_page == ZERO_PAGE(0)); - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!new_page) - goto oom; + + if (is_zero_pfn(pte_pfn(orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, address); + if (!new_page) + goto oom; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address, vma); + } + __SetPageUptodate(new_page); + /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. @@ -2088,8 +2132,6 @@ gotten: clear_page_mlock(old_page); unlock_page(old_page); } - cow_user_page(new_page, old_page, address, vma); - __SetPageUptodate(new_page); if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2115,9 +2157,14 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, address, page_table); + ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); - set_pte_at(mm, address, page_table, entry); + /* + * We call the notify macro here because, when using secondary + * mmu page tables (such as kvm shadow page tables), we want the + * new page to be mapped directly into the secondary page table. + */ + set_pte_at_notify(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { /* @@ -2625,6 +2672,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; + if (!(flags & FAULT_FLAG_WRITE)) { + entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), + vma->vm_page_prot)); + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (!pte_none(*page_table)) + goto unlock; + goto setpte; + } + /* Allocate our own private page. */ pte_unmap(page_table); @@ -2639,13 +2696,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto release; + inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); +setpte: set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4412a676c8..efe3e0ec2e6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -339,8 +339,11 @@ EXPORT_SYMBOL_GPL(__remove_pages); void online_page(struct page *page) { + unsigned long pfn = page_to_pfn(page); + totalram_pages++; - num_physpages++; + if (pfn >= num_physpages) + num_physpages = pfn + 1; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) @@ -422,6 +425,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; + zone_pcp_update(zone); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { @@ -831,7 +835,6 @@ repeat: zone->present_pages -= offlined_pages; zone->zone_pgdat->node_present_pages -= offlined_pages; totalram_pages -= offlined_pages; - num_physpages -= offlined_pages; setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); diff --git a/mm/mempool.c b/mm/mempool.c index 32e75d40050..1a3bc3d4d55 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -308,13 +308,6 @@ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) } EXPORT_SYMBOL(mempool_kmalloc); -void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t)pool_data; - return kzalloc(size, gfp_mask); -} -EXPORT_SYMBOL(mempool_kzalloc); - void mempool_kfree(void *element, void *pool_data) { kfree(element); diff --git a/mm/migrate.c b/mm/migrate.c index 939888f9dda..16052e80aaa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); count++; } @@ -147,7 +149,7 @@ out: static void remove_file_migration_ptes(struct page *old, struct page *new) { struct vm_area_struct *vma; - struct address_space *mapping = page_mapping(new); + struct address_space *mapping = new->mapping; struct prio_tree_iter iter; pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + !!page_has_private(page); + expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - + if (PageSwapBacked(page)) { + __dec_zone_page_state(page, NR_SHMEM); + __inc_zone_page_state(newpage, NR_SHMEM); + } spin_unlock_irq(&mapping->tree_lock); return 0; @@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * needs to be effective. */ try_to_free_buffers(page); + goto rcu_unlock; } - goto rcu_unlock; + goto skip_unmap; } /* Establish migration ptes or remove ptes */ try_to_unmap(page, 1); +skip_unmap: if (!page_mapped(page)) rc = move_to_new_page(newpage, page); @@ -693,6 +700,8 @@ unlock: * restored. */ list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); } @@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from, struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; int rc; + unsigned long flags; + + local_irq_save(flags); + list_for_each_entry(page, from, lru) + __inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + local_irq_restore(flags); if (!swapwrite) current->flags |= PF_SWAPWRITE; diff --git a/mm/mlock.c b/mm/mlock.c index 45eb650b965..bd6f0e466f6 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page) } /** - * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. + * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address - * @mlock: 0 indicate munlock, otherwise mlock. * - * If @mlock == 0, unlock an mlocked range; - * else mlock the range of pages. This takes care of making the pages present , - * too. + * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * * vma->vm_mm->mmap_sem must be held for at least read. */ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) + unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; int ret = 0; - int gup_flags = 0; + int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON(start < vma->vm_start); VM_BUG_ON(end > vma->vm_end); - VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && - (atomic_read(&mm->mm_users) != 0)); - - /* - * mlock: don't page populate if vma has PROT_NONE permission. - * munlock: always do munlock although the vma has PROT_NONE - * permission, or SIGKILL is pending. - */ - if (!mlock) - gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | - GUP_FLAGS_IGNORE_SIGKILL; + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + gup_flags = FOLL_TOUCH | FOLL_GET; if (vma->vm_flags & VM_WRITE) - gup_flags |= GUP_FLAGS_WRITE; + gup_flags |= FOLL_WRITE; while (nr_pages > 0) { int i; @@ -201,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, * This can happen for, e.g., VM_NONLINEAR regions before * a page has been allocated and mapped at a given offset, * or for addresses that map beyond end of a file. - * We'll mlock the the pages if/when they get faulted in. + * We'll mlock the pages if/when they get faulted in. */ if (ret < 0) break; - if (ret == 0) { - /* - * We know the vma is there, so the only time - * we cannot get a single page should be an - * error (ret < 0) case. - */ - WARN_ON(1); - break; - } lru_add_drain(); /* push cached pages to LRU */ for (i = 0; i < ret; i++) { struct page *page = pages[i]; - lock_page(page); - /* - * Because we lock page here and migration is blocked - * by the elevated reference, we need only check for - * page truncation (file-cache only). - */ if (page->mapping) { - if (mlock) + /* + * That preliminary check is mainly to avoid + * the pointless overhead of lock_page on the + * ZERO_PAGE: which might bounce very badly if + * there is contention. However, we're still + * dirtying its cacheline with get/put_page: + * we'll add another __get_user_pages flag to + * avoid it if that case turns out to matter. + */ + lock_page(page); + /* + * Because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) mlock_vma_page(page); - else - munlock_vma_page(page); + unlock_page(page); } - unlock_page(page); - put_page(page); /* ref from get_user_pages() */ - - /* - * here we assume that get_user_pages() has given us - * a list of virtually contiguous pages. - */ - addr += PAGE_SIZE; /* for next get_user_pages() */ - nr_pages--; + put_page(page); /* ref from get_user_pages() */ } + + addr += ret * PAGE_SIZE; + nr_pages -= ret; ret = 0; } - return ret; /* count entire vma as locked_vm */ + return ret; /* 0 or negative error code */ } /* @@ -289,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end, 1); + __mlock_vma_pages_range(vma, start, end); /* Hide errors from mmap() and other callers */ return 0; @@ -310,7 +291,6 @@ no_mlock: return nr_pages; /* error or pages NOT mlocked */ } - /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -330,10 +310,38 @@ no_mlock: * free them. This will result in freeing mlocked pages. */ void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end) { + unsigned long addr; + + lru_add_drain(); vma->vm_flags &= ~VM_LOCKED; - __mlock_vma_pages_range(vma, start, end, 0); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + struct page *page; + /* + * Although FOLL_DUMP is intended for get_dump_page(), + * it just so happens that its special treatment of the + * ZERO_PAGE (returning an error instead of doing get_page) + * suits munlock very well (and if somehow an abnormal page + * has sneaked into the range, we won't oops here: great). + */ + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + if (page && !IS_ERR(page)) { + lock_page(page); + /* + * Like in __mlock_vma_pages_range(), + * because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } + cond_resched(); + } } /* @@ -400,18 +408,14 @@ success: * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - vma->vm_flags = newflags; if (lock) { - ret = __mlock_vma_pages_range(vma, start, end, 1); - - if (ret > 0) { - mm->locked_vm -= ret; - ret = 0; - } else - ret = __mlock_posix_error_return(ret); /* translate if needed */ + vma->vm_flags = newflags; + ret = __mlock_vma_pages_range(vma, start, end); + if (ret < 0) + ret = __mlock_posix_error_return(ret); } else { - __mlock_vma_pages_range(vma, start, end, 0); + munlock_vma_pages_range(vma, start, end); } out: diff --git a/mm/mmap.c b/mm/mmap.c index 376492ed08f..21d4029a07b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -570,9 +570,9 @@ again: remove_next = 1 + (end > next->vm_end); /* * When changing only vma->vm_end, we don't really need - * anon_vma lock: but is that case worth optimizing out? + * anon_vma lock. */ - if (vma->anon_vma) + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) anon_vma = vma->anon_vma; if (anon_vma) { spin_lock(&anon_vma->lock); @@ -656,9 +656,6 @@ again: remove_next = 1 + (end > next->vm_end); validate_mm(mm); } -/* Flags that can be inherited from an existing mapping when merging */ -#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) - /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -666,7 +663,8 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) + /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ + if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) return 0; if (vma->vm_file != file) return 0; @@ -951,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; + if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + if (file) + return -EINVAL; + + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ @@ -965,11 +981,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - if (flags & MAP_LOCKED) { + if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; - vm_flags |= VM_LOCKED; - } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { @@ -1195,21 +1209,21 @@ munmap_back: goto unmap_and_free_vma; if (vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + pgoff = vma->vm_pgoff; + vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) goto free_vma; } - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - */ - addr = vma->vm_start; - pgoff = vma->vm_pgoff; - vm_flags = vma->vm_flags; - if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); @@ -2111,6 +2125,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); diff --git a/mm/mmu_context.c b/mm/mmu_context.c new file mode 100644 index 00000000000..ded9081f402 --- /dev/null +++ b/mm/mmu_context.c @@ -0,0 +1,58 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * + * See ../COPYING for licensing terms. + */ + +#include <linux/mm.h> +#include <linux/mmu_context.h> +#include <linux/sched.h> + +#include <asm/mmu_context.h> + +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * Called by the retry thread execute retries within the + * iocb issuer's mm context, so that copy_from/to_user + * operations work seamlessly for aio. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + active_mm = tsk->active_mm; + if (active_mm != mm) { + atomic_inc(&mm->mm_count); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); + + if (active_mm != mm) + mmdrop(active_mm); +} + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5f4ef0250be..7e33f2cb3c7 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, + pte_t pte) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->change_pte) + mn->ops->change_pte(mn, mm, address, pte); + /* + * Some drivers don't have change_pte, + * so we must call invalidate_page in that case. + */ + else if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } + rcu_read_unlock(); +} + void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address) { diff --git a/mm/mremap.c b/mm/mremap.c index a39b7b91be4..20a07dba6be 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -11,6 +11,7 @@ #include <linux/hugetlb.h> #include <linux/slab.h> #include <linux/shm.h> +#include <linux/ksm.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/capability.h> @@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; + int err; /* * We'd prefer to avoid failure later on in do_munmap: @@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + /* + * Advise KSM to break any KSM pages in the area to be moved: + * it would be confusing if they were to turn up at the new + * location, where they happen to coincide with different KSM + * pages recently unmapped. But leave vma->vm_flags as it was, + * so KSM can come around to merge on vma and new_vma afterwards. + */ + err = ksm_madvise(vma, old_addr, old_addr + old_len, + MADV_UNMERGEABLE, &vm_flags); + if (err) + return err; + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); if (!new_vma) diff --git a/mm/nommu.c b/mm/nommu.c index 66e81e7e9fe..1a4473faac4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -33,6 +33,7 @@ #include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include "internal.h" static inline __attribute__((format(printf, 1, 2))) @@ -56,8 +57,6 @@ void no_printk(const char *fmt, ...) no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) #endif -#include "internal.h" - void *high_memory; struct page *mem_map; unsigned long max_mapnr; @@ -170,21 +169,20 @@ unsigned int kobjsize(const void *objp) } int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int flags, + unsigned long start, int nr_pages, int foll_flags, struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); /* calculate required read or write permissions. - * - if 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); @@ -192,8 +190,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto finish_or_fault; /* protect what we can, including chardevs */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP) || - (!ignore && !(vm_flags & vma->vm_flags))) + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) goto finish_or_fault; if (pages) { @@ -212,7 +210,6 @@ finish_or_fault: return i ? : -EFAULT; } - /* * get a list of pages in an address range belonging to the specified process * and indicate the VMA that covers each page @@ -227,9 +224,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int flags = 0; if (write) - flags |= GUP_FLAGS_WRITE; + flags |= FOLL_WRITE; if (force) - flags |= GUP_FLAGS_FORCE; + flags |= FOLL_FORCE; return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } @@ -627,6 +624,22 @@ static void put_nommu_region(struct vm_region *region) } /* + * update protection on a vma + */ +static void protect_vma(struct vm_area_struct *vma, unsigned long flags) +{ +#ifdef CONFIG_MPU + struct mm_struct *mm = vma->vm_mm; + long start = vma->vm_start & PAGE_MASK; + while (start < vma->vm_end) { + protect_page(mm, start, flags); + start += PAGE_SIZE; + } + update_protections(mm); +#endif +} + +/* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous * page @@ -645,6 +658,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; vma->vm_mm = mm; + protect_vma(vma, vma->vm_flags); + /* add the VMA to the mapping */ if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -707,6 +722,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) kenter("%p", vma); + protect_vma(vma, 0); + mm->map_count--; if (mm->mmap_cache == vma) mm->mmap_cache = NULL; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922..ea2147dabba 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks; static DEFINE_SPINLOCK(zone_scan_lock); /* #define DEBUG */ +/* + * Is all threads of the target process nodes overlap ours? + */ +static int has_intersects_mems_allowed(struct task_struct *tsk) +{ + struct task_struct *t; + + t = tsk; + do { + if (cpuset_mems_allowed_intersects(current, t)) + return 1; + t = next_thread(t); + } while (t != tsk); + + return 0; +} + /** * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate @@ -58,6 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj = p->signal->oom_adj; + struct task_cputime task_time; + unsigned long utime; + unsigned long stime; + + if (oom_adj == OOM_DISABLE) + return 0; task_lock(p); mm = p->mm; @@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) /* * swapoff can easily use up all memory, so kill those first. */ - if (p->flags & PF_SWAPOFF) + if (p->flags & PF_OOM_ORIGIN) return ULONG_MAX; /* @@ -102,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * of seconds. There is no particular reason for this other than * that it turned out to work very well in practice. */ - cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) - >> (SHIFT_HZ + 3); + thread_group_cputime(p, &task_time); + utime = cputime_to_jiffies(task_time.utime); + stime = cputime_to_jiffies(task_time.stime); + cpu_time = (utime + stime) >> (SHIFT_HZ + 3); + if (uptime >= p->start_time.tv_sec) run_time = (uptime - p->start_time.tv_sec) >> 10; @@ -144,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * because p may have allocated or otherwise mapped memory on * this node before. However it will be less likely. */ - if (!cpuset_mems_allowed_intersects(current, p)) + if (!has_intersects_mems_allowed(p)) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. */ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -200,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, static struct task_struct *select_bad_process(unsigned long *ppoints, struct mem_cgroup *mem) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; struct timespec uptime; *ppoints = 0; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) { + for_each_process(p) { unsigned long points; /* @@ -251,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) + if (p->signal->oom_adj == OOM_DISABLE) continue; points = badness(p, uptime.tv_sec); @@ -259,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, chosen = p; *ppoints = points; } - } while_each_thread(g, p); + } return chosen; } @@ -304,7 +331,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); @@ -346,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose) static int oom_kill_task(struct task_struct *p) { - struct mm_struct *mm; - struct task_struct *g, *q; - - mm = p->mm; - /* WARNING: mm may not be dereferenced since we did not obtain its * value from get_task_mm(p). This is OK since all we need to do is * compare mm to q->mm below. @@ -359,30 +381,11 @@ static int oom_kill_task(struct task_struct *p) * change to NULL at any time since we do not hold task_lock(p). * However, this is of no concern to us. */ - - if (mm == NULL) + if (!p->mm || p->signal->oom_adj == OOM_DISABLE) return 1; - /* - * Don't kill the process if any threads are set to OOM_DISABLE - */ - do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) - return 1; - } while_each_thread(g, q); - __oom_kill_task(p, 1); - /* - * kill all processes that share the ->mm (i.e. all threads), - * but are in a different thread group. Don't let them have access - * to memory reserves though, otherwise we might deplete all memory. - */ - do_each_thread(g, q) { - if (q->mm == mm && !same_thread_group(q, p)) - force_sig(SIGKILL, q); - } while_each_thread(g, q); - return 0; } @@ -394,8 +397,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, if (printk_ratelimit()) { printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->signal->oom_adj); task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d1ba4644105..5f378dd5880 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -380,7 +380,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -404,7 +405,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0de15f4698..5717f27a070 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> +#include <trace/events/kmem.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -71,7 +72,6 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -123,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = { int min_free_kbytes = 1024; -unsigned long __meminitdata nr_kernel_pages; -unsigned long __meminitdata nr_all_pages; +static unsigned long __meminitdata nr_kernel_pages; +static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_ARCH_POPULATES_NODE_MAP @@ -510,7 +510,7 @@ static inline int free_pages_check(struct page *page) } /* - * Frees a list of pages. + * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -520,22 +520,42 @@ static inline int free_pages_check(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static void free_pages_bulk(struct zone *zone, int count, - struct list_head *list, int order) +static void free_pcppages_bulk(struct zone *zone, int count, + struct per_cpu_pages *pcp) { + int migratetype = 0; + int batch_free = 0; + spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); - while (count--) { + __mod_zone_page_state(zone, NR_FREE_PAGES, count); + while (count) { struct page *page; + struct list_head *list; - VM_BUG_ON(list_empty(list)); - page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_one_page list manipulates */ - list_del(&page->lru); - __free_one_page(page, zone, order, page_private(page)); + /* + * Remove pages from lists in a round-robin fashion. A + * batch_free count is maintained that is incremented when an + * empty list is encountered. This is so more pages are freed + * off fuller lists instead of spinning excessively around empty + * lists + */ + do { + batch_free++; + if (++migratetype == MIGRATE_PCPTYPES) + migratetype = 0; + list = &pcp->lists[migratetype]; + } while (list_empty(list)); + + do { + page = list_entry(list->prev, struct page, lru); + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + __free_one_page(page, zone, 0, migratetype); + trace_mm_page_pcpu_drain(page, 0, migratetype); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } @@ -557,7 +577,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; - int wasMlocked = TestClearPageMlocked(page); + int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, order); @@ -783,6 +803,17 @@ static int move_freepages_block(struct zone *zone, struct page *page, return move_freepages(zone, start_page, end_page, migratetype); } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) @@ -836,11 +867,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - if (current_order == pageblock_order) - set_pageblock_migratetype(page, + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) + change_pageblock_range(page, current_order, start_migratetype); expand(zone, page, order, current_order, area, migratetype); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype); + return page; } } @@ -874,6 +910,7 @@ retry_reserve: } } + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } @@ -934,7 +971,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->batch; else to_drain = pcp->count; - free_pages_bulk(zone, to_drain, &pcp->list, 0); + free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; local_irq_restore(flags); } @@ -960,7 +997,7 @@ static void drain_pages(unsigned int cpu) pcp = &pset->pcp; local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; local_irq_restore(flags); } @@ -1026,7 +1063,8 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; - int wasMlocked = TestClearPageMlocked(page); + int migratetype; + int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, 0); @@ -1043,35 +1081,49 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; - set_page_private(page, get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_page_private(page, migratetype); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_event(PGFREE); + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Free ISOLATE pages back to the allocator because they are being + * offlined but treat RESERVE as movable pages so we can get those + * areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ + if (migratetype >= MIGRATE_PCPTYPES) { + if (unlikely(migratetype == MIGRATE_ISOLATE)) { + free_one_page(zone, page, 0, migratetype); + goto out; + } + migratetype = MIGRATE_MOVABLE; + } + if (cold) - list_add_tail(&page->lru, &pcp->list); + list_add_tail(&page->lru, &pcp->lists[migratetype]); else - list_add(&page->lru, &pcp->list); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->batch, pcp); pcp->count -= pcp->batch; } + +out: local_irq_restore(flags); put_cpu(); } void free_hot_page(struct page *page) { + trace_mm_page_free_direct(page, 0); free_hot_cold_page(page, 0); } -void free_cold_page(struct page *page) -{ - free_hot_cold_page(page, 1); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] @@ -1119,35 +1171,23 @@ again: cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; + struct list_head *list; pcp = &zone_pcp(zone, cpu)->pcp; + list = &pcp->lists[migratetype]; local_irq_save(flags); - if (!pcp->count) { - pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, migratetype, cold); - if (unlikely(!pcp->count)) + if (unlikely(list_empty(list))) goto failed; } - /* Find a page of the appropriate migrate type */ - if (cold) { - list_for_each_entry_reverse(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } else { - list_for_each_entry(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } - - /* Allocate more to the pcp list if necessary */ - if (unlikely(&page->lru == &pcp->list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, - migratetype, cold); - page = list_entry(pcp->list.next, struct page, lru); - } + if (cold) + page = list_entry(list->prev, struct page, lru); + else + page = list_entry(list->next, struct page, lru); list_del(&page->lru); pcp->count--; @@ -1627,10 +1667,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - - /* - * The task's cpuset might have expanded its set of allowable nodes - */ p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; @@ -1765,6 +1801,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, wake_all_kswapd(order, zonelist, high_zoneidx); +restart: /* * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according @@ -1772,7 +1809,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ alloc_flags = gfp_to_alloc_flags(gfp_mask); -restart: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -1907,6 +1943,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -1916,44 +1953,41 @@ EXPORT_SYMBOL(__alloc_pages_nodemask); */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { - struct page * page; + struct page *page; + + /* + * __get_free_pages() returns a 32-bit address, which cannot represent + * a highmem page + */ + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + page = alloc_pages(gfp_mask, order); if (!page) return 0; return (unsigned long) page_address(page); } - EXPORT_SYMBOL(__get_free_pages); unsigned long get_zeroed_page(gfp_t gfp_mask) { - struct page * page; - - /* - * get_zeroed_page() returns a 32-bit address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask | __GFP_ZERO, 0); - if (page) - return (unsigned long) page_address(page); - return 0; + return __get_free_pages(gfp_mask | __GFP_ZERO, 0); } - EXPORT_SYMBOL(get_zeroed_page); void __pagevec_free(struct pagevec *pvec) { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { + trace_mm_pagevec_free(pvec->pages[i], pvec->cold); free_hot_cold_page(pvec->pages[i], pvec->cold); + } } void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { + trace_mm_page_free_direct(page, order); if (order == 0) free_hot_page(page); else @@ -2128,23 +2162,28 @@ void show_free_areas(void) } } - printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" - " inactive_file:%lu" + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", + " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" + " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_ISOLATED_ANON), + global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_FILE), + global_page_state(NR_ISOLATED_FILE), global_page_state(NR_UNEVICTABLE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), + nr_blockdev_pages(), global_page_state(NR_FREE_PAGES), - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_page_state(NR_SLAB_RECLAIMABLE), + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), + global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); @@ -2162,7 +2201,21 @@ void show_free_areas(void) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" " present:%lukB" + " mlocked:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " mapped:%lukB" + " shmem:%lukB" + " slab_reclaimable:%lukB" + " slab_unreclaimable:%lukB" + " kernel_stack:%lukB" + " pagetables:%lukB" + " unstable:%lukB" + " bounce:%lukB" + " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", @@ -2176,7 +2229,22 @@ void show_free_areas(void) K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone_page_state(zone, NR_UNEVICTABLE)), + K(zone_page_state(zone, NR_ISOLATED_ANON)), + K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_FILE_DIRTY)), + K(zone_page_state(zone, NR_WRITEBACK)), + K(zone_page_state(zone, NR_FILE_MAPPED)), + K(zone_page_state(zone, NR_SHMEM)), + K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), + K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), + zone_page_state(zone, NR_KERNEL_STACK) * + THREAD_SIZE / 1024, + K(zone_page_state(zone, NR_PAGETABLE)), + K(zone_page_state(zone, NR_UNSTABLE_NFS)), + K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") ); @@ -2783,7 +2851,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) { unsigned long start_pfn, pfn, end_pfn; struct page *page; - unsigned long reserve, block_migratetype; + unsigned long block_migratetype; + int reserve; /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; @@ -2791,6 +2860,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; + /* + * Reserve blocks are generally in place to help high-order atomic + * allocations that are short-lived. A min_free_kbytes value that + * would result in more than 2 reserve blocks for atomic allocations + * is assumed to be in place to help anti-fragmentation for the + * future allocation of hugepages at runtime. + */ + reserve = min(2, reserve); + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) continue; @@ -2961,6 +3039,7 @@ static int zone_batchsize(struct zone *zone) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; + int migratetype; memset(p, 0, sizeof(*p)); @@ -2968,7 +3047,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp->count = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); - INIT_LIST_HEAD(&pcp->list); + for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) + INIT_LIST_HEAD(&pcp->lists[migratetype]); } /* @@ -3146,6 +3226,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) return 0; } +static int __zone_pcp_update(void *data) +{ + struct zone *zone = data; + int cpu; + unsigned long batch = zone_batchsize(zone), flags; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + pset = zone_pcp(zone, cpu); + pcp = &pset->pcp; + + local_irq_save(flags); + free_pcppages_bulk(zone, pcp->count, pcp); + setup_pageset(pset, batch); + local_irq_restore(flags); + } + return 0; +} + +void zone_pcp_update(struct zone *zone) +{ + stop_machine(__zone_pcp_update, zone, NULL); +} + static __meminit void zone_pcp_init(struct zone *zone) { int cpu; @@ -3720,7 +3826,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_saved_scan = 0; + zone->reclaim_stat.nr_saved_scan[l] = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; @@ -4509,7 +4615,7 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } -/** +/* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4732,7 +4838,14 @@ void *__init alloc_large_system_hash(const char *tablename, numentries <<= (PAGE_SHIFT - scale); /* Make sure we've got at least a 0-order allocation.. */ - if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + if (unlikely(flags & HASH_SMALL)) { + /* Makes no sense without HASH_EARLY */ + WARN_ON(!(flags & HASH_EARLY)); + if (!(numentries >> *_hash_shift)) { + numentries = 1UL << *_hash_shift; + BUG_ON(!numentries); + } + } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); @@ -4874,13 +4987,16 @@ int set_migratetype_isolate(struct page *page) struct zone *zone; unsigned long flags; int ret = -EBUSY; + int zone_idx; zone = page_zone(page); + zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); /* * In future, more migrate types will be able to be isolation target. */ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && + zone_idx != ZONE_MOVABLE) goto out; set_pageblock_migratetype(page, MIGRATE_ISOLATE); move_freepages_block(zone, page, MIGRATE_ISOLATE); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f22b4ebbd8d..3d535d59482 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -116,10 +116,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; VM_BUG_ON(!slab_is_available()); - base = kmalloc_node(table_size, + if (node_state(nid, N_HIGH_MEMORY)) { + base = kmalloc_node(table_size, GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); + if (!base) + base = vmalloc_node(table_size, nid); + } else { + base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); + if (!base) + base = vmalloc(table_size); + } } else { /* * We don't have to allocate page_cgroup again, but diff --git a/mm/rmap.c b/mm/rmap.c index 0895b5c7cbf..720fc03a7bc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -710,27 +710,6 @@ void page_add_file_rmap(struct page *page) } } -#ifdef CONFIG_DEBUG_VM -/** - * page_dup_rmap - duplicate pte mapping to a page - * @page: the page to add the mapping to - * @vma: the vm area being duplicated - * @address: the user virtual address mapped - * - * For copy_page_range only: minimal extract from page_add_file_rmap / - * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's - * quicker. - * - * The caller needs to hold the pte lock. - */ -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) -{ - if (PageAnon(page)) - __page_check_anon_rmap(page, vma, address); - atomic_inc(&page->_mapcount); -} -#endif - /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from @@ -739,34 +718,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long */ void page_remove_rmap(struct page *page) { - if (atomic_add_negative(-1, &page->_mapcount)) { - /* - * Now that the last pte has gone, s390 must transfer dirty - * flag from storage key to struct page. We can usually skip - * this if the page is anon, so about to be freed; but perhaps - * not if it's in swapcache - there might be another pte slot - * containing the swap entry, but page not yet written to swap. - */ - if ((!PageAnon(page) || PageSwapCache(page)) && - page_test_dirty(page)) { - page_clear_dirty(page); - set_page_dirty(page); - } - if (PageAnon(page)) - mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, - PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); - mem_cgroup_update_mapped_file_stat(page, -1); - /* - * It would be tidy to reset the PageAnon mapping here, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, - * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. - */ + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + return; + + /* + * Now that the last pte has gone, s390 must transfer dirty + * flag from storage key to struct page. We can usually skip + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. + */ + if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { + page_clear_dirty(page); + set_page_dirty(page); } + if (PageAnon(page)) { + mem_cgroup_uncharge_page(page); + __dec_zone_page_state(page, NR_ANON_PAGES); + } else { + __dec_zone_page_state(page, NR_FILE_MAPPED); + } + mem_cgroup_update_mapped_file_stat(page, -1); + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ } /* diff --git a/mm/shmem.c b/mm/shmem.c index bd20f8bb02a..b206a7a32e2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -49,7 +49,6 @@ static struct vfsmount *shm_mnt; #include <linux/backing-dev.h> #include <linux/shmem_fs.h> #include <linux/writeback.h> -#include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/security.h> #include <linux/swapops.h> @@ -1097,6 +1096,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ swapcache_free(swap, NULL); redirty: set_page_dirty(page); @@ -2306,17 +2309,14 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ - sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), + sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return -ENOMEM; - sbinfo->max_blocks = 0; - sbinfo->max_inodes = 0; sbinfo->mode = S_IRWXUGO | S_ISVTX; sbinfo->uid = current_fsuid(); sbinfo->gid = current_fsgid(); - sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS @@ -2590,6 +2590,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } +int shmem_lock(struct file *file, int lock, struct user_struct *user) +{ + return 0; +} + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) diff --git a/mm/slab.c b/mm/slab.c index 7b5d4deacfc..7dfa481c96b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void) * Fragmentation resistance on low memory - only use bigger * page orders on machines with more than 32MB of memory. */ - if (num_physpages > (32 << 20) >> PAGE_SHIFT) + if (totalram_pages > (32 << 20) >> PAGE_SHIFT) slab_break_gfp_order = BREAK_GFP_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated diff --git a/mm/slub.c b/mm/slub.c index 0a216aae227..4996fc71955 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3345,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, { struct kmem_cache *s; + if (WARN_ON(!name)) + return NULL; + down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a13ea6401ae..d9714bdcb4a 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. */ if (slab_is_available()) { - struct page *page = alloc_pages_node(node, + struct page *page; + + if (node_state(node, N_HIGH_MEMORY)) + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, get_order(size)); + else + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(size)); if (page) return page_address(page); return NULL; diff --git a/mm/sparse.c b/mm/sparse.c index da432d9f0ae..6ce4aab69e9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) - section = kmalloc_node(array_size, GFP_KERNEL, nid); - else + if (slab_is_available()) { + if (node_state(nid, N_HIGH_MEMORY)) + section = kmalloc_node(array_size, GFP_KERNEL, nid); + else + section = kmalloc(array_size, GFP_KERNEL); + } else section = alloc_bootmem_node(NODE_DATA(nid), array_size); if (section) diff --git a/mm/swap.c b/mm/swap.c index cb29ae5d33a..308e57d8d7e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec) spin_lock(&zone->lru_lock); } if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_is_file_cache(page); + int lru = page_lru_base_type(page); list_move_tail(&page->lru, &zone->lru[lru].list); pgmoved++; } @@ -181,7 +181,7 @@ void activate_page(struct page *page) spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { int file = page_is_file_cache(page); - int lru = LRU_BASE + file; + int lru = page_lru_base_type(page); del_page_from_lru_list(zone, page, lru); SetPageActive(page); @@ -189,7 +189,7 @@ void activate_page(struct page *page) add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(zone, page, !!file, 1); + update_page_reclaim_stat(zone, page, file, 1); } spin_unlock_irq(&zone->lru_lock); } @@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); */ void __init swap_setup(void) { - unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP bdi_init(swapper_space.backing_dev_info); diff --git a/mm/swap_state.c b/mm/swap_state.c index 5ae6b8b78c8..6d1daeb1cb4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -67,10 +67,10 @@ void show_swap_cache_info(void) } /* - * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +static int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -78,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) VM_BUG_ON(PageSwapCache(page)); VM_BUG_ON(!PageSwapBacked(page)); + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + + spin_lock_irq(&swapper_space.tree_lock); + error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + if (likely(!error)) { + total_swapcache_pages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); + } + spin_unlock_irq(&swapper_space.tree_lock); + + if (unlikely(error)) { + /* + * Only the context which have set SWAP_HAS_CACHE flag + * would call add_to_swap_cache(). + * So add_to_swap_cache() doesn't returns -EEXIST. + */ + VM_BUG_ON(error == -EEXIST); + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } + + return error; +} + + +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +{ + int error; + error = radix_tree_preload(gfp_mask); if (!error) { - page_cache_get(page); - SetPageSwapCache(page); - set_page_private(page, entry.val); - - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, - entry.val, page); - if (likely(!error)) { - total_swapcache_pages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); - } - spin_unlock_irq(&swapper_space.tree_lock); + error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); - - if (unlikely(error)) { - set_page_private(page, 0UL); - ClearPageSwapCache(page); - page_cache_release(page); - } } return error; } @@ -137,38 +152,34 @@ int add_to_swap(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageUptodate(page)); - for (;;) { - entry = get_swap_page(); - if (!entry.val) - return 0; + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Radix-tree node allocations from PF_MEMALLOC contexts could + * completely exhaust the page allocator. __GFP_NOMEMALLOC + * stops emergency reserves from being allocated. + * + * TODO: this could cause a theoretical memory reclaim + * deadlock in the swap out path. + */ + /* + * Add it to the swap cache and mark it dirty + */ + err = add_to_swap_cache(page, entry, + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); + + if (!err) { /* Success */ + SetPageDirty(page); + return 1; + } else { /* -ENOMEM radix-tree allocation failure */ /* - * Radix-tree node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - /* - * Add it to the swap cache and mark it dirty + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. */ - err = add_to_swap_cache(page, entry, - __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - switch (err) { - case 0: /* Success */ - SetPageDirty(page); - return 1; - case -EEXIST: - /* Raced with "speculative" read_swap_cache_async */ - swapcache_free(entry, NULL); - continue; - default: - /* -ENOMEM radix-tree allocation failure */ - swapcache_free(entry, NULL); - return 0; - } + swapcache_free(entry, NULL); + return 0; } } @@ -290,26 +301,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* + * call radix_tree_preload() while we can wait. + */ + err = radix_tree_preload(gfp_mask & GFP_KERNEL); + if (err) + break; + + /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) /* seems racy */ + if (err == -EEXIST) { /* seems racy */ + radix_tree_preload_end(); continue; - if (err) /* swp entry is obsolete ? */ + } + if (err) { /* swp entry is obsolete ? */ + radix_tree_preload_end(); break; + } - /* - * Associate the page with swap entry in the swap cache. - * May fail (-EEXIST) if there is already a page associated - * with this entry in the swap cache: added by a racing - * read_swap_cache_async, or add_to_swap or shmem_writepage - * re-using the just freed swap entry for an existing page. - * May fail (-ENOMEM) if radix-tree node allocation failed. - */ + /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __set_page_locked(new_page); SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); + err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { + radix_tree_preload_end(); /* * Initiate read into locked page and return. */ @@ -317,8 +333,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, swap_readpage(new_page); return new_page; } + radix_tree_preload_end(); ClearPageSwapBacked(new_page); __clear_page_locked(new_page); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ swapcache_free(entry, NULL); } while (err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index 74f1102e874..f1bf19daadc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1575,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->flags &= ~SWP_WRITEOK; spin_unlock(&swap_lock); - current->flags |= PF_SWAPOFF; + current->flags |= PF_OOM_ORIGIN; err = try_to_unuse(type); - current->flags &= ~PF_SWAPOFF; + current->flags &= ~PF_OOM_ORIGIN; if (err) { /* re-insert swap space back into swap_list */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 204b8243d8a..5535da1d696 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -25,7 +25,7 @@ #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> - +#include <linux/highmem.h> #include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> @@ -168,11 +168,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, next = pgd_addr_end(addr, end); err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); if (err) - break; + return err; } while (pgd++, addr = next, addr != end); - if (unlikely(err)) - return err; return nr; } @@ -1272,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; struct vm_struct *tmp, **p; - - vmap_debug_free_range(va->va_start, va->va_end); - free_unmap_vmap_area(va); - vm->size -= PAGE_SIZE; - + /* + * remove from list and disallow access to this vm_struct + * before unmap. (address range confliction is maintained by + * vmap.) + */ write_lock(&vmlist_lock); for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) ; *p = tmp->next; write_unlock(&vmlist_lock); + vmap_debug_free_range(va->va_start, va->va_end); + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + return vm; } return NULL; @@ -1384,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > num_physpages) + if (count > totalram_pages) return NULL; area = get_vm_area_caller((count << PAGE_SHIFT), flags, @@ -1491,7 +1493,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, @@ -1641,10 +1643,120 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user); +/* + * small helper routine , copy contents to buf from addr. + * If the page is not present, fill zero. + */ + +static int aligned_vread(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p, KM_USER0); + memcpy(buf, map + offset, length); + kunmap_atomic(map, KM_USER0); + } else + memset(buf, 0, length); + + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +static int aligned_vwrite(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p, KM_USER0); + memcpy(map + offset, buf, length); + kunmap_atomic(map, KM_USER0); + } + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +/** + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be increased. + * (same number to @count). Returns 0 if [addr...addr+count) doesn't + * includes any intersect with alive vmalloc area. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. + * @buf should be kernel's buffer. Because this function uses KM_USER0, + * the caller should guarantee KM_USER0 is not used. + * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + */ + long vread(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; char *vaddr, *buf_start = buf; + unsigned long buflen = count; unsigned long n; /* Don't allow overflow */ @@ -1652,7 +1764,7 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { + for (tmp = vmlist; count && tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; if (addr >= vaddr + tmp->size - PAGE_SIZE) continue; @@ -1665,32 +1777,72 @@ long vread(char *buf, char *addr, unsigned long count) count--; } n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); + if (n > count) + n = count; + if (!(tmp->flags & VM_IOREMAP)) + aligned_vread(buf, addr, n); + else /* IOREMAP area is treated as memory hole */ + memset(buf, 0, n); + buf += n; + addr += n; + count -= n; } finished: read_unlock(&vmlist_lock); - return buf - buf_start; + + if (buf == buf_start) + return 0; + /* zero-fill memory holes */ + if (buf != buf_start + buflen) + memset(buf, 0, buflen - (buf - buf_start)); + + return buflen; } +/** + * vwrite() - write vmalloc area in a safe way. + * @buf: buffer for source data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be incresed. + * (same number to @count). + * If [addr...addr+count) doesn't includes any intersect with valid + * vmalloc area, returns 0. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from a buffer to the given addr. If specified range of + * [addr...addr+count) includes some valid address, data is copied from + * proper area of @buf. If there are memory holes, no copy to hole. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. + * @buf should be kernel's buffer. Because this function uses KM_USER0, + * the caller should guarantee KM_USER0 is not used. + * + * Note: In usual ops, vwrite() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + * The caller should guarantee KM_USER1 is not used. + */ + long vwrite(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; + char *vaddr; + unsigned long n, buflen; + int copied = 0; /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; + buflen = count; read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { + for (tmp = vmlist; count && tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; if (addr >= vaddr + tmp->size - PAGE_SIZE) continue; @@ -1702,18 +1854,21 @@ long vwrite(char *buf, char *addr, unsigned long count) count--; } n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *addr = *buf; - buf++; - addr++; - count--; - } while (--n > 0); + if (n > count) + n = count; + if (!(tmp->flags & VM_IOREMAP)) { + aligned_vwrite(buf, addr, n); + copied++; + } + buf += n; + addr += n; + count -= n; } finished: read_unlock(&vmlist_lock); - return buf - buf_start; + if (!copied) + return 0; + return buflen; } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index ba8228e0a80..613e89f471d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -148,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, return &zone->reclaim_stat; } -static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, - enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct zone *zone, + struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); @@ -286,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!page_has_private(page) == 2; + /* + * A freeable page cache page is referenced only by the caller + * that isolated the page, the page cache radix tree and + * optional buffer heads at page->private. + */ + return page_count(page) - page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -361,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. - * See swapfile.c:page_queue_congested(). */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; @@ -531,7 +535,7 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_is_file_cache(page); + lru = active + page_lru_base_type(page); lru_cache_add_lru(page, lru); } else { /* @@ -821,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; - if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) return ret; /* @@ -935,6 +939,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; + + /* + * If we don't have enough swap space, reclaiming of + * anon page which don't already have a swap slot is + * pointless. + */ + if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + !PageSwapCache(cursor_page)) + continue; + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); @@ -961,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr, if (file) lru += LRU_FILE; return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, !!file); + mode, file); } /* @@ -976,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { - lru = page_is_file_cache(page); + lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); @@ -1034,6 +1048,31 @@ int isolate_lru_page(struct page *page) } /* + * Are there way too many processes in the direct reclaim path already? + */ +static int too_many_isolated(struct zone *zone, int file, + struct scan_control *sc) +{ + unsigned long inactive, isolated; + + if (current_is_kswapd()) + return 0; + + if (!scanning_global_lru(sc)) + return 0; + + if (file) { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } + + return isolated > inactive; +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1048,6 +1087,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int lumpy_reclaim = 0; + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(WRITE, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + /* * If we need a large contiguous chunk of memory, or have * trouble getting a small set of contiguous pages, we @@ -1072,10 +1119,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; + unsigned long nr_anon; + unsigned long nr_file; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); + + if (scanning_global_lru(sc)) { + zone->pages_scanned += nr_scan; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scan); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scan); + } + + if (nr_taken == 0) + goto done; + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); @@ -1088,8 +1151,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_INACTIVE_ANON, -count[LRU_INACTIVE_ANON]); - if (scanning_global_lru(sc)) - zone->pages_scanned += nr_scan; + nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; @@ -1123,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, } nr_reclaimed += nr_freed; + local_irq_disable(); - if (current_is_kswapd()) { - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_freed); - } else if (scanning_global_lru(sc)) - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); - if (nr_taken == 0) - goto done; - spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. @@ -1153,8 +1212,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); - if (PageActive(page)) { - int file = !!page_is_file_cache(page); + if (is_active_lru(lru)) { + int file = is_file_lru(lru); reclaim_stat->recent_rotated[file]++; } if (!pagevec_add(&pvec, page)) { @@ -1163,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, spin_lock_irq(&zone->lru_lock); } } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + } while (nr_scanned < max_scan); - spin_unlock(&zone->lru_lock); + done: - local_irq_enable(); + spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); return nr_reclaimed; } @@ -1215,15 +1277,10 @@ static void move_active_pages_to_lru(struct zone *zone, while (!list_empty(list)) { page = lru_to_page(list); - prefetchw_prev_lru_page(page, list, flags); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - if (!is_active_lru(lru)) - ClearPageActive(page); /* we are de-activating */ - list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); pgmoved++; @@ -1244,7 +1301,7 @@ static void move_active_pages_to_lru(struct zone *zone, static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { - unsigned long pgmoved; + unsigned long nr_taken; unsigned long pgscanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ @@ -1252,10 +1309,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_inactive); struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_rotated = 0; lru_add_drain(); spin_lock_irq(&zone->lru_lock); - pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, + nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, sc->mem_cgroup, 1, file); /* @@ -1265,16 +1323,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, if (scanning_global_lru(sc)) { zone->pages_scanned += pgscanned; } - reclaim_stat->recent_scanned[!!file] += pgmoved; + reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1288,7 +1346,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - pgmoved++; + nr_rotated++; /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -1304,6 +1362,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } + ClearPageActive(page); /* we are de-activating */ list_add(&page->lru, &l_inactive); } @@ -1317,13 +1376,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * helps balance scan pressure between file and anonymous pages in * get_scan_ratio. */ - reclaim_stat->recent_rotated[!!file] += pgmoved; + reclaim_stat->recent_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); move_active_pages_to_lru(zone, &l_inactive, LRU_BASE + file * LRU_FILE); - + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); } @@ -1429,10 +1488,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); @@ -1526,6 +1585,7 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int noswap = 0; /* If we have no swap space, do not bother scanning anon pages. */ @@ -1540,17 +1600,14 @@ static void shrink_zone(int priority, struct zone *zone, int file = is_file_lru(l); unsigned long scan; - scan = zone_nr_pages(zone, sc, l); + scan = zone_nr_lru_pages(zone, sc, l); if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) - nr[l] = nr_scan_try_batch(scan, - &zone->lru[l].nr_saved_scan, - swap_cluster_max); - else - nr[l] = scan; + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l], + swap_cluster_max); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1685,7 +1742,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } } @@ -1902,7 +1959,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } /* @@ -1946,7 +2003,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_lru_pages(zone) * 6)) + (zone_reclaimable_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -2113,12 +2170,39 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } -unsigned long global_lru_pages(void) +/* + * The reclaimable count would be mostly accurate. + * The less reclaimable pages may be + * - mlocked pages, which will be moved to unevictable list when encountered + * - mapped pages, which may require several travels to be reclaimed + * - dirty pages, which is not "instantly" reclaimable + */ +unsigned long global_reclaimable_pages(void) +{ + int nr; + + nr = global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON); + + return nr; +} + +unsigned long zone_reclaimable_pages(struct zone *zone) { - return global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_INACTIVE_FILE); + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; } #ifdef CONFIG_HIBERNATION @@ -2133,6 +2217,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, { struct zone *zone; unsigned long nr_reclaimed = 0; + struct zone_reclaim_stat *reclaim_stat; for_each_populated_zone(zone) { enum lru_list l; @@ -2149,11 +2234,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { + reclaim_stat = get_reclaim_stat(zone, sc); + reclaim_stat->nr_saved_scan[l] += + (lru_pages >> prio) + 1; + if (reclaim_stat->nr_saved_scan[l] + >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_saved_scan = 0; + reclaim_stat->nr_saved_scan[l] = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); @@ -2190,7 +2278,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; - lru_pages = global_lru_pages(); + lru_pages = global_reclaimable_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -2232,7 +2320,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); + global_reclaimable_pages()); sc.nr_reclaimed += reclaim_state.reclaimed_slab; if (sc.nr_reclaimed >= nr_pages) goto out; @@ -2249,7 +2337,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!sc.nr_reclaimed) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, + global_reclaimable_pages()); sc.nr_reclaimed += reclaim_state.reclaimed_slab; } while (sc.nr_reclaimed < nr_pages && reclaim_state.reclaimed_slab > 0); @@ -2569,7 +2658,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { - enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); diff --git a/mm/vmstat.c b/mm/vmstat.c index 138bed53706..c81321f9fee 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -639,11 +639,14 @@ static const char * const vmstat_text[] = { "nr_slab_reclaimable", "nr_slab_unreclaimable", "nr_page_table_pages", + "nr_kernel_stack", "nr_unstable", "nr_bounce", "nr_vmscan_write", "nr_writeback_temp", - + "nr_isolated_anon", + "nr_isolated_file", + "nr_shmem", #ifdef CONFIG_NUMA "numa_hit", "numa_miss", diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 09bedeb5579..49d8495d69b 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -577,11 +577,6 @@ static int hidp_session(void *arg) } if (session->hid) { - if (session->hid->claimed & HID_CLAIMED_INPUT) - hidinput_disconnect(session->hid); - if (session->hid->claimed & HID_CLAIMED_HIDRAW) - hidraw_disconnect(session->hid); - hid_destroy_device(session->hid); session->hid = NULL; } @@ -747,8 +742,6 @@ static void hidp_stop(struct hid_device *hid) skb_queue_purge(&session->ctrl_transmit); skb_queue_purge(&session->intr_transmit); - if (hid->claimed & HID_CLAIMED_INPUT) - hidinput_disconnect(hid); hid->claimed = 0; } diff --git a/net/core/sock.c b/net/core/sock.c index 30d5446512f..524712a7b15 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1206,12 +1206,12 @@ EXPORT_SYMBOL_GPL(sk_setup_caps); void __init sk_init(void) { - if (num_physpages <= 4096) { + if (totalram_pages <= 4096) { sysctl_wmem_max = 32767; sysctl_rmem_max = 32767; sysctl_wmem_default = 32767; sysctl_rmem_default = 32767; - } else if (num_physpages >= 131072) { + } else if (totalram_pages >= 131072) { sysctl_wmem_max = 131071; sysctl_rmem_max = 131071; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 923db06c7e5..bc4467082a0 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1049,10 +1049,10 @@ static int __init dccp_init(void) * * The methodology is similar to that of the buffer cache. */ - if (num_physpages >= (128 * 1024)) - goal = num_physpages >> (21 - PAGE_SHIFT); + if (totalram_pages >= (128 * 1024)) + goal = totalram_pages >> (21 - PAGE_SHIFT); else - goal = num_physpages >> (23 - PAGE_SHIFT); + goal = totalram_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 9383d3e5a1a..57662cabaf9 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1750,7 +1750,7 @@ void __init dn_route_init(void) dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); - goal = num_physpages >> (26 - PAGE_SHIFT); + goal = totalram_pages >> (26 - PAGE_SHIFT); for(order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 91867d3e632..df934731453 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3414,7 +3414,7 @@ int __init ip_rt_init(void) alloc_large_system_hash("IP route cache", sizeof(struct rt_hash_bucket), rhash_entries, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 15 : 17, 0, &rt_hash_log, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 19a0612b8a2..21387ebabf0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2862,7 +2862,7 @@ void __init tcp_init(void) alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 13 : 15, 0, &tcp_hashinfo.ehash_size, @@ -2879,7 +2879,7 @@ void __init tcp_init(void) alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_size, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 13 : 15, 0, &tcp_hashinfo.bhash_size, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b37109817a9..7c9ec3dee96 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1245,9 +1245,9 @@ static int nf_conntrack_init_init_net(void) * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ if (!nf_conntrack_htable_size) { nf_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) + = (((totalram_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 16384; if (nf_conntrack_htable_size < 32) nf_conntrack_htable_size = 32; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a6ac83a9334..f01955cce31 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -617,7 +617,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) int cpu; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) + if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 219dcdbe388..dd16e404424 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -194,9 +194,9 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family) if (minfo->cfg.size) size = minfo->cfg.size; else { - size = ((num_physpages << PAGE_SHIFT) / 16384) / + size = ((totalram_pages << PAGE_SHIFT) / 16384) / sizeof(struct list_head); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) size = 8192; if (size < 16) size = 16; @@ -266,9 +266,9 @@ static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family) if (minfo->cfg.size) { size = minfo->cfg.size; } else { - size = (num_physpages << PAGE_SHIFT) / 16384 / + size = (totalram_pages << PAGE_SHIFT) / 16384 / sizeof(struct list_head); - if (num_physpages > 1024 * 1024 * 1024 / PAGE_SIZE) + if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c5aab6a368c..55180b99562 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2091,10 +2091,10 @@ static int __init netlink_proto_init(void) if (!nl_table) goto panic; - if (num_physpages >= (128 * 1024)) - limit = num_physpages >> (21 - PAGE_SHIFT); + if (totalram_pages >= (128 * 1024)) + limit = totalram_pages >> (21 - PAGE_SHIFT); else - limit = num_physpages >> (23 - PAGE_SHIFT); + limit = totalram_pages >> (23 - PAGE_SHIFT); order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; limit = (1UL << order) / sizeof(struct hlist_head); diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c index d9231245a79..bc0019f704f 100644 --- a/net/rxrpc/ar-call.c +++ b/net/rxrpc/ar-call.c @@ -96,7 +96,7 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) } /* - * allocate a new client call and attempt to to get a connection slot for it + * allocate a new client call and attempt to get a connection slot for it */ static struct rxrpc_call *rxrpc_alloc_client_call( struct rxrpc_sock *rx, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 375d64cb1a3..2c5c76be18f 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -77,7 +77,7 @@ * The service curve parameters are converted to the internal * representation. The slope values are scaled to avoid overflow. * the inverse slope values as well as the y-projection of the 1st - * segment are kept in order to to avoid 64-bit divide operations + * segment are kept in order to avoid 64-bit divide operations * that are expensive on 32-bit architectures. */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index c557f1fb1c6..612dc878e05 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1184,10 +1184,10 @@ SCTP_STATIC __init int sctp_init(void) /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. */ - if (num_physpages >= (128 * 1024)) - goal = num_physpages >> (22 - PAGE_SHIFT); + if (totalram_pages >= (128 * 1024)) + goal = totalram_pages >> (22 - PAGE_SHIFT); else - goal = num_physpages >> (24 - PAGE_SHIFT); + goal = totalram_pages >> (24 - PAGE_SHIFT); for (order = 0; (1UL << order) < goal; order++) ; diff --git a/net/socket.c b/net/socket.c index 2a022c00d85..0ad02ae61a9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -285,7 +285,7 @@ static int init_inodecache(void) return 0; } -static struct super_operations sockfs_ops = { +static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, .destroy_inode =sock_destroy_inode, .statfs = simple_statfs, diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 0c431c277af..54a4e042f10 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -385,7 +385,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, EXPORT_SYMBOL_GPL(rpcauth_init_cred); void -rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred) +rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) { task->tk_msg.rpc_cred = get_rpccred(cred); dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, @@ -394,7 +394,7 @@ rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred) EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred); static void -rpcauth_bind_root_cred(struct rpc_task *task) +rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { @@ -405,7 +405,7 @@ rpcauth_bind_root_cred(struct rpc_task *task) dprintk("RPC: %5u looking up %s cred\n", task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); - ret = auth->au_ops->lookup_cred(auth, &acred, 0); + ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); if (!IS_ERR(ret)) task->tk_msg.rpc_cred = ret; else @@ -413,14 +413,14 @@ rpcauth_bind_root_cred(struct rpc_task *task) } static void -rpcauth_bind_new_cred(struct rpc_task *task) +rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct rpc_cred *ret; dprintk("RPC: %5u looking up %s cred\n", task->tk_pid, auth->au_ops->au_name); - ret = rpcauth_lookupcred(auth, 0); + ret = rpcauth_lookupcred(auth, lookupflags); if (!IS_ERR(ret)) task->tk_msg.rpc_cred = ret; else @@ -430,12 +430,16 @@ rpcauth_bind_new_cred(struct rpc_task *task) void rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) { + int lookupflags = 0; + + if (flags & RPC_TASK_ASYNC) + lookupflags |= RPCAUTH_LOOKUP_NEW; if (cred != NULL) - cred->cr_ops->crbind(task, cred); + cred->cr_ops->crbind(task, cred, lookupflags); else if (flags & RPC_TASK_ROOTCREDS) - rpcauth_bind_root_cred(task); + rpcauth_bind_root_cred(task, lookupflags); else - rpcauth_bind_new_cred(task); + rpcauth_bind_new_cred(task, lookupflags); } void diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 4028502f052..bf88bf8e936 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -55,13 +55,13 @@ struct rpc_cred *rpc_lookup_machine_cred(void) EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred); static void -generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred) +generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; struct rpc_cred *ret; - ret = auth->au_ops->lookup_cred(auth, acred, 0); + ret = auth->au_ops->lookup_cred(auth, acred, lookupflags); if (!IS_ERR(ret)) task->tk_msg.rpc_cred = ret; else diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 2e6a148d277..f6c51e562a0 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1374,8 +1374,10 @@ svcauth_gss_release(struct svc_rqst *rqstp) if (stat) goto out_err; break; - default: - goto out_err; + /* + * For any other gc_svc value, svcauth_gss_accept() already set + * the auth_error appropriately; just fall through: + */ } out: diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 45cdaff9b36..d6eee291a0e 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -103,23 +103,21 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); -static void queue_loose(struct cache_detail *detail, struct cache_head *ch); +static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); -static int cache_fresh_locked(struct cache_head *head, time_t expiry) +static void cache_fresh_locked(struct cache_head *head, time_t expiry) { head->expiry_time = expiry; head->last_refresh = get_seconds(); - return !test_and_set_bit(CACHE_VALID, &head->flags); + set_bit(CACHE_VALID, &head->flags); } static void cache_fresh_unlocked(struct cache_head *head, - struct cache_detail *detail, int new) + struct cache_detail *detail) { - if (new) - cache_revisit_request(head); if (test_and_clear_bit(CACHE_PENDING, &head->flags)) { cache_revisit_request(head); - queue_loose(detail, head); + cache_dequeue(detail, head); } } @@ -132,7 +130,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, */ struct cache_head **head; struct cache_head *tmp; - int is_new; if (!test_bit(CACHE_VALID, &old->flags)) { write_lock(&detail->hash_lock); @@ -141,9 +138,9 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); - is_new = cache_fresh_locked(old, new->expiry_time); + cache_fresh_locked(old, new->expiry_time); write_unlock(&detail->hash_lock); - cache_fresh_unlocked(old, detail, is_new); + cache_fresh_unlocked(old, detail); return old; } write_unlock(&detail->hash_lock); @@ -167,11 +164,11 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, *head = tmp; detail->entries++; cache_get(tmp); - is_new = cache_fresh_locked(tmp, new->expiry_time); + cache_fresh_locked(tmp, new->expiry_time); cache_fresh_locked(old, 0); write_unlock(&detail->hash_lock); - cache_fresh_unlocked(tmp, detail, is_new); - cache_fresh_unlocked(old, detail, 0); + cache_fresh_unlocked(tmp, detail); + cache_fresh_unlocked(old, detail); cache_put(old, detail); return tmp; } @@ -184,6 +181,22 @@ static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h) return cd->cache_upcall(cd, h); } +static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h) +{ + if (!test_bit(CACHE_VALID, &h->flags) || + h->expiry_time < get_seconds()) + return -EAGAIN; + else if (detail->flush_time > h->last_refresh) + return -EAGAIN; + else { + /* entry is valid */ + if (test_bit(CACHE_NEGATIVE, &h->flags)) + return -ENOENT; + else + return 0; + } +} + /* * This is the generic cache management routine for all * the authentication caches. @@ -192,8 +205,10 @@ static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h) * * * Returns 0 if the cache_head can be used, or cache_puts it and returns - * -EAGAIN if upcall is pending, - * -ETIMEDOUT if upcall failed and should be retried, + * -EAGAIN if upcall is pending and request has been queued + * -ETIMEDOUT if upcall failed or request could not be queue or + * upcall completed but item is still invalid (implying that + * the cache item has been replaced with a newer one). * -ENOENT if cache entry was negative */ int cache_check(struct cache_detail *detail, @@ -203,17 +218,7 @@ int cache_check(struct cache_detail *detail, long refresh_age, age; /* First decide return status as best we can */ - if (!test_bit(CACHE_VALID, &h->flags) || - h->expiry_time < get_seconds()) - rv = -EAGAIN; - else if (detail->flush_time > h->last_refresh) - rv = -EAGAIN; - else { - /* entry is valid */ - if (test_bit(CACHE_NEGATIVE, &h->flags)) - rv = -ENOENT; - else rv = 0; - } + rv = cache_is_valid(detail, h); /* now see if we want to start an upcall */ refresh_age = (h->expiry_time - h->last_refresh); @@ -229,10 +234,11 @@ int cache_check(struct cache_detail *detail, switch (cache_make_upcall(detail, h)) { case -EINVAL: clear_bit(CACHE_PENDING, &h->flags); + cache_revisit_request(h); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); - cache_fresh_unlocked(h, detail, - cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY)); + cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY); + cache_fresh_unlocked(h, detail); rv = -ENOENT; } break; @@ -245,10 +251,14 @@ int cache_check(struct cache_detail *detail, } } - if (rv == -EAGAIN) - if (cache_defer_req(rqstp, h) != 0) - rv = -ETIMEDOUT; - + if (rv == -EAGAIN) { + if (cache_defer_req(rqstp, h) < 0) { + /* Request is not deferred */ + rv = cache_is_valid(detail, h); + if (rv == -EAGAIN) + rv = -ETIMEDOUT; + } + } if (rv) cache_put(h, detail); return rv; @@ -396,7 +406,7 @@ static int cache_clean(void) ) continue; if (test_and_clear_bit(CACHE_PENDING, &ch->flags)) - queue_loose(current_detail, ch); + cache_dequeue(current_detail, ch); if (atomic_read(&ch->ref.refcount) == 1) break; @@ -412,8 +422,10 @@ static int cache_clean(void) if (!ch) current_index ++; spin_unlock(&cache_list_lock); - if (ch) + if (ch) { + cache_revisit_request(ch); cache_put(ch, d); + } } else spin_unlock(&cache_list_lock); @@ -488,7 +500,7 @@ static int cache_defer_cnt; static int cache_defer_req(struct cache_req *req, struct cache_head *item) { - struct cache_deferred_req *dreq; + struct cache_deferred_req *dreq, *discard; int hash = DFR_HASH(item); if (cache_defer_cnt >= DFR_MAX) { @@ -496,11 +508,11 @@ static int cache_defer_req(struct cache_req *req, struct cache_head *item) * or continue and drop the oldest below */ if (net_random()&1) - return -ETIMEDOUT; + return -ENOMEM; } dreq = req->defer(req); if (dreq == NULL) - return -ETIMEDOUT; + return -ENOMEM; dreq->item = item; @@ -513,23 +525,24 @@ static int cache_defer_req(struct cache_req *req, struct cache_head *item) list_add(&dreq->hash, &cache_defer_hash[hash]); /* it is in, now maybe clean up */ - dreq = NULL; + discard = NULL; if (++cache_defer_cnt > DFR_MAX) { - dreq = list_entry(cache_defer_list.prev, - struct cache_deferred_req, recent); - list_del(&dreq->recent); - list_del(&dreq->hash); + discard = list_entry(cache_defer_list.prev, + struct cache_deferred_req, recent); + list_del_init(&discard->recent); + list_del_init(&discard->hash); cache_defer_cnt--; } spin_unlock(&cache_defer_lock); - if (dreq) { + if (discard) /* there was one too many */ - dreq->revisit(dreq, 1); - } + discard->revisit(discard, 1); + if (!test_bit(CACHE_PENDING, &item->flags)) { /* must have just been validated... */ cache_revisit_request(item); + return -EAGAIN; } return 0; } @@ -551,7 +564,7 @@ static void cache_revisit_request(struct cache_head *item) dreq = list_entry(lp, struct cache_deferred_req, hash); lp = lp->next; if (dreq->item == item) { - list_del(&dreq->hash); + list_del_init(&dreq->hash); list_move(&dreq->recent, &pending); cache_defer_cnt--; } @@ -577,7 +590,7 @@ void cache_clean_deferred(void *owner) list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) { if (dreq->owner == owner) { - list_del(&dreq->hash); + list_del_init(&dreq->hash); list_move(&dreq->recent, &pending); cache_defer_cnt--; } @@ -887,7 +900,7 @@ static int cache_release(struct inode *inode, struct file *filp, -static void queue_loose(struct cache_detail *detail, struct cache_head *ch) +static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) { struct cache_queue *cq; spin_lock(&queue_lock); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index fac0ca93f06..a417d5ab5dd 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -288,6 +288,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) .srcaddr = args->saddress, .dstaddr = args->address, .addrlen = args->addrsize, + .bc_xprt = args->bc_xprt, }; char servername[48]; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 7f676bdf70d..858a443f418 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -930,7 +930,7 @@ void rpc_remove_cache_dir(struct dentry *dentry) /* * populate the filesystem */ -static struct super_operations s_ops = { +static const struct super_operations s_ops = { .alloc_inode = rpc_alloc_inode, .destroy_inode = rpc_destroy_inode, .statfs = simple_statfs, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 8f459abe97c..cef74ba0666 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -21,6 +21,8 @@ #include <linux/sunrpc/clnt.h> +#include "sunrpc.h" + #ifdef RPC_DEBUG #define RPCDBG_FACILITY RPCDBG_SCHED #define RPC_TASK_MAGIC_ID 0xf00baa @@ -711,11 +713,6 @@ static void rpc_async_schedule(struct work_struct *work) __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); } -struct rpc_buffer { - size_t len; - char data[]; -}; - /** * rpc_malloc - allocate an RPC buffer * @task: RPC task that will use this buffer diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index 5d9dd742264..90c292e2738 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -27,11 +27,25 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _NET_SUNRPC_SUNRPC_H #define _NET_SUNRPC_SUNRPC_H +#include <linux/net.h> + +/* + * Header for dynamically allocated rpc buffers. + */ +struct rpc_buffer { + size_t len; + char data[]; +}; + static inline int rpc_reply_expected(struct rpc_task *task) { return (task->tk_msg.rpc_proc != NULL) && (task->tk_msg.rpc_proc->p_decode != NULL); } +int svc_send_common(struct socket *sock, struct xdr_buf *xdr, + struct page *headpage, unsigned long headoffset, + struct page *tailpage, unsigned long tailoffset); + #endif /* _NET_SUNRPC_SUNRPC_H */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 27d44332f01..df124f78ee4 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -160,6 +160,7 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); set_bit(XPT_BUSY, &xprt->xpt_flags); + rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); } EXPORT_SYMBOL_GPL(svc_xprt_init); @@ -710,10 +711,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) spin_unlock_bh(&pool->sp_lock); len = 0; - if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { - dprintk("svc_recv: found XPT_CLOSE\n"); - svc_delete_xprt(xprt); - } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { struct svc_xprt *newxpt; newxpt = xprt->xpt_ops->xpo_accept(xprt); if (newxpt) { @@ -739,7 +737,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) svc_xprt_received(newxpt); } svc_xprt_received(xprt); - } else { + } else if (!test_bit(XPT_CLOSE, &xprt->xpt_flags)) { dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", rqstp, pool->sp_id, xprt, atomic_read(&xprt->xpt_ref.refcount)); @@ -752,6 +750,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) dprintk("svc: got len=%d\n", len); } + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { + dprintk("svc_recv: found XPT_CLOSE\n"); + svc_delete_xprt(xprt); + } + /* No data, incomplete (TCP) read, or accept() */ if (len == 0 || len == -EAGAIN) { rqstp->rq_res.len = 0; @@ -808,6 +811,7 @@ int svc_send(struct svc_rqst *rqstp) else len = xprt->xpt_ops->xpo_sendto(rqstp); mutex_unlock(&xprt->xpt_mutex); + rpc_wake_up(&xprt->xpt_bc_pending); svc_xprt_release(rqstp); if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) @@ -1166,11 +1170,6 @@ static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); - lock_kernel(); - /* bump up the pseudo refcount while traversing */ - svc_get(serv); - unlock_kernel(); - if (!pidx) return SEQ_START_TOKEN; return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]); @@ -1198,12 +1197,6 @@ static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) static void svc_pool_stats_stop(struct seq_file *m, void *p) { - struct svc_serv *serv = m->private; - - lock_kernel(); - /* this function really, really should have been called svc_put() */ - svc_destroy(serv); - unlock_kernel(); } static int svc_pool_stats_show(struct seq_file *m, void *p) diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 6caffa34ac0..117f68a8aa4 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -668,6 +668,7 @@ static int unix_gid_find(uid_t uid, struct group_info **gip, case 0: *gip = ug->gi; get_group_info(*gip); + cache_put(&ug->h, &unix_gid_cache); return 0; default: return -EAGAIN; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 23128ee191a..ccc5e83cae5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -49,6 +49,7 @@ #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/stats.h> +#include <linux/sunrpc/xprt.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -153,49 +154,27 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) } /* - * Generic sendto routine + * send routine intended to be shared by the fore- and back-channel */ -static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +int svc_send_common(struct socket *sock, struct xdr_buf *xdr, + struct page *headpage, unsigned long headoffset, + struct page *tailpage, unsigned long tailoffset) { - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - struct socket *sock = svsk->sk_sock; - int slen; - union { - struct cmsghdr hdr; - long all[SVC_PKTINFO_SPACE / sizeof(long)]; - } buffer; - struct cmsghdr *cmh = &buffer.hdr; - int len = 0; int result; int size; struct page **ppage = xdr->pages; size_t base = xdr->page_base; unsigned int pglen = xdr->page_len; unsigned int flags = MSG_MORE; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + int slen; + int len = 0; slen = xdr->len; - if (rqstp->rq_prot == IPPROTO_UDP) { - struct msghdr msg = { - .msg_name = &rqstp->rq_addr, - .msg_namelen = rqstp->rq_addrlen, - .msg_control = cmh, - .msg_controllen = sizeof(buffer), - .msg_flags = MSG_MORE, - }; - - svc_set_cmsg_data(rqstp, cmh); - - if (sock_sendmsg(sock, &msg, 0) < 0) - goto out; - } - /* send head */ if (slen == xdr->head[0].iov_len) flags = 0; - len = kernel_sendpage(sock, rqstp->rq_respages[0], 0, + len = kernel_sendpage(sock, headpage, headoffset, xdr->head[0].iov_len, flags); if (len != xdr->head[0].iov_len) goto out; @@ -219,16 +198,58 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) base = 0; ppage++; } + /* send tail */ if (xdr->tail[0].iov_len) { - result = kernel_sendpage(sock, rqstp->rq_respages[0], - ((unsigned long)xdr->tail[0].iov_base) - & (PAGE_SIZE-1), - xdr->tail[0].iov_len, 0); - + result = kernel_sendpage(sock, tailpage, tailoffset, + xdr->tail[0].iov_len, 0); if (result > 0) len += result; } + +out: + return len; +} + + +/* + * Generic sendto routine + */ +static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +{ + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct socket *sock = svsk->sk_sock; + union { + struct cmsghdr hdr; + long all[SVC_PKTINFO_SPACE / sizeof(long)]; + } buffer; + struct cmsghdr *cmh = &buffer.hdr; + int len = 0; + unsigned long tailoff; + unsigned long headoff; + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + + if (rqstp->rq_prot == IPPROTO_UDP) { + struct msghdr msg = { + .msg_name = &rqstp->rq_addr, + .msg_namelen = rqstp->rq_addrlen, + .msg_control = cmh, + .msg_controllen = sizeof(buffer), + .msg_flags = MSG_MORE, + }; + + svc_set_cmsg_data(rqstp, cmh); + + if (sock_sendmsg(sock, &msg, 0) < 0) + goto out; + } + + tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1); + headoff = 0; + len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff, + rqstp->rq_respages[0], tailoff); + out: dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, @@ -432,29 +453,49 @@ static void svc_tcp_write_space(struct sock *sk) } /* + * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo + */ +static int svc_udp_get_dest_address4(struct svc_rqst *rqstp, + struct cmsghdr *cmh) +{ + struct in_pktinfo *pki = CMSG_DATA(cmh); + if (cmh->cmsg_type != IP_PKTINFO) + return 0; + rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; + return 1; +} + +/* + * See net/ipv6/datagram.c : datagram_recv_ctl + */ +static int svc_udp_get_dest_address6(struct svc_rqst *rqstp, + struct cmsghdr *cmh) +{ + struct in6_pktinfo *pki = CMSG_DATA(cmh); + if (cmh->cmsg_type != IPV6_PKTINFO) + return 0; + ipv6_addr_copy(&rqstp->rq_daddr.addr6, &pki->ipi6_addr); + return 1; +} + +/* * Copy the UDP datagram's destination address to the rqstp structure. * The 'destination' address in this case is the address to which the * peer sent the datagram, i.e. our local address. For multihomed * hosts, this can change from msg to msg. Note that only the IP * address changes, the port number should remain the same. */ -static void svc_udp_get_dest_address(struct svc_rqst *rqstp, - struct cmsghdr *cmh) +static int svc_udp_get_dest_address(struct svc_rqst *rqstp, + struct cmsghdr *cmh) { - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - switch (svsk->sk_sk->sk_family) { - case AF_INET: { - struct in_pktinfo *pki = CMSG_DATA(cmh); - rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; - break; - } - case AF_INET6: { - struct in6_pktinfo *pki = CMSG_DATA(cmh); - ipv6_addr_copy(&rqstp->rq_daddr.addr6, &pki->ipi6_addr); - break; - } + switch (cmh->cmsg_level) { + case SOL_IP: + return svc_udp_get_dest_address4(rqstp, cmh); + case SOL_IPV6: + return svc_udp_get_dest_address6(rqstp, cmh); } + + return 0; } /* @@ -531,16 +572,15 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_prot = IPPROTO_UDP; - if (cmh->cmsg_level != IPPROTO_IP || - cmh->cmsg_type != IP_PKTINFO) { + if (!svc_udp_get_dest_address(rqstp, cmh)) { if (net_ratelimit()) - printk("rpcsvc: received unknown control message:" - "%d/%d\n", - cmh->cmsg_level, cmh->cmsg_type); + printk(KERN_WARNING + "svc: received unknown control message %d/%d; " + "dropping RPC reply datagram\n", + cmh->cmsg_level, cmh->cmsg_type); skb_free_datagram(svsk->sk_sk, skb); return 0; } - svc_udp_get_dest_address(rqstp, cmh); if (skb_is_nonlinear(skb)) { /* we have to copy */ @@ -651,8 +691,7 @@ static struct svc_xprt_class svc_udp_class = { static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { - int one = 1; - mm_segment_t oldfs; + int err, level, optname, one = 1; svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); @@ -671,12 +710,22 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); - oldfs = get_fs(); - set_fs(KERNEL_DS); /* make sure we get destination address info */ - svsk->sk_sock->ops->setsockopt(svsk->sk_sock, IPPROTO_IP, IP_PKTINFO, - (char __user *)&one, sizeof(one)); - set_fs(oldfs); + switch (svsk->sk_sk->sk_family) { + case AF_INET: + level = SOL_IP; + optname = IP_PKTINFO; + break; + case AF_INET6: + level = SOL_IPV6; + optname = IPV6_RECVPKTINFO; + break; + default: + BUG(); + } + err = kernel_setsockopt(svsk->sk_sock, level, optname, + (char *)&one, sizeof(one)); + dprintk("svc: kernel_setsockopt returned %d\n", err); } /* @@ -826,21 +875,15 @@ failed: } /* - * Receive data from a TCP socket. + * Receive data. + * If we haven't gotten the record length yet, get the next four bytes. + * Otherwise try to gobble up as much as possible up to the complete + * record length. */ -static int svc_tcp_recvfrom(struct svc_rqst *rqstp) +static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) { - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int len; - struct kvec *vec; - int pnum, vlen; - - dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); + int len; if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* sndbuf needs to have room for one request @@ -861,10 +904,6 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - /* Receive data. If we haven't got the record length yet, get - * the next four bytes. Otherwise try to gobble up as much as - * possible up to the complete record length. - */ if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { int want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; struct kvec iov; @@ -879,7 +918,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) dprintk("svc: short recvfrom while reading record " "length (%d of %d)\n", len, want); svc_xprt_received(&svsk->sk_xprt); - return -EAGAIN; /* record header not complete */ + goto err_again; /* record header not complete */ } svsk->sk_reclen = ntohl(svsk->sk_reclen); @@ -894,6 +933,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) "per record not supported\n"); goto err_delete; } + svsk->sk_reclen &= RPC_FRAGMENT_SIZE_MASK; dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); if (svsk->sk_reclen > serv->sv_max_mesg) { @@ -914,17 +954,121 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) dprintk("svc: incomplete TCP record (%d of %d)\n", len, svsk->sk_reclen); svc_xprt_received(&svsk->sk_xprt); - return -EAGAIN; /* record not complete */ + goto err_again; /* record not complete */ } len = svsk->sk_reclen; set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + return len; + error: + if (len == -EAGAIN) { + dprintk("RPC: TCP recv_record got EAGAIN\n"); + svc_xprt_received(&svsk->sk_xprt); + } + return len; + err_delete: + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + err_again: + return -EAGAIN; +} + +static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp, + struct rpc_rqst **reqpp, struct kvec *vec) +{ + struct rpc_rqst *req = NULL; + u32 *p; + u32 xid; + u32 calldir; + int len; + + len = svc_recvfrom(rqstp, vec, 1, 8); + if (len < 0) + goto error; + + p = (u32 *)rqstp->rq_arg.head[0].iov_base; + xid = *p++; + calldir = *p; + + if (calldir == 0) { + /* REQUEST is the most common case */ + vec[0] = rqstp->rq_arg.head[0]; + } else { + /* REPLY */ + if (svsk->sk_bc_xprt) + req = xprt_lookup_rqst(svsk->sk_bc_xprt, xid); + + if (!req) { + printk(KERN_NOTICE + "%s: Got unrecognized reply: " + "calldir 0x%x sk_bc_xprt %p xid %08x\n", + __func__, ntohl(calldir), + svsk->sk_bc_xprt, xid); + vec[0] = rqstp->rq_arg.head[0]; + goto out; + } + + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(struct xdr_buf)); + /* copy the xid and call direction */ + memcpy(req->rq_private_buf.head[0].iov_base, + rqstp->rq_arg.head[0].iov_base, 8); + vec[0] = req->rq_private_buf.head[0]; + } + out: + vec[0].iov_base += 8; + vec[0].iov_len -= 8; + len = svsk->sk_reclen - 8; + error: + *reqpp = req; + return len; +} + +/* + * Receive data from a TCP socket. + */ +static int svc_tcp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int len; + struct kvec *vec; + int pnum, vlen; + struct rpc_rqst *req = NULL; + + dprintk("svc: tcp_recv %p data %d conn %d close %d\n", + svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); + + len = svc_tcp_recv_record(svsk, rqstp); + if (len < 0) + goto error; + vec = rqstp->rq_vec; vec[0] = rqstp->rq_arg.head[0]; vlen = PAGE_SIZE; + + /* + * We have enough data for the whole tcp record. Let's try and read the + * first 8 bytes to get the xid and the call direction. We can use this + * to figure out if this is a call or a reply to a callback. If + * sk_reclen is < 8 (xid and calldir), then this is a malformed packet. + * In that case, don't bother with the calldir and just read the data. + * It will be rejected in svc_process. + */ + if (len >= 8) { + len = svc_process_calldir(svsk, rqstp, &req, vec); + if (len < 0) + goto err_again; + vlen -= 8; + } + pnum = 1; while (vlen < len) { - vec[pnum].iov_base = page_address(rqstp->rq_pages[pnum]); + vec[pnum].iov_base = (req) ? + page_address(req->rq_private_buf.pages[pnum - 1]) : + page_address(rqstp->rq_pages[pnum]); vec[pnum].iov_len = PAGE_SIZE; pnum++; vlen += PAGE_SIZE; @@ -934,8 +1078,18 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) /* Now receive data */ len = svc_recvfrom(rqstp, vec, pnum, len); if (len < 0) - goto error; + goto err_again; + /* + * Account for the 8 bytes we read earlier + */ + len += 8; + + if (req) { + xprt_complete_rqst(req->rq_task, len); + len = 0; + goto out; + } dprintk("svc: TCP complete record (%d bytes)\n", len); rqstp->rq_arg.len = len; rqstp->rq_arg.page_base = 0; @@ -949,6 +1103,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; +out: /* Reset TCP read info */ svsk->sk_reclen = 0; svsk->sk_tcplen = 0; @@ -960,21 +1115,19 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) return len; - err_delete: - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - return -EAGAIN; - - error: +err_again: if (len == -EAGAIN) { dprintk("RPC: TCP recvfrom got EAGAIN\n"); svc_xprt_received(&svsk->sk_xprt); - } else { + return len; + } +error: + if (len != -EAGAIN) { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", svsk->sk_xprt.xpt_server->sv_name, -len); - goto err_delete; + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); } - - return len; + return -EAGAIN; } /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f412a852bc7..fd46d42afa8 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -832,6 +832,11 @@ static void xprt_timer(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); } +static inline int xprt_has_timer(struct rpc_xprt *xprt) +{ + return xprt->idle_timeout != 0; +} + /** * xprt_prepare_transmit - reserve the transport before sending a request * @task: RPC task about to send a request @@ -1013,7 +1018,7 @@ void xprt_release(struct rpc_task *task) if (!list_empty(&req->rq_list)) list_del(&req->rq_list); xprt->last_used = jiffies; - if (list_empty(&xprt->recv)) + if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); @@ -1082,8 +1087,11 @@ found: #endif /* CONFIG_NFS_V4_1 */ INIT_WORK(&xprt->task_cleanup, xprt_autoclose); - setup_timer(&xprt->timer, xprt_init_autodisconnect, - (unsigned long)xprt); + if (xprt_has_timer(xprt)) + setup_timer(&xprt->timer, xprt_init_autodisconnect, + (unsigned long)xprt); + else + init_timer(&xprt->timer); xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; xprt->bind_index = 0; @@ -1102,7 +1110,6 @@ found: dprintk("RPC: created transport %p with %u slots\n", xprt, xprt->max_reqs); - return xprt; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 5151f9f6c57..0cf5e8c27a1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -730,12 +730,12 @@ static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) goto err; mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); - if (!mr) + if (IS_ERR(mr)) goto err_free_frmr; pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, RPCSVC_MAXPAGES); - if (!pl) + if (IS_ERR(pl)) goto err_free_mr; frmr->mr = mr; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 62438f3a914..bee41546575 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -32,6 +32,7 @@ #include <linux/tcp.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> +#include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprtsock.h> #include <linux/file.h> #ifdef CONFIG_NFS_V4_1 @@ -43,6 +44,7 @@ #include <net/udp.h> #include <net/tcp.h> +#include "sunrpc.h" /* * xprtsock tunables */ @@ -2098,6 +2100,134 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) xprt->stat.bklog_u); } +/* + * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason + * we allocate pages instead doing a kmalloc like rpc_malloc is because we want + * to use the server side send routines. + */ +void *bc_malloc(struct rpc_task *task, size_t size) +{ + struct page *page; + struct rpc_buffer *buf; + + BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer)); + page = alloc_page(GFP_KERNEL); + + if (!page) + return NULL; + + buf = page_address(page); + buf->len = PAGE_SIZE; + + return buf->data; +} + +/* + * Free the space allocated in the bc_alloc routine + */ +void bc_free(void *buffer) +{ + struct rpc_buffer *buf; + + if (!buffer) + return; + + buf = container_of(buffer, struct rpc_buffer, data); + free_page((unsigned long)buf); +} + +/* + * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex + * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request. + */ +static int bc_sendto(struct rpc_rqst *req) +{ + int len; + struct xdr_buf *xbufp = &req->rq_snd_buf; + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct socket *sock = transport->sock; + unsigned long headoff; + unsigned long tailoff; + + /* + * Set up the rpc header and record marker stuff + */ + xs_encode_tcp_record_marker(xbufp); + + tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK; + headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK; + len = svc_send_common(sock, xbufp, + virt_to_page(xbufp->head[0].iov_base), headoff, + xbufp->tail[0].iov_base, tailoff); + + if (len != xbufp->len) { + printk(KERN_NOTICE "Error sending entire callback!\n"); + len = -EAGAIN; + } + + return len; +} + +/* + * The send routine. Borrows from svc_send + */ +static int bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct svc_xprt *xprt; + struct svc_sock *svsk; + u32 len; + + dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid)); + /* + * Get the server socket associated with this callback xprt + */ + xprt = req->rq_xprt->bc_xprt; + svsk = container_of(xprt, struct svc_sock, sk_xprt); + + /* + * Grab the mutex to serialize data as the connection is shared + * with the fore channel + */ + if (!mutex_trylock(&xprt->xpt_mutex)) { + rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&xprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task); + } + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + len = -ENOTCONN; + else + len = bc_sendto(req); + mutex_unlock(&xprt->xpt_mutex); + + if (len > 0) + len = 0; + + return len; +} + +/* + * The close routine. Since this is client initiated, we do nothing + */ + +static void bc_close(struct rpc_xprt *xprt) +{ + return; +} + +/* + * The xprt destroy routine. Again, because this connection is client + * initiated, we do nothing + */ + +static void bc_destroy(struct rpc_xprt *xprt) +{ + return; +} + static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, @@ -2134,6 +2264,22 @@ static struct rpc_xprt_ops xs_tcp_ops = { .print_stats = xs_tcp_print_stats, }; +/* + * The rpc_xprt_ops for the server backchannel + */ + +static struct rpc_xprt_ops bc_tcp_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xprt_release_xprt, + .buf_alloc = bc_malloc, + .buf_free = bc_free, + .send_request = bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = bc_close, + .destroy = bc_destroy, + .print_stats = xs_tcp_print_stats, +}; + static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, unsigned int slot_table_size) { @@ -2322,11 +2468,93 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) return ERR_PTR(-EINVAL); } +/** + * xs_setup_bc_tcp - Set up transport to use a TCP backchannel socket + * @args: rpc transport creation arguments + * + */ +static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) +{ + struct sockaddr *addr = args->dstaddr; + struct rpc_xprt *xprt; + struct sock_xprt *transport; + struct svc_sock *bc_sock; + + if (!args->bc_xprt) + ERR_PTR(-EINVAL); + + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; + transport = container_of(xprt, struct sock_xprt, xprt); + + xprt->prot = IPPROTO_TCP; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + xprt->timeout = &xs_tcp_default_timeout; + + /* backchannel */ + xprt_set_bound(xprt); + xprt->bind_timeout = 0; + xprt->connect_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; + + /* + * The backchannel uses the same socket connection as the + * forechannel + */ + xprt->bc_xprt = args->bc_xprt; + bc_sock = container_of(args->bc_xprt, struct svc_sock, sk_xprt); + bc_sock->sk_bc_xprt = xprt; + transport->sock = bc_sock->sk_sock; + transport->inet = bc_sock->sk_sk; + + xprt->ops = &bc_tcp_ops; + + switch (addr->sa_family) { + case AF_INET: + xs_format_peer_addresses(xprt, "tcp", + RPCBIND_NETID_TCP); + break; + case AF_INET6: + xs_format_peer_addresses(xprt, "tcp", + RPCBIND_NETID_TCP6); + break; + default: + kfree(xprt); + return ERR_PTR(-EAFNOSUPPORT); + } + + if (xprt_bound(xprt)) + dprintk("RPC: set up xprt to %s (port %s) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + xprt->address_strings[RPC_DISPLAY_PROTO]); + else + dprintk("RPC: set up xprt to %s (autobind) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PROTO]); + + /* + * Since we don't want connections for the backchannel, we set + * the xprt status to connected + */ + xprt_set_connected(xprt); + + + if (try_module_get(THIS_MODULE)) + return xprt; + kfree(xprt->slot); + kfree(xprt); + return ERR_PTR(-EINVAL); +} + static struct xprt_class xs_udp_transport = { .list = LIST_HEAD_INIT(xs_udp_transport.list), .name = "udp", .owner = THIS_MODULE, - .ident = IPPROTO_UDP, + .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, }; @@ -2334,10 +2562,18 @@ static struct xprt_class xs_tcp_transport = { .list = LIST_HEAD_INIT(xs_tcp_transport.list), .name = "tcp", .owner = THIS_MODULE, - .ident = IPPROTO_TCP, + .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, }; +static struct xprt_class xs_bc_tcp_transport = { + .list = LIST_HEAD_INIT(xs_bc_tcp_transport.list), + .name = "tcp NFSv4.1 backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_TCP, + .setup = xs_setup_bc_tcp, +}; + /** * init_socket_xprt - set up xprtsock's sysctls, register with RPC client * @@ -2351,6 +2587,7 @@ int init_socket_xprt(void) xprt_register_transport(&xs_udp_transport); xprt_register_transport(&xs_tcp_transport); + xprt_register_transport(&xs_bc_tcp_transport); return 0; } @@ -2370,6 +2607,7 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_udp_transport); xprt_unregister_transport(&xs_tcp_transport); + xprt_unregister_transport(&xs_bc_tcp_transport); } static int param_set_uint_minmax(const char *val, struct kernel_param *kp, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 429dd06a4ec..561a45cf2a6 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -834,7 +834,7 @@ int cfg80211_wext_siwtxpower(struct net_device *dev, return 0; } - return rdev->ops->set_tx_power(wdev->wiphy, type, dbm);; + return rdev->ops->set_tx_power(wdev->wiphy, type, dbm); } EXPORT_SYMBOL_GPL(cfg80211_wext_siwtxpower); diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c index 72c15205bb2..8ab44861168 100644 --- a/scripts/basic/fixdep.c +++ b/scripts/basic/fixdep.c @@ -16,8 +16,7 @@ * tells make when to remake a file. * * To use this list as-is however has the drawback that virtually - * every file in the kernel includes <linux/config.h> which then again - * includes <linux/autoconf.h> + * every file in the kernel includes <linux/autoconf.h>. * * If the user re-runs make *config, linux/autoconf.h will be * regenerated. make notices that and will rebuild every file which @@ -126,7 +125,6 @@ char *depfile; char *cmdline; void usage(void) - { fprintf(stderr, "Usage: fixdep <depfile> <target> <cmdline>\n"); exit(1); diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2d5ece798c4..87bbb8bce9b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -10,7 +10,7 @@ use strict; my $P = $0; $P =~ s@.*/@@g; -my $V = '0.28'; +my $V = '0.29'; use Getopt::Long qw(:config no_auto_abbrev); @@ -28,6 +28,41 @@ my $mailback = 0; my $summary_file = 0; my $root; my %debug; +my $help = 0; + +sub help { + my ($exitcode) = @_; + + print << "EOM"; +Usage: $P [OPTION]... [FILE]... +Version: $V + +Options: + -q, --quiet quiet + --no-tree run without a kernel tree + --no-signoff do not check for 'Signed-off-by' line + --patch treat FILE as patchfile (default) + --emacs emacs compile window format + --terse one line per report + -f, --file treat FILE as regular source file + --subjective, --strict enable more subjective tests + --root=PATH PATH to the kernel tree root + --no-summary suppress the per-file summary + --mailback only produce a report in case of warnings/errors + --summary-file include the filename in summary + --debug KEY=[0|1] turn on/off debugging of KEY, where KEY is one of + 'values', 'possible', 'type', and 'attr' (default + is all off) + --test-only=WORD report only warnings/errors containing WORD + literally + -h, --help, --version display this help and exit + +When FILE is - read standard input. +EOM + + exit($exitcode); +} + GetOptions( 'q|quiet+' => \$quiet, 'tree!' => \$tree, @@ -35,7 +70,7 @@ GetOptions( 'patch!' => \$chk_patch, 'emacs!' => \$emacs, 'terse!' => \$terse, - 'file!' => \$file, + 'f|file!' => \$file, 'subjective!' => \$check, 'strict!' => \$check, 'root=s' => \$root, @@ -45,22 +80,16 @@ GetOptions( 'debug=s' => \%debug, 'test-only=s' => \$tst_only, -) or exit; + 'h|help' => \$help, + 'version' => \$help +) or help(1); + +help(0) if ($help); my $exit = 0; if ($#ARGV < 0) { - print "usage: $P [options] patchfile\n"; - print "version: $V\n"; - print "options: -q => quiet\n"; - print " --no-tree => run without a kernel tree\n"; - print " --terse => one line per report\n"; - print " --emacs => emacs compile window format\n"; - print " --file => check a source file\n"; - print " --strict => enable more subjective tests\n"; - print " --root => path to the kernel tree root\n"; - print " --no-summary => suppress the per-file summary\n"; - print " --summary-file => include the filename in summary\n"; + print "$P: no input files\n"; exit(1); } @@ -153,7 +182,7 @@ our $UTF8 = qr { }x; our $typeTypedefs = qr{(?x: - (?:__)?(?:u|s|be|le)(?:\d|\d\d)| + (?:__)?(?:u|s|be|le)(?:8|16|32|64)| atomic_t )}; @@ -356,6 +385,13 @@ sub sanitise_line { $off++; next; } + if ($sanitise_quote eq '' && substr($line, $off, 2) eq '//') { + $sanitise_quote = '//'; + + substr($res, $off, 2, $sanitise_quote); + $off++; + next; + } # A \ in a string means ignore the next character. if (($sanitise_quote eq "'" || $sanitise_quote eq '"') && @@ -379,6 +415,8 @@ sub sanitise_line { #print "c<$c> SQ<$sanitise_quote>\n"; if ($off != 0 && $sanitise_quote eq '*/' && $c ne "\t") { substr($res, $off, 1, $;); + } elsif ($off != 0 && $sanitise_quote eq '//' && $c ne "\t") { + substr($res, $off, 1, $;); } elsif ($off != 0 && $sanitise_quote && $c ne "\t") { substr($res, $off, 1, 'X'); } else { @@ -386,6 +424,10 @@ sub sanitise_line { } } + if ($sanitise_quote eq '//') { + $sanitise_quote = ''; + } + # The pathname on a #include may be surrounded by '<' and '>'. if ($res =~ /^.\s*\#\s*include\s+\<(.*)\>/) { my $clean = 'X' x length($1); @@ -1336,6 +1378,18 @@ sub process { WARN("adding a line without newline at end of file\n" . $herecurr); } +# Blackfin: use hi/lo macros + if ($realfile =~ m@arch/blackfin/.*\.S$@) { + if ($line =~ /\.[lL][[:space:]]*=.*&[[:space:]]*0x[fF][fF][fF][fF]/) { + my $herevet = "$here\n" . cat_vet($line) . "\n"; + ERROR("use the LO() macro, not (... & 0xFFFF)\n" . $herevet); + } + if ($line =~ /\.[hH][[:space:]]*=.*>>[[:space:]]*16/) { + my $herevet = "$here\n" . cat_vet($line) . "\n"; + ERROR("use the HI() macro, not (... >> 16)\n" . $herevet); + } + } + # check we are in a valid source file C or perl if not then ignore this hunk next if ($realfile !~ /\.(h|c|pl)$/); @@ -1355,6 +1409,16 @@ sub process { WARN("CVS style keyword markers, these will _not_ be updated\n". $herecurr); } +# Blackfin: don't use __builtin_bfin_[cs]sync + if ($line =~ /__builtin_bfin_csync/) { + my $herevet = "$here\n" . cat_vet($line) . "\n"; + ERROR("use the CSYNC() macro in asm/blackfin.h\n" . $herevet); + } + if ($line =~ /__builtin_bfin_ssync/) { + my $herevet = "$here\n" . cat_vet($line) . "\n"; + ERROR("use the SSYNC() macro in asm/blackfin.h\n" . $herevet); + } + # Check for potential 'bare' types my ($stat, $cond, $line_nr_next, $remain_next, $off_next); if ($realcnt && $line =~ /.\s*\S/) { @@ -1372,6 +1436,8 @@ sub process { # Ignore functions being called } elsif ($s =~ /^.\s*$Ident\s*\(/s) { + } elsif ($s =~ /^.\s*else\b/s) { + # declarations always start with types } elsif ($prev_values eq 'E' && $s =~ /^.\s*(?:$Storage\s+)?(?:$Inline\s+)?(?:const\s+)?((?:\s*$Ident)+?)\b(?:\s+$Sparse)?\s*\**\s*(?:$Ident|\(\*[^\)]*\))(?:\s*$Modifier)?\s*(?:;|=|,|\()/s) { my $type = $1; @@ -1532,8 +1598,9 @@ sub process { $s =~ /^\s*#\s*?/ || $s =~ /^\s*$Ident\s*:/) { $continuation = ($s =~ /^.*?\\\n/) ? 1 : 0; - $s =~ s/^.*?\n//; - $cond_lines++; + if ($s =~ s/^.*?\n//) { + $cond_lines++; + } } } @@ -1891,7 +1958,7 @@ sub process { # A unary '*' may be const } elsif ($ctx =~ /.xW/) { - ERROR("Aspace prohibited after that '$op' $at\n" . $hereptr); + ERROR("space prohibited after that '$op' $at\n" . $hereptr); } # unary ++ and unary -- are allowed no space on one side. @@ -2243,7 +2310,8 @@ sub process { DECLARE_PER_CPU| DEFINE_PER_CPU| __typeof__\(| - \.$Ident\s*=\s* + \.$Ident\s*=\s*| + ^\"|\"$ }x; #print "REST<$rest> dstat<$dstat>\n"; if ($rest ne '') { diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 278a45bd45a..cdb44b63342 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -13,7 +13,7 @@ use strict; my $P = $0; -my $V = '0.17'; +my $V = '0.20'; use Getopt::Long qw(:config no_auto_abbrev); @@ -29,6 +29,8 @@ my $email_git_min_signatures = 1; my $email_git_max_maintainers = 5; my $email_git_min_percent = 5; my $email_git_since = "1-year-ago"; +my $email_git_blame = 0; +my $email_remove_duplicates = 1; my $output_multiline = 1; my $output_separator = ", "; my $scm = 0; @@ -36,6 +38,7 @@ my $web = 0; my $subsystem = 0; my $status = 0; my $from_filename = 0; +my $pattern_depth = 0; my $version = 0; my $help = 0; @@ -68,6 +71,8 @@ if (!GetOptions( 'git-max-maintainers=i' => \$email_git_max_maintainers, 'git-min-percent=i' => \$email_git_min_percent, 'git-since=s' => \$email_git_since, + 'git-blame!' => \$email_git_blame, + 'remove-duplicates!' => \$email_remove_duplicates, 'm!' => \$email_maintainer, 'n!' => \$email_usename, 'l!' => \$email_list, @@ -78,6 +83,7 @@ if (!GetOptions( 'status!' => \$status, 'scm!' => \$scm, 'web!' => \$web, + 'pattern-depth=i' => \$pattern_depth, 'f|file' => \$from_filename, 'v|version' => \$version, 'h|help' => \$help, @@ -101,14 +107,19 @@ if ($#ARGV < 0) { die "$P: argument missing: patchfile or -f file please\n"; } +if ($output_separator ne ", ") { + $output_multiline = 0; +} + my $selections = $email + $scm + $status + $subsystem + $web; if ($selections == 0) { usage(); die "$P: Missing required option: email, scm, status, subsystem or web\n"; } -if ($email && ($email_maintainer + $email_list + $email_subscriber_list - + $email_git + $email_git_penguin_chiefs) == 0) { +if ($email && + ($email_maintainer + $email_list + $email_subscriber_list + + $email_git + $email_git_penguin_chiefs + $email_git_blame) == 0) { usage(); die "$P: Please select at least 1 email option\n"; } @@ -147,9 +158,36 @@ while (<MAINT>) { } close(MAINT); +my %mailmap; + +if ($email_remove_duplicates) { + open(MAILMAP, "<${lk_path}.mailmap") || warn "$P: Can't open .mailmap\n"; + while (<MAILMAP>) { + my $line = $_; + + next if ($line =~ m/^\s*#/); + next if ($line =~ m/^\s*$/); + + my ($name, $address) = parse_email($line); + $line = format_email($name, $address); + + next if ($line =~ m/^\s*$/); + + if (exists($mailmap{$name})) { + my $obj = $mailmap{$name}; + push(@$obj, $address); + } else { + my @arr = ($address); + $mailmap{$name} = \@arr; + } + } + close(MAILMAP); +} + ## use the filenames on the command line or find the filenames in the patchfiles my @files = (); +my @range = (); foreach my $file (@ARGV) { ##if $file is a directory and it lacks a trailing slash, add one @@ -162,13 +200,19 @@ foreach my $file (@ARGV) { push(@files, $file); } else { my $file_cnt = @files; + my $lastfile; open(PATCH, "<$file") or die "$P: Can't open ${file}\n"; while (<PATCH>) { if (m/^\+\+\+\s+(\S+)/) { my $filename = $1; $filename =~ s@^[^/]*/@@; $filename =~ s@\n@@; + $lastfile = $filename; push(@files, $filename); + } elsif (m/^\@\@ -(\d+),(\d+)/) { + if ($email_git_blame) { + push(@range, "$lastfile:$1:$2"); + } } } close(PATCH); @@ -201,6 +245,7 @@ foreach my $file (@files) { if ($type eq 'X') { if (file_match_pattern($file, $value)) { $exclude = 1; + last; } } } @@ -208,35 +253,45 @@ foreach my $file (@files) { if (!$exclude) { my $tvi = 0; + my %hash; foreach my $line (@typevalue) { if ($line =~ m/^(\C):\s*(.*)/) { my $type = $1; my $value = $2; if ($type eq 'F') { if (file_match_pattern($file, $value)) { - add_categories($tvi); + my $value_pd = ($value =~ tr@/@@); + my $file_pd = ($file =~ tr@/@@); + $value_pd++ if (substr($value,-1,1) ne "/"); + if ($pattern_depth == 0 || + (($file_pd - $value_pd) < $pattern_depth)) { + $hash{$tvi} = $value_pd; + } } } } $tvi++; } + foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { + add_categories($line); + } } if ($email && $email_git) { recent_git_signoffs($file); } + if ($email && $email_git_blame) { + git_assign_blame($file); + } } if ($email) { foreach my $chief (@penguin_chief) { if ($chief =~ m/^(.*):(.*)/) { my $email_address; - if ($email_usename) { - $email_address = format_email($1, $2); - } else { - $email_address = $2; - } + + $email_address = format_email($1, $2); if ($email_git_penguin_chiefs) { push(@email_to, $email_address); } else { @@ -258,22 +313,22 @@ if ($email || $email_list) { } if ($scm) { - @scm = sort_and_uniq(@scm); + @scm = uniq(@scm); output(@scm); } if ($status) { - @status = sort_and_uniq(@status); + @status = uniq(@status); output(@status); } if ($subsystem) { - @subsystem = sort_and_uniq(@subsystem); + @subsystem = uniq(@subsystem); output(@subsystem); } if ($web) { - @web = sort_and_uniq(@web); + @web = uniq(@web); output(@web); } @@ -311,10 +366,12 @@ MAINTAINER field selection options: --git-max-maintainers => maximum maintainers to add (default: 5) --git-min-percent => minimum percentage of commits required (default: 5) --git-since => git history to use (default: 1-year-ago) + --git-blame => use git blame to find modified commits for patch or file --m => include maintainer(s) if any --n => include name 'Full Name <addr\@domain.tld>' --l => include list(s) if any --s => include subscriber only list(s) if any + --remove-duplicates => minimize duplicate email names/addresses --scm => print SCM tree(s) if any --status => print status if any --subsystem => print subsystem name if any @@ -322,24 +379,28 @@ MAINTAINER field selection options: Output type options: --separator [, ] => separator for multiple entries on 1 line + using --separator also sets --nomultiline if --separator is not [, ] --multiline => print 1 entry per line -Default options: - [--email --git --m --n --l --multiline] - Other options: + --pattern-depth => Number of pattern directory traversals (default: 0 (all)) --version => show version --help => show this help information +Default options: + [--email --git --m --n --l --multiline --pattern-depth=0 --remove-duplicates] + Notes: Using "-f directory" may give unexpected results: - - Used with "--git", git signators for _all_ files in and below - directory are examined as git recurses directories. - Any specified X: (exclude) pattern matches are _not_ ignored. - Used with "--nogit", directory is used as a pattern match, - no individual file within the directory or subdirectory - is matched. + Used with "--git", git signators for _all_ files in and below + directory are examined as git recurses directories. + Any specified X: (exclude) pattern matches are _not_ ignored. + Used with "--nogit", directory is used as a pattern match, + no individual file within the directory or subdirectory + is matched. + Used with "--git-blame", does not iterate all files in directory + Using "--git-blame" is slow and may add old committers and authors + that are no longer active maintainers to the output. EOT } @@ -370,30 +431,100 @@ sub top_of_kernel_tree { return 0; } -sub format_email { - my ($name, $email) = @_; +sub parse_email { + my ($formatted_email) = @_; + + my $name = ""; + my $address = ""; + + if ($formatted_email =~ /^([^<]+)<(.+\@.*)>.*$/) { + $name = $1; + $address = $2; + } elsif ($formatted_email =~ /^\s*<(.+\@\S*)>.*$/) { + $address = $1; + } elsif ($formatted_email =~ /^(.+\@\S*).*$/) { + $address = $1; + } $name =~ s/^\s+|\s+$//g; $name =~ s/^\"|\"$//g; - $email =~ s/^\s+|\s+$//g; + $address =~ s/^\s+|\s+$//g; + + if ($name =~ /[^a-z0-9 \.\-]/i) { ##has "must quote" chars + $name =~ s/(?<!\\)"/\\"/g; ##escape quotes + $name = "\"$name\""; + } + + return ($name, $address); +} + +sub format_email { + my ($name, $address) = @_; - my $formatted_email = ""; + my $formatted_email; + + $name =~ s/^\s+|\s+$//g; + $name =~ s/^\"|\"$//g; + $address =~ s/^\s+|\s+$//g; if ($name =~ /[^a-z0-9 \.\-]/i) { ##has "must quote" chars $name =~ s/(?<!\\)"/\\"/g; ##escape quotes - $formatted_email = "\"${name}\"\ \<${email}\>"; + $name = "\"$name\""; + } + + if ($email_usename) { + if ("$name" eq "") { + $formatted_email = "$address"; + } else { + $formatted_email = "$name <${address}>"; + } } else { - $formatted_email = "${name} \<${email}\>"; + $formatted_email = $address; } + return $formatted_email; } -sub add_categories { +sub find_starting_index { + my ($index) = @_; - $index = $index - 1; - while ($index >= 0) { + while ($index > 0) { my $tv = $typevalue[$index]; + if (!($tv =~ m/^(\C):\s*(.*)/)) { + last; + } + $index--; + } + + return $index; +} + +sub find_ending_index { + my ($index) = @_; + + while ($index < @typevalue) { + my $tv = $typevalue[$index]; + if (!($tv =~ m/^(\C):\s*(.*)/)) { + last; + } + $index++; + } + + return $index; +} + +sub add_categories { + my ($index) = @_; + + my $i; + my $start = find_starting_index($index); + my $end = find_ending_index($index); + + push(@subsystem, $typevalue[$start]); + + for ($i = $start + 1; $i < $end; $i++) { + my $tv = $typevalue[$i]; if ($tv =~ m/^(\C):\s*(.*)/) { my $ptype = $1; my $pvalue = $2; @@ -414,19 +545,19 @@ sub add_categories { } } } elsif ($ptype eq "M") { - my $p_used = 0; - if ($index >= 0) { - my $tv = $typevalue[$index - 1]; - if ($tv =~ m/^(\C):\s*(.*)/) { - if ($1 eq "P") { - if ($email_usename) { - push_email_address(format_email($2, $pvalue)); - $p_used = 1; + my ($name, $address) = parse_email($pvalue); + if ($name eq "") { + if ($i > 0) { + my $tv = $typevalue[$i - 1]; + if ($tv =~ m/^(\C):\s*(.*)/) { + if ($1 eq "P") { + $name = $2; + $pvalue = format_email($name, $address); } } } } - if (!$p_used) { + if ($email_maintainer) { push_email_addresses($pvalue); } } elsif ($ptype eq "T") { @@ -436,31 +567,41 @@ sub add_categories { } elsif ($ptype eq "S") { push(@status, $pvalue); } - - $index--; - } else { - push(@subsystem,$tv); - $index = -1; } } } +my %email_hash_name; +my %email_hash_address; + +sub email_inuse { + my ($name, $address) = @_; + + return 1 if (($name eq "") && ($address eq "")); + return 1 if (($name ne "") && exists($email_hash_name{$name})); + return 1 if (($address ne "") && exists($email_hash_address{$address})); + + return 0; +} + sub push_email_address { - my ($email_address) = @_; + my ($line) = @_; + + my ($name, $address) = parse_email($line); - my $email_name = ""; - if ($email_address =~ m/([^<]+)<(.*\@.*)>$/) { - $email_name = $1; - $email_address = $2; + if ($address eq "") { + return 0; } - if ($email_maintainer) { - if ($email_usename && $email_name) { - push(@email_to, format_email($email_name, $email_address)); - } else { - push(@email_to, $email_address); - } + if (!$email_remove_duplicates) { + push(@email_to, format_email($name, $address)); + } elsif (!email_inuse($name, $address)) { + push(@email_to, format_email($name, $address)); + $email_hash_name{$name}++; + $email_hash_address{$address}++; } + + return 1; } sub push_email_addresses { @@ -476,7 +617,9 @@ sub push_email_addresses { push_email_address($entry); } } else { - warn("Invalid MAINTAINERS address: '" . $address . "'\n"); + if (!push_email_address($address)) { + warn("Invalid MAINTAINERS address: '" . $address . "'\n"); + } } } @@ -492,6 +635,32 @@ sub which { return ""; } +sub mailmap { + my @lines = @_; + my %hash; + + foreach my $line (@lines) { + my ($name, $address) = parse_email($line); + if (!exists($hash{$name})) { + $hash{$name} = $address; + } elsif ($address ne $hash{$name}) { + $address = $hash{$name}; + $line = format_email($name, $address); + } + if (exists($mailmap{$name})) { + my $obj = $mailmap{$name}; + foreach my $map_address (@$obj) { + if (($map_address eq $address) && + ($map_address ne $hash{$name})) { + $line = format_email($name, $hash{$name}); + } + } + } + } + + return @lines; +} + sub recent_git_signoffs { my ($file) = @_; @@ -500,6 +669,7 @@ sub recent_git_signoffs { my $output = ""; my $count = 0; my @lines = (); + my %hash; my $total_sign_offs; if (which("git") eq "") { @@ -513,52 +683,119 @@ sub recent_git_signoffs { } $cmd = "git log --since=${email_git_since} -- ${file}"; - $cmd .= " | grep -Ei \"^[-_ a-z]+by:.*\\\@.*\$\""; - if (!$email_git_penguin_chiefs) { - $cmd .= " | grep -Ev \"${penguin_chiefs}\""; - } - $cmd .= " | cut -f2- -d\":\""; - $cmd .= " | sort | uniq -c | sort -rn"; $output = `${cmd}`; $output =~ s/^\s*//gm; @lines = split("\n", $output); - $total_sign_offs = 0; + @lines = grep(/^[-_ a-z]+by:.*\@.*$/i, @lines); + if (!$email_git_penguin_chiefs) { + @lines = grep(!/${penguin_chiefs}/i, @lines); + } + # cut -f2- -d":" + s/.*:\s*(.+)\s*/$1/ for (@lines); + + $total_sign_offs = @lines; + + if ($email_remove_duplicates) { + @lines = mailmap(@lines); + } + + @lines = sort(@lines); + + # uniq -c + $hash{$_}++ for @lines; + + # sort -rn + foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { + my $sign_offs = $hash{$line}; + $count++; + last if ($sign_offs < $email_git_min_signatures || + $count > $email_git_max_maintainers || + $sign_offs * 100 / $total_sign_offs < $email_git_min_percent); + push_email_address($line); + } +} + +sub save_commits { + my ($cmd, @commits) = @_; + my $output; + my @lines = (); + + $output = `${cmd}`; + + @lines = split("\n", $output); foreach my $line (@lines) { - if ($line =~ m/([0-9]+)\s+(.*)/) { - $total_sign_offs += $1; - } else { - die("$P: Unexpected git output: ${line}\n"); + if ($line =~ m/^(\w+) /) { + push (@commits, $1); } } + return @commits; +} - foreach my $line (@lines) { - if ($line =~ m/([0-9]+)\s+(.*)/) { - my $sign_offs = $1; - $line = $2; - $count++; - if ($sign_offs < $email_git_min_signatures || - $count > $email_git_max_maintainers || - $sign_offs * 100 / $total_sign_offs < $email_git_min_percent) { - last; - } +sub git_assign_blame { + my ($file) = @_; + + my @lines = (); + my @commits = (); + my $cmd; + my $output; + my %hash; + my $total_sign_offs; + my $count; + + if (@range) { + foreach my $file_range_diff (@range) { + next if (!($file_range_diff =~ m/(.+):(.+):(.+)/)); + my $diff_file = $1; + my $diff_start = $2; + my $diff_length = $3; + next if (!("$file" eq "$diff_file")); + $cmd = "git blame -l -L $diff_start,+$diff_length $file"; + @commits = save_commits($cmd, @commits); } - if ($line =~ m/(.+)<(.+)>/) { - my $git_name = $1; - my $git_addr = $2; - if ($email_usename) { - push(@email_to, format_email($git_name, $git_addr)); - } else { - push(@email_to, $git_addr); - } - } elsif ($line =~ m/<(.+)>/) { - my $git_addr = $1; - push(@email_to, $git_addr); - } else { - push(@email_to, $line); + } else { + if (-f $file) { + $cmd = "git blame -l $file"; + @commits = save_commits($cmd, @commits); + } + } + + $total_sign_offs = 0; + @commits = uniq(@commits); + foreach my $commit (@commits) { + $cmd = "git log -1 ${commit}"; + + $output = `${cmd}`; + $output =~ s/^\s*//gm; + @lines = split("\n", $output); + + @lines = grep(/^[-_ a-z]+by:.*\@.*$/i, @lines); + if (!$email_git_penguin_chiefs) { + @lines = grep(!/${penguin_chiefs}/i, @lines); + } + + # cut -f2- -d":" + s/.*:\s*(.+)\s*/$1/ for (@lines); + + $total_sign_offs += @lines; + + if ($email_remove_duplicates) { + @lines = mailmap(@lines); } + + $hash{$_}++ for @lines; + } + + $count = 0; + foreach my $line (sort {$hash{$b} <=> $hash{$a}} keys %hash) { + my $sign_offs = $hash{$line}; + $count++; + last if ($sign_offs < $email_git_min_signatures || + $count > $email_git_max_maintainers || + $sign_offs * 100 / $total_sign_offs < $email_git_min_percent); + push_email_address($line); } } diff --git a/scripts/mod/sumversion.c b/scripts/mod/sumversion.c index aadc5223dcd..ecf9c7dc182 100644 --- a/scripts/mod/sumversion.c +++ b/scripts/mod/sumversion.c @@ -334,8 +334,6 @@ static int parse_source_files(const char *objfile, struct md4_ctx *md) deps_drivers/net/dummy.o := \ drivers/net/dummy.c \ $(wildcard include/config/net/fastroute.h) \ - include/linux/config.h \ - $(wildcard include/config/h.h) \ include/linux/module.h \ Sum all files in the same dir or subdirs. diff --git a/sound/oss/swarm_cs4297a.c b/sound/oss/swarm_cs4297a.c index 1edab7b4ea8..3136c88eacd 100644 --- a/sound/oss/swarm_cs4297a.c +++ b/sound/oss/swarm_cs4297a.c @@ -110,9 +110,6 @@ static void start_adc(struct cs4297a_state *s); // rather than 64k as some of the games work more responsively. // log base 2( buff sz = 32k). -//static unsigned long defaultorder = 3; -//MODULE_PARM(defaultorder, "i"); - // // Turn on/off debugging compilation by commenting out "#define CSDEBUG" // diff --git a/sound/oss/sys_timer.c b/sound/oss/sys_timer.c index 107534477a2..8db6aefe15e 100644 --- a/sound/oss/sys_timer.c +++ b/sound/oss/sys_timer.c @@ -100,9 +100,6 @@ def_tmr_open(int dev, int mode) curr_tempo = 60; curr_timebase = 100; opened = 1; - - ; - { def_tmr.expires = (1) + jiffies; add_timer(&def_tmr); diff --git a/sound/soc/codecs/wm9081.c b/sound/soc/codecs/wm9081.c index c64e55aa63b..686e5aa9720 100644 --- a/sound/soc/codecs/wm9081.c +++ b/sound/soc/codecs/wm9081.c @@ -1027,7 +1027,7 @@ static int wm9081_hw_params(struct snd_pcm_substream *substream, - wm9081->fs); for (i = 1; i < ARRAY_SIZE(clk_sys_rates); i++) { cur_val = abs((wm9081->sysclk_rate / - clk_sys_rates[i].ratio) - wm9081->fs);; + clk_sys_rates[i].ratio) - wm9081->fs); if (cur_val < best_val) { best = i; best_val = cur_val; diff --git a/sound/soc/pxa/pxa-ssp.c b/sound/soc/pxa/pxa-ssp.c index 5b9ed646478..d11a6d7e384 100644 --- a/sound/soc/pxa/pxa-ssp.c +++ b/sound/soc/pxa/pxa-ssp.c @@ -351,7 +351,7 @@ static int pxa_ssp_set_dai_pll(struct snd_soc_dai *cpu_dai, do_div(tmp, freq_out); val = tmp; - val = (val << 16) | 64;; + val = (val << 16) | 64; ssp_write_reg(ssp, SSACDD, val); ssacd |= (0x6 << 4); diff --git a/sound/soc/s3c24xx/s3c24xx_uda134x.c b/sound/soc/s3c24xx/s3c24xx_uda134x.c index 8e79a416db5..c215d32d632 100644 --- a/sound/soc/s3c24xx/s3c24xx_uda134x.c +++ b/sound/soc/s3c24xx/s3c24xx_uda134x.c @@ -67,7 +67,7 @@ static int s3c24xx_uda134x_startup(struct snd_pcm_substream *substream) { int ret = 0; #ifdef ENFORCE_RATES - struct snd_pcm_runtime *runtime = substream->runtime;; + struct snd_pcm_runtime *runtime = substream->runtime; #endif mutex_lock(&clk_lock); |