258 files changed, 9410 insertions, 1526 deletions
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index e4c38152f7f..a4948591607 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -7,7 +7,7 @@ The CONFIG_RCU_TORTURE_TEST config option is available for all RCU implementations. It creates an rcutorture kernel module that can be loaded to run a torture test. The test periodically outputs status messages via printk(), which can be examined via the dmesg -command (perhaps grepping for "rcutorture"). The test is started +command (perhaps grepping for "torture"). The test is started when the module is loaded, and stops when the module is unloaded. However, actually setting this config option to "y" results in the system @@ -35,6 +35,19 @@ stat_interval The number of seconds between output of torture be printed -only- when the module is unloaded, and this is the default. +shuffle_interval + The number of seconds to keep the test threads affinitied + to a particular subset of the CPUs. Used in conjunction + with test_no_idle_hz. + +test_no_idle_hz Whether or not to test the ability of RCU to operate in + a kernel that disables the scheduling-clock interrupt to + idle CPUs. Boolean parameter, "1" to test, "0" otherwise. + +torture_type The type of RCU to test: "rcu" for the rcu_read_lock() + API, "rcu_bh" for the rcu_read_lock_bh() API, and "srcu" + for the "srcu_read_lock()" API. + verbose Enable debug printk()s. Default is disabled. @@ -42,14 +55,14 @@ OUTPUT The statistics output is as follows: - rcutorture: --- Start of test: nreaders=16 stat_interval=0 verbose=0 - rcutorture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915 - rcutorture: Reader Pipe: 1466408 9747 0 0 0 0 0 0 0 0 0 - rcutorture: Reader Batch: 1464477 11678 0 0 0 0 0 0 0 0 - rcutorture: Free-Block Circulation: 1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0 - rcutorture: --- End of test + rcu-torture: --- Start of test: nreaders=16 stat_interval=0 verbose=0 + rcu-torture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915 + rcu-torture: Reader Pipe: 1466408 9747 0 0 0 0 0 0 0 0 0 + rcu-torture: Reader Batch: 1464477 11678 0 0 0 0 0 0 0 0 + rcu-torture: Free-Block Circulation: 1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0 + rcu-torture: --- End of test -The command "dmesg | grep rcutorture:" will extract this information on +The command "dmesg | grep torture:" will extract this information on most systems. On more esoteric configurations, it may be necessary to use other commands to access the output of the printk()s used by the RCU torture test. The printk()s use KERN_ALERT, so they should @@ -115,8 +128,9 @@ The following script may be used to torture RCU: modprobe rcutorture sleep 100 rmmod rcutorture - dmesg | grep rcutorture: + dmesg | grep torture: The output can be manually inspected for the error flag of "!!!". One could of course create a more elaborate script that automatically -checked for such errors. +checked for such errors. The "rmmod" command forces a "SUCCESS" or +"FAILURE" indication to be printk()ed. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2e352a605fc..0d189c93eea 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1669,6 +1669,10 @@ running once the system is up. usbhid.mousepoll= [USBHID] The interval which mice are to be polled at. 
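The rcutorture parameters documented above are ordinary module parameters, so they can be set at load time (for example "modprobe rcutorture torture_type=srcu nreaders=8"). As a rough sketch of how such parameters are typically declared inside a module -- the variable names, defaults and permissions below are illustrative, not copied from rcutorture.c:

#include <linux/module.h>
#include <linux/moduleparam.h>

static int nreaders = -1;		/* number of reader threads; -1 = pick a default */
static int stat_interval;		/* seconds between stats; 0 = only at rmmod time */
static int shuffle_interval = 5;	/* seconds between CPU-affinity shuffles */
static int test_no_idle_hz;		/* test dynticks-idle handling */
static int verbose;			/* extra debug printk()s */
static char *torture_type = "rcu";	/* "rcu", "rcu_bh" or "srcu" */

module_param(nreaders, int, 0444);
MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
module_param(stat_interval, int, 0444);
MODULE_PARM_DESC(stat_interval, "Seconds between statistics printk()s");
module_param(shuffle_interval, int, 0444);
MODULE_PARM_DESC(shuffle_interval, "Seconds between CPU-affinity shuffles");
module_param(test_no_idle_hz, int, 0444);
MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
module_param(verbose, int, 0444);
MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");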
+ vdso= [IA-32] + vdso=1: enable VDSO (default) + vdso=0: disable VDSO mapping + video= [FB] Frame buffer configuration See Documentation/fb/modedb.txt. diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt new file mode 100644 index 00000000000..5d61dacd21f --- /dev/null +++ b/Documentation/pi-futex.txt @@ -0,0 +1,121 @@ +Lightweight PI-futexes +---------------------- + +We are calling them lightweight for 3 reasons: + + - in the user-space fastpath a PI-enabled futex involves no kernel work + (or any other PI complexity) at all. No registration, no extra kernel + calls - just pure fast atomic ops in userspace. + + - even in the slowpath, the system call and scheduling pattern is very + similar to normal futexes. + + - the in-kernel PI implementation is streamlined around the mutex + abstraction, with strict rules that keep the implementation + relatively simple: only a single owner may own a lock (i.e. no + read-write lock support), only the owner may unlock a lock, no + recursive locking, etc. + +Priority Inheritance - why? +--------------------------- + +The short reply: user-space PI helps achieving/improving determinism for +user-space applications. In the best-case, it can help achieve +determinism and well-bound latencies. Even in the worst-case, PI will +improve the statistical distribution of locking related application +delays. + +The longer reply: +----------------- + +Firstly, sharing locks between multiple tasks is a common programming +technique that often cannot be replaced with lockless algorithms. As we +can see it in the kernel [which is a quite complex program in itself], +lockless structures are rather the exception than the norm - the current +ratio of lockless vs. locky code for shared data structures is somewhere +between 1:10 and 1:100. Lockless is hard, and the complexity of lockless +algorithms often endangers to ability to do robust reviews of said code. +I.e. critical RT apps often choose lock structures to protect critical +data structures, instead of lockless algorithms. Furthermore, there are +cases (like shared hardware, or other resource limits) where lockless +access is mathematically impossible. + +Media players (such as Jack) are an example of reasonable application +design with multiple tasks (with multiple priority levels) sharing +short-held locks: for example, a highprio audio playback thread is +combined with medium-prio construct-audio-data threads and low-prio +display-colory-stuff threads. Add video and decoding to the mix and +we've got even more priority levels. + +So once we accept that synchronization objects (locks) are an +unavoidable fact of life, and once we accept that multi-task userspace +apps have a very fair expectation of being able to use locks, we've got +to think about how to offer the option of a deterministic locking +implementation to user-space. + +Most of the technical counter-arguments against doing priority +inheritance only apply to kernel-space locks. But user-space locks are +different, there we cannot disable interrupts or make the task +non-preemptible in a critical section, so the 'use spinlocks' argument +does not apply (user-space spinlocks have the same priority inversion +problems as other user-space locking constructs). 
Fact is, pretty much +the only technique that currently enables good determinism for userspace +locks (such as futex-based pthread mutexes) is priority inheritance: + +Currently (without PI), if a high-prio and a low-prio task shares a lock +[this is a quite common scenario for most non-trivial RT applications], +even if all critical sections are coded carefully to be deterministic +(i.e. all critical sections are short in duration and only execute a +limited number of instructions), the kernel cannot guarantee any +deterministic execution of the high-prio task: any medium-priority task +could preempt the low-prio task while it holds the shared lock and +executes the critical section, and could delay it indefinitely. + +Implementation: +--------------- + +As mentioned before, the userspace fastpath of PI-enabled pthread +mutexes involves no kernel work at all - they behave quite similarly to +normal futex-based locks: a 0 value means unlocked, and a value==TID +means locked. (This is the same method as used by list-based robust +futexes.) Userspace uses atomic ops to lock/unlock these mutexes without +entering the kernel. + +To handle the slowpath, we have added two new futex ops: + + FUTEX_LOCK_PI + FUTEX_UNLOCK_PI + +If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to +TID fails], then FUTEX_LOCK_PI is called. The kernel does all the +remaining work: if there is no futex-queue attached to the futex address +yet then the code looks up the task that owns the futex [it has put its +own TID into the futex value], and attaches a 'PI state' structure to +the futex-queue. The pi_state includes an rt-mutex, which is a PI-aware, +kernel-based synchronization object. The 'other' task is made the owner +of the rt-mutex, and the FUTEX_WAITERS bit is atomically set in the +futex value. Then this task tries to lock the rt-mutex, on which it +blocks. Once it returns, it has the mutex acquired, and it sets the +futex value to its own TID and returns. Userspace has no other work to +perform - it now owns the lock, and futex value contains +FUTEX_WAITERS|TID. + +If the unlock side fastpath succeeds, [i.e. userspace manages to do a +TID -> 0 atomic transition of the futex value], then no kernel work is +triggered. + +If the unlock fastpath fails (because the FUTEX_WAITERS bit is set), +then FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on the +behalf of userspace - and it also unlocks the attached +pi_state->rt_mutex and thus wakes up any potential waiters. + +Note that under this approach, contrary to previous PI-futex approaches, +there is no prior 'registration' of a PI-futex. [which is not quite +possible anyway, due to existing ABI properties of pthread mutexes.] + +Also, under this scheme, 'robustness' and 'PI' are two orthogonal +properties of futexes, and all four combinations are possible: futex, +robust-futex, PI-futex, robust+PI-futex. + +More details about priority inheritance can be found in +Documentation/rtmutex.txt. diff --git a/Documentation/robust-futexes.txt b/Documentation/robust-futexes.txt index df82d75245a..76e8064b8c3 100644 --- a/Documentation/robust-futexes.txt +++ b/Documentation/robust-futexes.txt @@ -95,7 +95,7 @@ comparison. If the thread has registered a list, then normally the list is empty. 
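Returning to the PI-futex fast/slow path split described above, the following user-space sketch shows the shape of the protocol. It is illustrative only: error handling, retries and robust-list handling are omitted, and the helper names are not glibc's (glibc's PTHREAD_PRIO_INHERIT mutexes implement this internally). Here tid is the caller's kernel thread id, e.g. syscall(SYS_gettid).

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

static long futex(atomic_uint *uaddr, int op)
{
	return syscall(SYS_futex, uaddr, op, 0, NULL, NULL, 0);
}

static void pi_lock(atomic_uint *futex_val, unsigned int tid)
{
	unsigned int expected = 0;

	/* fastpath: an atomic 0 -> TID transition means we own the lock */
	if (atomic_compare_exchange_strong(futex_val, &expected, tid))
		return;

	/* slowpath: the kernel queues us, boosts the owner and blocks us */
	futex(futex_val, FUTEX_LOCK_PI);
	/* on return the kernel has set *futex_val to FUTEX_WAITERS | tid */
}

static void pi_unlock(atomic_uint *futex_val, unsigned int tid)
{
	unsigned int expected = tid;

	/* fastpath: TID -> 0 transition, nobody is waiting */
	if (atomic_compare_exchange_strong(futex_val, &expected, 0))
		return;

	/* FUTEX_WAITERS is set: the kernel must unlock and wake a waiter */
	futex(futex_val, FUTEX_UNLOCK_PI);
}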
If the thread/process crashed or terminated in some incorrect way then the list might be non-empty: in this case the kernel carefully walks the list [not trusting it], and marks all locks that are owned by -this thread with the FUTEX_OWNER_DEAD bit, and wakes up one waiter (if +this thread with the FUTEX_OWNER_DIED bit, and wakes up one waiter (if any). The list is guaranteed to be private and per-thread at do_exit() time, diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt new file mode 100644 index 00000000000..c472ffacc2f --- /dev/null +++ b/Documentation/rt-mutex-design.txt @@ -0,0 +1,781 @@ +# +# Copyright (c) 2006 Steven Rostedt +# Licensed under the GNU Free Documentation License, Version 1.2 +# + +RT-mutex implementation design +------------------------------ + +This document tries to describe the design of the rtmutex.c implementation. +It doesn't describe the reasons why rtmutex.c exists. For that please see +Documentation/rt-mutex.txt. Although this document does explain problems +that happen without this code, but that is in the concept to understand +what the code actually is doing. + +The goal of this document is to help others understand the priority +inheritance (PI) algorithm that is used, as well as reasons for the +decisions that were made to implement PI in the manner that was done. + + +Unbounded Priority Inversion +---------------------------- + +Priority inversion is when a lower priority process executes while a higher +priority process wants to run. This happens for several reasons, and +most of the time it can't be helped. Anytime a high priority process wants +to use a resource that a lower priority process has (a mutex for example), +the high priority process must wait until the lower priority process is done +with the resource. This is a priority inversion. What we want to prevent +is something called unbounded priority inversion. That is when the high +priority process is prevented from running by a lower priority process for +an undetermined amount of time. + +The classic example of unbounded priority inversion is were you have three +processes, let's call them processes A, B, and C, where A is the highest +priority process, C is the lowest, and B is in between. A tries to grab a lock +that C owns and must wait and lets C run to release the lock. But in the +meantime, B executes, and since B is of a higher priority than C, it preempts C, +but by doing so, it is in fact preempting A which is a higher priority process. +Now there's no way of knowing how long A will be sleeping waiting for C +to release the lock, because for all we know, B is a CPU hog and will +never give C a chance to release the lock. This is called unbounded priority +inversion. + +Here's a little ASCII art to show the problem. + + grab lock L1 (owned by C) + | +A ---+ + C preempted by B + | +C +----+ + +B +--------> + B now keeps A from running. + + +Priority Inheritance (PI) +------------------------- + +There are several ways to solve this issue, but other ways are out of scope +for this document. Here we only discuss PI. + +PI is where a process inherits the priority of another process if the other +process blocks on a lock owned by the current process. To make this easier +to understand, let's use the previous example, with processes A, B, and C again. + +This time, when A blocks on the lock owned by C, C would inherit the priority +of A. So now if B becomes runnable, it would not preempt C, since C now has +the high priority of A. 
As soon as C releases the lock, it loses its +inherited priority, and A then can continue with the resource that C had. + +Terminology +----------- + +Here I explain some terminology that is used in this document to help describe +the design that is used to implement PI. + +PI chain - The PI chain is an ordered series of locks and processes that cause + processes to inherit priorities from a previous process that is + blocked on one of its locks. This is described in more detail + later in this document. + +mutex - In this document, to differentiate from locks that implement + PI and spin locks that are used in the PI code, from now on + the PI locks will be called a mutex. + +lock - In this document from now on, I will use the term lock when + referring to spin locks that are used to protect parts of the PI + algorithm. These locks disable preemption for UP (when + CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from + entering critical sections simultaneously. + +spin lock - Same as lock above. + +waiter - A waiter is a struct that is stored on the stack of a blocked + process. Since the scope of the waiter is within the code for + a process being blocked on the mutex, it is fine to allocate + the waiter on the process's stack (local variable). This + structure holds a pointer to the task, as well as the mutex that + the task is blocked on. It also has the plist node structures to + place the task in the waiter_list of a mutex as well as the + pi_list of a mutex owner task (described below). + + waiter is sometimes used in reference to the task that is waiting + on a mutex. This is the same as waiter->task. + +waiters - A list of processes that are blocked on a mutex. + +top waiter - The highest priority process waiting on a specific mutex. + +top pi waiter - The highest priority process waiting on one of the mutexes + that a specific process owns. + +Note: task and process are used interchangeably in this document, mostly to + differentiate between two processes that are being described together. + + +PI chain +-------- + +The PI chain is a list of processes and mutexes that may cause priority +inheritance to take place. Multiple chains may converge, but a chain +would never diverge, since a process can't be blocked on more than one +mutex at a time. + +Example: + + Process: A, B, C, D, E + Mutexes: L1, L2, L3, L4 + + A owns: L1 + B blocked on L1 + B owns L2 + C blocked on L2 + C owns L3 + D blocked on L3 + D owns L4 + E blocked on L4 + +The chain would be: + + E->L4->D->L3->C->L2->B->L1->A + +To show where two chains merge, we could add another process F and +another mutex L5 where B owns L5 and F is blocked on mutex L5. + +The chain for F would be: + + F->L5->B->L1->A + +Since a process may own more than one mutex, but never be blocked on more than +one, the chains merge. + +Here we show both chains: + + E->L4->D->L3->C->L2-+ + | + +->B->L1->A + | + F->L5-+ + +For PI to work, the processes at the right end of these chains (or we may +also call it the Top of the chain) must be equal to or higher in priority +than the processes to the left or below in the chain. + +Also since a mutex may have more than one process blocked on it, we can +have multiple chains merge at mutexes. If we add another process G that is +blocked on mutex L2: + + G->L2->B->L1->A + +And once again, to show how this can grow I will show the merging chains +again. 
+ + E->L4->D->L3->C-+ + +->L2-+ + | | + G-+ +->B->L1->A + | + F->L5-+ + + +Plist +----- + +Before I go further and talk about how the PI chain is stored through lists +on both mutexes and processes, I'll explain the plist. This is similar to +the struct list_head functionality that is already in the kernel. +The implementation of plist is out of scope for this document, but it is +very important to understand what it does. + +There are a few differences between plist and list, the most important one +being that plist is a priority sorted linked list. This means that the +priorities of the plist are sorted, such that it takes O(1) to retrieve the +highest priority item in the list. Obviously this is useful to store processes +based on their priorities. + +Another difference, which is important for implementation, is that, unlike +list, the head of the list is a different element than the nodes of a list. +So the head of the list is declared as struct plist_head and nodes that will +be added to the list are declared as struct plist_node. + + +Mutex Waiter List +----------------- + +Every mutex keeps track of all the waiters that are blocked on itself. The mutex +has a plist to store these waiters by priority. This list is protected by +a spin lock that is located in the struct of the mutex. This lock is called +wait_lock. Since the modification of the waiter list is never done in +interrupt context, the wait_lock can be taken without disabling interrupts. + + +Task PI List +------------ + +To keep track of the PI chains, each process has its own PI list. This is +a list of all top waiters of the mutexes that are owned by the process. +Note that this list only holds the top waiters and not all waiters that are +blocked on mutexes owned by the process. + +The top of the task's PI list is always the highest priority task that +is waiting on a mutex that is owned by the task. So if the task has +inherited a priority, it will always be the priority of the task that is +at the top of this list. + +This list is stored in the task structure of a process as a plist called +pi_list. This list is protected by a spin lock also in the task structure, +called pi_lock. This lock may also be taken in interrupt context, so when +locking the pi_lock, interrupts must be disabled. + + +Depth of the PI Chain +--------------------- + +The maximum depth of the PI chain is not dynamic, and could actually be +defined. But is very complex to figure it out, since it depends on all +the nesting of mutexes. Let's look at the example where we have 3 mutexes, +L1, L2, and L3, and four separate functions func1, func2, func3 and func4. +The following shows a locking order of L1->L2->L3, but may not actually +be directly nested that way. + +void func1(void) +{ + mutex_lock(L1); + + /* do anything */ + + mutex_unlock(L1); +} + +void func2(void) +{ + mutex_lock(L1); + mutex_lock(L2); + + /* do something */ + + mutex_unlock(L2); + mutex_unlock(L1); +} + +void func3(void) +{ + mutex_lock(L2); + mutex_lock(L3); + + /* do something else */ + + mutex_unlock(L3); + mutex_unlock(L2); +} + +void func4(void) +{ + mutex_lock(L3); + + /* do something again */ + + mutex_unlock(L3); +} + +Now we add 4 processes that run each of these functions separately. +Processes A, B, C, and D which run functions func1, func2, func3 and func4 +respectively, and such that D runs first and A last. 
With D being preempted +in func4 in the "do something again" area, we have a locking that follows: + +D owns L3 + C blocked on L3 + C owns L2 + B blocked on L2 + B owns L1 + A blocked on L1 + +And thus we have the chain A->L1->B->L2->C->L3->D. + +This gives us a PI depth of 4 (four processes), but looking at any of the +functions individually, it seems as though they only have at most a locking +depth of two. So, although the locking depth is defined at compile time, +it still is very difficult to find the possibilities of that depth. + +Now since mutexes can be defined by user-land applications, we don't want a DOS +type of application that nests large amounts of mutexes to create a large +PI chain, and have the code holding spin locks while looking at a large +amount of data. So to prevent this, the implementation not only implements +a maximum lock depth, but also only holds at most two different locks at a +time, as it walks the PI chain. More about this below. + + +Mutex owner and flags +--------------------- + +The mutex structure contains a pointer to the owner of the mutex. If the +mutex is not owned, this owner is set to NULL. Since all architectures +have the task structure on at least a four byte alignment (and if this is +not true, the rtmutex.c code will be broken!), this allows for the two +least significant bits to be used as flags. This part is also described +in Documentation/rt-mutex.txt, but will also be briefly described here. + +Bit 0 is used as the "Pending Owner" flag. This is described later. +Bit 1 is used as the "Has Waiters" flags. This is also described later + in more detail, but is set whenever there are waiters on a mutex. + + +cmpxchg Tricks +-------------- + +Some architectures implement an atomic cmpxchg (Compare and Exchange). This +is used (when applicable) to keep the fast path of grabbing and releasing +mutexes short. + +cmpxchg is basically the following function performed atomically: + +unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C) +{ + unsigned long T = *A; + if (*A == *B) { + *A = *C; + } + return T; +} +#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c) + +This is really nice to have, since it allows you to only update a variable +if the variable is what you expect it to be. You know if it succeeded if +the return value (the old value of A) is equal to B. + +The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If +the architecture does not support CMPXCHG, then this macro is simply set +to fail every time. But if CMPXCHG is supported, then this will +help out extremely to keep the fast path short. + +The use of rt_mutex_cmpxchg with the flags in the owner field help optimize +the system for architectures that support it. This will also be explained +later in this document. + + +Priority adjustments +-------------------- + +The implementation of the PI code in rtmutex.c has several places that a +process must adjust its priority. With the help of the pi_list of a +process this is rather easy to know what needs to be adjusted. + +The functions implementing the task adjustments are rt_mutex_adjust_prio, +__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock +to already be taken), rt_mutex_get_prio, and rt_mutex_setprio. + +rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio. + +rt_mutex_getprio returns the priority that the task should have. 
Either the +task's own normal priority, or if a process of a higher priority is waiting on +a mutex owned by the task, then that higher priority should be returned. +Since the pi_list of a task holds an order by priority list of all the top +waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs +to compare the top pi waiter to its own normal priority, and return the higher +priority back. + +(Note: if looking at the code, you will notice that the lower number of + prio is returned. This is because the prio field in the task structure + is an inverse order of the actual priority. So a "prio" of 5 is + of higher priority than a "prio" of 10.) + +__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the +result does not equal the task's current priority, then rt_mutex_setprio +is called to adjust the priority of the task to the new priority. +Note that rt_mutex_setprio is defined in kernel/sched.c to implement the +actual change in priority. + +It is interesting to note that __rt_mutex_adjust_prio can either increase +or decrease the priority of the task. In the case that a higher priority +process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio +would increase/boost the task's priority. But if a higher priority task +were for some reason to leave the mutex (timeout or signal), this same function +would decrease/unboost the priority of the task. That is because the pi_list +always contains the highest priority task that is waiting on a mutex owned +by the task, so we only need to compare the priority of that top pi waiter +to the normal priority of the given task. + + +High level overview of the PI chain walk +---------------------------------------- + +The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain. + +The implementation has gone through several iterations, and has ended up +with what we believe is the best. It walks the PI chain by only grabbing +at most two locks at a time, and is very efficient. + +The rt_mutex_adjust_prio_chain can be used either to boost or lower process +priorities. + +rt_mutex_adjust_prio_chain is called with a task to be checked for PI +(de)boosting (the owner of a mutex that a process is blocking on), a flag to +check for deadlocking, the mutex that the task owns, and a pointer to a waiter +that is the process's waiter struct that is blocked on the mutex (although this +parameter may be NULL for deboosting). + +For this explanation, I will not mention deadlock detection. This explanation +will try to stay at a high level. + +When this function is called, there are no locks held. That also means +that the state of the owner and lock can change when entered into this function. + +Before this function is called, the task has already had rt_mutex_adjust_prio +performed on it. This means that the task is set to the priority that it +should be at, but the plist nodes of the task's waiter have not been updated +with the new priorities, and that this task may not be in the proper locations +in the pi_lists and wait_lists that the task is blocked on. This function +solves all that. + +A loop is entered, where task is the owner to be checked for PI changes that +was passed by parameter (for the first iteration). The pi_lock of this task is +taken to prevent any more changes to the pi_list of the task. This also +prevents new tasks from completing the blocking on a mutex that is owned by this +task. + +If the task is not blocked on a mutex then the loop is exited. 
We are at +the top of the PI chain. + +A check is now done to see if the original waiter (the process that is blocked +on the current mutex) is the top pi waiter of the task. That is, is this +waiter on the top of the task's pi_list. If it is not, it either means that +there is another process higher in priority that is blocked on one of the +mutexes that the task owns, or that the waiter has just woken up via a signal +or timeout and has left the PI chain. In either case, the loop is exited, since +we don't need to do any more changes to the priority of the current task, or any +task that owns a mutex that this current task is waiting on. A priority chain +walk is only needed when a new top pi waiter is made to a task. + +The next check sees if the task's waiter plist node has the priority equal to +the priority the task is set at. If they are equal, then we are done with +the loop. Remember that the function started with the priority of the +task adjusted, but the plist nodes that hold the task in other processes +pi_lists have not been adjusted. + +Next, we look at the mutex that the task is blocked on. The mutex's wait_lock +is taken. This is done by a spin_trylock, because the locking order of the +pi_lock and wait_lock goes in the opposite direction. If we fail to grab the +lock, the pi_lock is released, and we restart the loop. + +Now that we have both the pi_lock of the task as well as the wait_lock of +the mutex the task is blocked on, we update the task's waiter's plist node +that is located on the mutex's wait_list. + +Now we release the pi_lock of the task. + +Next the owner of the mutex has its pi_lock taken, so we can update the +task's entry in the owner's pi_list. If the task is the highest priority +process on the mutex's wait_list, then we remove the previous top waiter +from the owner's pi_list, and replace it with the task. + +Note: It is possible that the task was the current top waiter on the mutex, + in which case the task is not yet on the pi_list of the waiter. This + is OK, since plist_del does nothing if the plist node is not on any + list. + +If the task was not the top waiter of the mutex, but it was before we +did the priority updates, that means we are deboosting/lowering the +task. In this case, the task is removed from the pi_list of the owner, +and the new top waiter is added. + +Lastly, we unlock both the pi_lock of the task, as well as the mutex's +wait_lock, and continue the loop again. On the next iteration of the +loop, the previous owner of the mutex will be the task that will be +processed. + +Note: One might think that the owner of this mutex might have changed + since we just grab the mutex's wait_lock. And one could be right. + The important thing to remember is that the owner could not have + become the task that is being processed in the PI chain, since + we have taken that task's pi_lock at the beginning of the loop. + So as long as there is an owner of this mutex that is not the same + process as the tasked being worked on, we are OK. + + Looking closely at the code, one might be confused. The check for the + end of the PI chain is when the task isn't blocked on anything or the + task's waiter structure "task" element is NULL. This check is + protected only by the task's pi_lock. But the code to unlock the mutex + sets the task's waiter structure "task" element to NULL with only + the protection of the mutex's wait_lock, which was not taken yet. + Isn't this a race condition if the task becomes the new owner? + + The answer is No! 
The trick is the spin_trylock of the mutex's + wait_lock. If we fail that lock, we release the pi_lock of the + task and continue the loop, doing the end of PI chain check again. + + In the code to release the lock, the wait_lock of the mutex is held + the entire time, and it is not let go when we grab the pi_lock of the + new owner of the mutex. So if the switch of a new owner were to happen + after the check for end of the PI chain and the grabbing of the + wait_lock, the unlocking code would spin on the new owner's pi_lock + but never give up the wait_lock. So the PI chain loop is guaranteed to + fail the spin_trylock on the wait_lock, release the pi_lock, and + try again. + + If you don't quite understand the above, that's OK. You don't have to, + unless you really want to make a proof out of it ;) + + +Pending Owners and Lock stealing +-------------------------------- + +One of the flags in the owner field of the mutex structure is "Pending Owner". +What this means is that an owner was chosen by the process releasing the +mutex, but that owner has yet to wake up and actually take the mutex. + +Why is this important? Why can't we just give the mutex to another process +and be done with it? + +The PI code is to help with real-time processes, and to let the highest +priority process run as long as possible with little latencies and delays. +If a high priority process owns a mutex that a lower priority process is +blocked on, when the mutex is released it would be given to the lower priority +process. What if the higher priority process wants to take that mutex again. +The high priority process would fail to take that mutex that it just gave up +and it would need to boost the lower priority process to run with full +latency of that critical section (since the low priority process just entered +it). + +There's no reason a high priority process that gives up a mutex should be +penalized if it tries to take that mutex again. If the new owner of the +mutex has not woken up yet, there's no reason that the higher priority process +could not take that mutex away. + +To solve this, we introduced Pending Ownership and Lock Stealing. When a +new process is given a mutex that it was blocked on, it is only given +pending ownership. This means that it's the new owner, unless a higher +priority process comes in and tries to grab that mutex. If a higher priority +process does come along and wants that mutex, we let the higher priority +process "steal" the mutex from the pending owner (only if it is still pending) +and continue with the mutex. + + +Taking of a mutex (The walk through) +------------------------------------ + +OK, now let's take a look at the detailed walk through of what happens when +taking a mutex. + +The first thing that is tried is the fast taking of the mutex. This is +done when we have CMPXCHG enabled (otherwise the fast taking automatically +fails). Only when the owner field of the mutex is NULL can the lock be +taken with the CMPXCHG and nothing else needs to be done. + +If there is contention on the lock, whether it is owned or pending owner +we go about the slow path (rt_mutex_slowlock). + +The slow path function is where the task's waiter structure is created on +the stack. This is because the waiter structure is only needed for the +scope of this function. The waiter structure holds the nodes to store +the task on the wait_list of the mutex, and if need be, the pi_list of +the owner. 
+ +The wait_lock of the mutex is taken since the slow path of unlocking the +mutex also takes this lock. + +We then call try_to_take_rt_mutex. This is where the architecture that +does not implement CMPXCHG would always grab the lock (if there's no +contention). + +try_to_take_rt_mutex is used every time the task tries to grab a mutex in the +slow path. The first thing that is done here is an atomic setting of +the "Has Waiters" flag of the mutex's owner field. Yes, this could really +be false, because if the the mutex has no owner, there are no waiters and +the current task also won't have any waiters. But we don't have the lock +yet, so we assume we are going to be a waiter. The reason for this is to +play nice for those architectures that do have CMPXCHG. By setting this flag +now, the owner of the mutex can't release the mutex without going into the +slow unlock path, and it would then need to grab the wait_lock, which this +code currently holds. So setting the "Has Waiters" flag forces the owner +to synchronize with this code. + +Now that we know that we can't have any races with the owner releasing the +mutex, we check to see if we can take the ownership. This is done if the +mutex doesn't have a owner, or if we can steal the mutex from a pending +owner. Let's look at the situations we have here. + + 1) Has owner that is pending + ---------------------------- + + The mutex has a owner, but it hasn't woken up and the mutex flag + "Pending Owner" is set. The first check is to see if the owner isn't the + current task. This is because this function is also used for the pending + owner to grab the mutex. When a pending owner wakes up, it checks to see + if it can take the mutex, and this is done if the owner is already set to + itself. If so, we succeed and leave the function, clearing the "Pending + Owner" bit. + + If the pending owner is not current, we check to see if the current priority is + higher than the pending owner. If not, we fail the function and return. + + There's also something special about a pending owner. That is a pending owner + is never blocked on a mutex. So there is no PI chain to worry about. It also + means that if the mutex doesn't have any waiters, there's no accounting needed + to update the pending owner's pi_list, since we only worry about processes + blocked on the current mutex. + + If there are waiters on this mutex, and we just stole the ownership, we need + to take the top waiter, remove it from the pi_list of the pending owner, and + add it to the current pi_list. Note that at this moment, the pending owner + is no longer on the list of waiters. This is fine, since the pending owner + would add itself back when it realizes that it had the ownership stolen + from itself. When the pending owner tries to grab the mutex, it will fail + in try_to_take_rt_mutex if the owner field points to another process. + + 2) No owner + ----------- + + If there is no owner (or we successfully stole the lock), we set the owner + of the mutex to current, and set the flag of "Has Waiters" if the current + mutex actually has waiters, or we clear the flag if it doesn't. See, it was + OK that we set that flag early, since now it is cleared. + + 3) Failed to grab ownership + --------------------------- + + The most interesting case is when we fail to take ownership. This means that + there exists an owner, or there's a pending owner with equal or higher + priority than the current task. + +We'll continue on the failed case. 
+ +If the mutex has a timeout, we set up a timer to go off to break us out +of this mutex if we failed to get it after a specified amount of time. + +Now we enter a loop that will continue to try to take ownership of the mutex, or +fail from a timeout or signal. + +Once again we try to take the mutex. This will usually fail the first time +in the loop, since it had just failed to get the mutex. But the second time +in the loop, this would likely succeed, since the task would likely be +the pending owner. + +If the mutex is TASK_INTERRUPTIBLE a check for signals and timeout is done +here. + +The waiter structure has a "task" field that points to the task that is blocked +on the mutex. This field can be NULL the first time it goes through the loop +or if the task is a pending owner and had it's mutex stolen. If the "task" +field is NULL then we need to set up the accounting for it. + +Task blocks on mutex +-------------------- + +The accounting of a mutex and process is done with the waiter structure of +the process. The "task" field is set to the process, and the "lock" field +to the mutex. The plist nodes are initialized to the processes current +priority. + +Since the wait_lock was taken at the entry of the slow lock, we can safely +add the waiter to the wait_list. If the current process is the highest +priority process currently waiting on this mutex, then we remove the +previous top waiter process (if it exists) from the pi_list of the owner, +and add the current process to that list. Since the pi_list of the owner +has changed, we call rt_mutex_adjust_prio on the owner to see if the owner +should adjust its priority accordingly. + +If the owner is also blocked on a lock, and had its pi_list changed +(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead +and run rt_mutex_adjust_prio_chain on the owner, as described earlier. + +Now all locks are released, and if the current process is still blocked on a +mutex (waiter "task" field is not NULL), then we go to sleep (call schedule). + +Waking up in the loop +--------------------- + +The schedule can then wake up for a few reasons. + 1) we were given pending ownership of the mutex. + 2) we received a signal and was TASK_INTERRUPTIBLE + 3) we had a timeout and was TASK_INTERRUPTIBLE + +In any of these cases, we continue the loop and once again try to grab the +ownership of the mutex. If we succeed, we exit the loop, otherwise we continue +and on signal and timeout, will exit the loop, or if we had the mutex stolen +we just simply add ourselves back on the lists and go back to sleep. + +Note: For various reasons, because of timeout and signals, the steal mutex + algorithm needs to be careful. This is because the current process is + still on the wait_list. And because of dynamic changing of priorities, + especially on SCHED_OTHER tasks, the current process can be the + highest priority task on the wait_list. + +Failed to get mutex on Timeout or Signal +---------------------------------------- + +If a timeout or signal occurred, the waiter's "task" field would not be +NULL and the task needs to be taken off the wait_list of the mutex and perhaps +pi_list of the owner. If this process was a high priority process, then +the rt_mutex_adjust_prio_chain needs to be executed again on the owner, +but this time it will be lowering the priorities. + + +Unlocking the Mutex +------------------- + +The unlocking of a mutex also has a fast path for those architectures with +CMPXCHG. 
Since the taking of a mutex on contention always sets the +"Has Waiters" flag of the mutex's owner, we use this to know if we need to +take the slow path when unlocking the mutex. If the mutex doesn't have any +waiters, the owner field of the mutex would equal the current process and +the mutex can be unlocked by just replacing the owner field with NULL. + +If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available), +the slow unlock path is taken. + +The first thing done in the slow unlock path is to take the wait_lock of the +mutex. This synchronizes the locking and unlocking of the mutex. + +A check is made to see if the mutex has waiters or not. On architectures that +do not have CMPXCHG, this is the location that the owner of the mutex will +determine if a waiter needs to be awoken or not. On architectures that +do have CMPXCHG, that check is done in the fast path, but it is still needed +in the slow path too. If a waiter of a mutex woke up because of a signal +or timeout between the time the owner failed the fast path CMPXCHG check and +the grabbing of the wait_lock, the mutex may not have any waiters, thus the +owner still needs to make this check. If there are no waiters than the mutex +owner field is set to NULL, the wait_lock is released and nothing more is +needed. + +If there are waiters, then we need to wake one up and give that waiter +pending ownership. + +On the wake up code, the pi_lock of the current owner is taken. The top +waiter of the lock is found and removed from the wait_list of the mutex +as well as the pi_list of the current owner. The task field of the new +pending owner's waiter structure is set to NULL, and the owner field of the +mutex is set to the new owner with the "Pending Owner" bit set, as well +as the "Has Waiters" bit if there still are other processes blocked on the +mutex. + +The pi_lock of the previous owner is released, and the new pending owner's +pi_lock is taken. Remember that this is the trick to prevent the race +condition in rt_mutex_adjust_prio_chain from adding itself as a waiter +on the mutex. + +We now clear the "pi_blocked_on" field of the new pending owner, and if +the mutex still has waiters pending, we add the new top waiter to the pi_list +of the pending owner. + +Finally we unlock the pi_lock of the pending owner and wake it up. + + +Contact +------- + +For updates on this document, please email Steven Rostedt <rostedt@goodmis.org> + + +Credits +------- + +Author: Steven Rostedt <rostedt@goodmis.org> + +Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap + +Updates +------- + +This document was originally written for 2.6.17-rc3-mm1 diff --git a/Documentation/rt-mutex.txt b/Documentation/rt-mutex.txt new file mode 100644 index 00000000000..243393d882e --- /dev/null +++ b/Documentation/rt-mutex.txt @@ -0,0 +1,79 @@ +RT-mutex subsystem with PI support +---------------------------------- + +RT-mutexes with priority inheritance are used to support PI-futexes, +which enable pthread_mutex_t priority inheritance attributes +(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details +about PI-futexes.] + +This technology was developed in the -rt tree and streamlined for +pthread_mutex support. + +Basic principles: +----------------- + +RT-mutexes extend the semantics of simple mutexes by the priority +inheritance protocol. + +A low priority owner of a rt-mutex inherits the priority of a higher +priority waiter until the rt-mutex is released. 
If the temporarily +boosted owner blocks on a rt-mutex itself it propagates the priority +boosting to the owner of the other rt_mutex it gets blocked on. The +priority boosting is immediately removed once the rt_mutex has been +unlocked. + +This approach allows us to shorten the block of high-prio tasks on +mutexes which protect shared resources. Priority inheritance is not a +magic bullet for poorly designed applications, but it allows +well-designed applications to use userspace locks in critical parts of +an high priority thread, without losing determinism. + +The enqueueing of the waiters into the rtmutex waiter list is done in +priority order. For same priorities FIFO order is chosen. For each +rtmutex, only the top priority waiter is enqueued into the owner's +priority waiters list. This list too queues in priority order. Whenever +the top priority waiter of a task changes (for example it timed out or +got a signal), the priority of the owner task is readjusted. [The +priority enqueueing is handled by "plists", see include/linux/plist.h +for more details.] + +RT-mutexes are optimized for fastpath operations and have no internal +locking overhead when locking an uncontended mutex or unlocking a mutex +without waiters. The optimized fastpath operations require cmpxchg +support. [If that is not available then the rt-mutex internal spinlock +is used] + +The state of the rt-mutex is tracked via the owner field of the rt-mutex +structure: + +rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1 +are used to keep track of the "owner is pending" and "rtmutex has +waiters" state. + + owner bit1 bit0 + NULL 0 0 mutex is free (fast acquire possible) + NULL 0 1 invalid state + NULL 1 0 Transitional state* + NULL 1 1 invalid state + taskpointer 0 0 mutex is held (fast release possible) + taskpointer 0 1 task is pending owner + taskpointer 1 0 mutex is held and has waiters + taskpointer 1 1 task is pending owner and mutex has waiters + +Pending-ownership handling is a performance optimization: +pending-ownership is assigned to the first (highest priority) waiter of +the mutex, when the mutex is released. The thread is woken up and once +it starts executing it can acquire the mutex. Until the mutex is taken +by it (bit 0 is cleared) a competing higher priority thread can "steal" +the mutex which puts the woken up thread back on the waiters list. + +The pending-ownership optimization is especially important for the +uninterrupted workflow of high-prio tasks which repeatedly +takes/releases locks that have lower-prio waiters. Without this +optimization the higher-prio thread would ping-pong to the lower-prio +task [because at unlock time we always assign a new owner]. + +(*) The "mutex has waiters" bit gets set to take the lock. If the lock +doesn't already have an owner, this bit is quickly cleared if there are +no waiters. So this is a transitional state to synchronize with looking +at the owner field of the mutex and the mutex owner releasing the lock. 
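Tying the state table to code, the "fast release possible" row corresponds to an unlock fast path along these lines. The names are reused from the earlier lock-path sketch and are illustrative only; the real code is rt_mutex_unlock()/rt_mutex_slowunlock() in kernel/rtmutex.c.

static void sketch_rt_mutex_slowunlock(struct sketch_rt_mutex *lock);

static void sketch_rt_mutex_unlock(struct sketch_rt_mutex *lock)
{
	unsigned long me = (unsigned long)current;

	/*
	 * "taskpointer / 0 / 0" row of the table: no waiters are queued,
	 * so one cmpxchg back to NULL releases the mutex without ever
	 * touching wait_lock.
	 */
	if (cmpxchg(&lock->owner, me, 0UL) == me)
		return;

	/*
	 * The "Has Waiters" bit is set (or cmpxchg is unavailable): take
	 * the slow path, which grabs wait_lock, picks the top-priority
	 * waiter and hands it pending ownership as described above.
	 */
	sketch_rt_mutex_slowunlock(lock);
}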
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 558b8336855..254c507a608 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -481,7 +481,7 @@ register_cpus(void) struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; - register_cpu(p, i, NULL); + register_cpu(p, i); } return 0; } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 9fc9af88c60..093ccba0503 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -808,7 +808,7 @@ static int __init topology_init(void) int cpu; for_each_possible_cpu(cpu) - register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu, NULL); + register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu); return 0; } diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 47c08bcd9b2..3bb221db164 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -233,7 +233,7 @@ config NR_CPUS config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" - depends on SMP + depends on X86_HT help SMT scheduler support improves the CPU scheduler's decision making when dealing with Intel Pentium 4 chips with HyperThreading at a @@ -242,7 +242,7 @@ config SCHED_SMT config SCHED_MC bool "Multi-core scheduler support" - depends on SMP + depends on X86_HT default y help Multi-core scheduler support improves the CPU scheduler's decision @@ -780,6 +780,17 @@ config HOTPLUG_CPU enable suspend on SMP systems. CPUs can be controlled through /sys/devices/system/cpu. +config COMPAT_VDSO + bool "Compat VDSO support" + default y + help + Map the VDSO to the predictable old-style address too. + ---help--- + Say N here if you are running a sufficiently recent glibc + version (2.3.3 or later), to remove the high-mapped + VDSO mapping and to exclusively use the randomized VDSO. + + If unsure, say Y. 
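The vdso= option documented in kernel-parameters.txt above, like the COMPAT_VDSO Kconfig entry, controls the i386 vDSO mapping. A typical way to wire up such a boot option is the __setup() pattern shown below; the function name and exact parsing are illustrative, not necessarily what this patch itself does.

#include <linux/init.h>
#include <linux/kernel.h>

static int vdso_enabled __read_mostly = 1;	/* default matches "vdso=1: enable VDSO (default)" */

static int __init sketch_vdso_setup(char *s)
{
	vdso_enabled = simple_strtoul(s, NULL, 0);
	return 1;
}
__setup("vdso=", sketch_vdso_setup);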
endmenu diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 1c3a809e642..c80271f8f08 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -14,6 +14,7 @@ #include <asm/fixmap.h> #include <asm/processor.h> #include <asm/thread_info.h> +#include <asm/elf.h> #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -54,6 +55,7 @@ void foo(void) OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); BLANK(); OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); @@ -69,7 +71,7 @@ void foo(void) sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL)); + DEFINE(VDSO_PRELINK, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); } diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index fd0457c9c82..e6a2d6b80cd 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -235,10 +235,10 @@ static void __init init_amd(struct cpuinfo_x86 *c) while ((1 << bits) < c->x86_max_cores) bits++; } - cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1); - phys_proc_id[cpu] >>= bits; + c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1); + c->phys_proc_id >>= bits; printk(KERN_INFO "CPU %d(%d) -> Core %d\n", - cpu, c->x86_max_cores, cpu_core_id[cpu]); + cpu, c->x86_max_cores, c->cpu_core_id); } #endif diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 44f2c5f2dda..70c87de582c 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -294,7 +294,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c) if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); #else c->apicid = (ebx >> 24) & 0xFF; @@ -319,7 +319,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c) early_intel_workaround(c); #ifdef CONFIG_X86_HT - phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif } @@ -477,11 +477,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; int index_msb, core_bits; - int cpu = smp_processor_id(); cpuid(1, &eax, &ebx, &ecx, &edx); - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; @@ -492,16 +490,17 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } else if (smp_num_siblings > 1 ) { if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + printk(KERN_WARNING "CPU: Unsupported number of the " + "siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } index_msb = get_count_order(smp_num_siblings); - phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); + c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - phys_proc_id[cpu]); + c->phys_proc_id); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -509,12 +508,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); - cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & + c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & ((1 << core_bits) - 1); if (c->x86_max_cores > 1) printk(KERN_INFO "CPU: Processor Core 
ID: %d\n", - cpu_core_id[cpu]); + c->cpu_core_id); } } #endif @@ -613,6 +612,12 @@ void __cpuinit cpu_init(void) set_in_cr4(X86_CR4_TSD); } + /* The CPU hotplug case */ + if (cpu_gdt_descr->address) { + gdt = (struct desc_struct *)cpu_gdt_descr->address; + memset(gdt, 0, PAGE_SIZE); + goto old_gdt; + } /* * This is a horrible hack to allocate the GDT. The problem * is that cpu_init() is called really early for the boot CPU @@ -631,7 +636,7 @@ void __cpuinit cpu_init(void) local_irq_enable(); } } - +old_gdt: /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index 6c37b4fd8ce..e9f0b928b0a 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -159,13 +159,13 @@ union l2_cache { unsigned val; }; -static unsigned short assocs[] = { +static const unsigned short assocs[] = { [1] = 1, [2] = 2, [4] = 4, [6] = 8, [8] = 16, [0xf] = 0xffff // ?? }; -static unsigned char levels[] = { 1, 1, 2 }; -static unsigned char types[] = { 1, 2, 3 }; +static const unsigned char levels[] = { 1, 1, 2 }; +static const unsigned char types[] = { 1, 2, 3 }; static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, @@ -261,7 +261,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data); #endif @@ -383,14 +383,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT cpu_llc_id[cpu] = l2_id; #endif } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT cpu_llc_id[cpu] = l3_id; #endif } @@ -729,7 +729,7 @@ static void __cpuexit cache_remove_dev(struct sys_device * sys_dev) return; } -static int cacheinfo_cpu_callback(struct notifier_block *nfb, +static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -747,7 +747,7 @@ static int cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block cacheinfo_cpu_notifier = +static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index a19fcb262db..f54a15268ed 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -18,7 +18,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) * applications want to get the raw CPUID data, they should access * /dev/cpu/<cpu_nr>/cpuid instead. 
*/ - static char *x86_cap_flags[] = { + static const char * const x86_cap_flags[] = { /* Intel-defined */ "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", @@ -62,7 +62,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; - static char *x86_power_flags[] = { + static const char * const x86_power_flags[] = { "ts", /* temperature sensor */ "fid", /* frequency id control */ "vid", /* voltage id control */ @@ -109,9 +109,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); #ifdef CONFIG_X86_HT if (c->x86_max_cores * smp_num_siblings > 1) { - seq_printf(m, "physical id\t: %d\n", phys_proc_id[n]); + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n])); - seq_printf(m, "core id\t\t: %d\n", cpu_core_id[n]); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } #endif diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index 1d9a4abcdfc..f6dfa9fb675 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -183,7 +183,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac return NOTIFY_OK; } -static struct notifier_block cpuid_class_cpu_notifier = +static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = { .notifier_call = cpuid_class_cpu_callback, }; diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index e6e4506e749..fbdb933251b 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -83,6 +83,12 @@ VM_MASK = 0x00020000 #define resume_kernel restore_nocheck #endif +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + #define SAVE_ALL \ cld; \ pushl %es; \ @@ -211,6 +217,7 @@ ret_from_exception: preempt_stop ret_from_intr: GET_THREAD_INFO(%ebp) +check_userspace: movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al testl $(VM_MASK | 3), %eax @@ -263,7 +270,12 @@ sysenter_past_esp: pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET cs, 0*/ - pushl $SYSENTER_RETURN + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. 
+ */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 @@ -415,7 +427,7 @@ work_notifysig: # deal with pending signals and # vm86-space xorl %edx, %edx call do_notify_resume - jmp resume_userspace + jmp resume_userspace_sig ALIGN work_notifysig_v86: @@ -428,7 +440,7 @@ work_notifysig_v86: movl %eax, %esp xorl %edx, %edx call do_notify_resume - jmp resume_userspace + jmp resume_userspace_sig #endif # perform syscall exit tracing @@ -515,7 +527,7 @@ ENTRY(irq_entries_start) .if vector CFI_ADJUST_CFA_OFFSET -4 .endif -1: pushl $vector-256 +1: pushl $~(vector) CFI_ADJUST_CFA_OFFSET 4 jmp common_interrupt .data @@ -535,7 +547,7 @@ common_interrupt: #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ RING0_INT_FRAME; \ - pushl $nr-256; \ + pushl $~(nr); \ CFI_ADJUST_CFA_OFFSET 4; \ SAVE_ALL; \ movl %esp,%eax; \ diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 061533e0cb5..c703bc7b088 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -53,8 +53,8 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; */ fastcall unsigned int do_IRQ(struct pt_regs *regs) { - /* high bits used in ret_from_ code */ - int irq = regs->orig_eax & 0xff; + /* high bit used in ret_from_ code */ + int irq = ~regs->orig_eax; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; u32 *isp; @@ -100,8 +100,8 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) * softirq checks work in the hardirq context. */ irqctx->tinfo.preempt_count = - irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK | - curctx->tinfo.preempt_count & SOFTIRQ_MASK; + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( " xchgl %%ebx,%%esp \n" diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index 7a328230e54..d022cb8fd72 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -266,7 +266,7 @@ static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long acti return NOTIFY_OK; } -static struct notifier_block msr_class_cpu_notifier = +static struct notifier_block __cpuinitdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c index 321f5fd26e7..9bf590cefc7 100644 --- a/arch/i386/kernel/scx200.c +++ b/arch/i386/kernel/scx200.c @@ -9,6 +9,7 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/mutex.h> #include <linux/pci.h> #include <linux/scx200.h> @@ -45,11 +46,19 @@ static struct pci_driver scx200_pci_driver = { .probe = scx200_probe, }; -static DEFINE_SPINLOCK(scx200_gpio_config_lock); +static DEFINE_MUTEX(scx200_gpio_config_lock); -static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +static void __devinit scx200_init_shadow(void) { int bank; + + /* read the current values driven on the GPIO signals */ + for (bank = 0; bank < 2; ++bank) + scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); +} + +static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ unsigned base; if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || @@ -63,10 +72,7 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_ } scx200_gpio_base = base; - - /* read the current values driven on the GPIO signals */ - for (bank = 0; bank < 2; ++bank) - scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); + scx200_init_shadow(); } else { /* find the base of the Configuration 
Block */ @@ -87,12 +93,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_ return 0; } -u32 scx200_gpio_configure(int index, u32 mask, u32 bits) +u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) { u32 config, new_config; - unsigned long flags; - spin_lock_irqsave(&scx200_gpio_config_lock, flags); + mutex_lock(&scx200_gpio_config_lock); outl(index, scx200_gpio_base + 0x20); config = inl(scx200_gpio_base + 0x24); @@ -100,45 +105,11 @@ u32 scx200_gpio_configure(int index, u32 mask, u32 bits) new_config = (config & mask) | bits; outl(new_config, scx200_gpio_base + 0x24); - spin_unlock_irqrestore(&scx200_gpio_config_lock, flags); + mutex_unlock(&scx200_gpio_config_lock); return config; } -#if 0 -void scx200_gpio_dump(unsigned index) -{ - u32 config = scx200_gpio_configure(index, ~0, 0); - printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config); - - if (config & 1) - printk(" OE"); /* output enabled */ - else - printk(" TS"); /* tristate */ - if (config & 2) - printk(" PP"); /* push pull */ - else - printk(" OD"); /* open drain */ - if (config & 4) - printk(" PUE"); /* pull up enabled */ - else - printk(" PUD"); /* pull up disabled */ - if (config & 8) - printk(" LOCKED"); /* locked */ - if (config & 16) - printk(" LEVEL"); /* level input */ - else - printk(" EDGE"); /* edge input */ - if (config & 32) - printk(" HI"); /* trigger on rising edge */ - else - printk(" LO"); /* trigger on falling edge */ - if (config & 64) - printk(" DEBOUNCE"); /* debounce */ - printk("\n"); -} -#endif /* 0 */ - static int __init scx200_init(void) { printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); @@ -159,10 +130,3 @@ EXPORT_SYMBOL(scx200_gpio_base); EXPORT_SYMBOL(scx200_gpio_shadow); EXPORT_SYMBOL(scx200_gpio_configure); EXPORT_SYMBOL(scx200_cb_base); - -/* - Local variables: - compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules" - c-basic-offset: 8 - End: -*/ diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 5c352c3a9e7..43002cfb40c 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -351,7 +351,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; } - restorer = &__kernel_sigreturn; + restorer = (void *)VDSO_SYM(&__kernel_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -447,7 +447,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up to return from userspace. */ - restorer = &__kernel_rt_sigreturn; + restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index bce5470ecb4..89e7315e539 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -67,12 +67,6 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); #endif -/* Package ID of each logical CPU */ -int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; - -/* Core ID of each logical CPU */ -int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; - /* Last level cache ID of each logical CPU */ int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; @@ -454,10 +448,12 @@ cpumask_t cpu_coregroup_map(int cpu) struct cpuinfo_x86 *c = cpu_data + cpu; /* * For perf, we return last level cache shared map. 
- * TBD: when power saving sched policy is added, we will return - * cpu_core_map when power saving policy is enabled + * And for power savings, we return cpu_core_map */ - return c->llc_shared_map; + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; } /* representing cpus for which sibling maps can be computed */ @@ -473,8 +469,8 @@ set_cpu_sibling_map(int cpu) if (smp_num_siblings > 1) { for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (phys_proc_id[cpu] == phys_proc_id[i] && - cpu_core_id[cpu] == cpu_core_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id && + c[cpu].cpu_core_id == c[i].cpu_core_id) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); @@ -501,7 +497,7 @@ set_cpu_sibling_map(int cpu) cpu_set(i, c[cpu].llc_shared_map); cpu_set(cpu, c[i].llc_shared_map); } - if (phys_proc_id[cpu] == phys_proc_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); /* @@ -1056,6 +1052,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu) struct warm_boot_cpu_info info; struct work_struct task; int apicid, ret; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { @@ -1063,6 +1060,18 @@ static int __cpuinit __smp_prepare_cpu(int cpu) goto exit; } + /* + * the CPU isn't initialized at boot time, allocate gdt table here. + * cpu_init will initialize it + */ + if (!cpu_gdt_descr->address) { + cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); + if (!cpu_gdt_descr->address) + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); + ret = -ENOMEM; + goto exit; + } + info.complete = &done; info.apicid = apicid; info.cpu = cpu; @@ -1340,8 +1349,8 @@ remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; + c[cpu].phys_proc_id = 0; + c[cpu].cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); } diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 0bada1870bd..c60419dee01 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -2,6 +2,8 @@ * linux/arch/i386/kernel/sysenter.c * * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar * * This file contains the needed initializations to support sysenter. */ @@ -13,12 +15,31 @@ #include <linux/gfp.h> #include <linux/string.h> #include <linux/elf.h> +#include <linux/mm.h> +#include <linux/module.h> #include <asm/cpufeature.h> #include <asm/msr.h> #include <asm/pgtable.h> #include <asm/unistd.h> +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? 
+ */ +unsigned int __read_mostly vdso_enabled = 1; + +EXPORT_SYMBOL_GPL(vdso_enabled); + +static int __init vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + + return 1; +} + +__setup("vdso=", vdso_setup); + extern asmlinkage void sysenter_entry(void); void enable_sep_cpu(void) @@ -45,23 +66,122 @@ void enable_sep_cpu(void) */ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; +static void *syscall_page; int __init sysenter_setup(void) { - void *page = (void *)get_zeroed_page(GFP_ATOMIC); + syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); - __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC); +#ifdef CONFIG_COMPAT_VDSO + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); +#else + /* + * In the non-compat case the ELF coredumping code needs the fixmap: + */ + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO); +#endif if (!boot_cpu_has(X86_FEATURE_SEP)) { - memcpy(page, + memcpy(syscall_page, &vsyscall_int80_start, &vsyscall_int80_end - &vsyscall_int80_start); return 0; } - memcpy(page, + memcpy(syscall_page, &vsyscall_sysenter_start, &vsyscall_sysenter_end - &vsyscall_sysenter_start); return 0; } + +static struct page *syscall_nopage(struct vm_area_struct *vma, + unsigned long adr, int *type) +{ + struct page *p = virt_to_page(adr - vma->vm_start + syscall_page); + get_page(p); + return p; +} + +/* Prevent VMA merging */ +static void syscall_vma_close(struct vm_area_struct *vma) +{ +} + +static struct vm_operations_struct syscall_vm_ops = { + .close = syscall_vma_close, + .nopage = syscall_nopage, +}; + +/* Defined in vsyscall-sysenter.S */ +extern void SYSENTER_RETURN; + +/* Setup a VMA at program startup for the vsyscall page */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret; + + down_write(&mm->mmap_sem); + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + ret = -ENOMEM; + goto up_fail; + } + + vma->vm_start = addr; + vma->vm_end = addr + PAGE_SIZE; + /* MAYWRITE to allow gdb to COW and set breakpoints */ + vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE; + vma->vm_flags |= mm->def_flags; + vma->vm_page_prot = protection_map[vma->vm_flags & 7]; + vma->vm_ops = &syscall_vm_ops; + vma->vm_mm = mm; + + ret = insert_vm_struct(mm, vma); + if (ret) + goto free_vma; + + current->mm->context.vdso = (void *)addr; + current_thread_info()->sysenter_return = + (void *)VDSO_SYM(&SYSENTER_RETURN); + mm->total_vm++; +up_fail: + up_write(&mm->mmap_sem); + return ret; + +free_vma: + kmem_cache_free(vm_area_cachep, vma); + return ret; +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ + return NULL; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + return 0; +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index 296355292c7..e2e281d4bcc 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -32,15 +32,8 @@ static struct 
i386_cpu cpu_devices[NR_CPUS]; -int arch_register_cpu(int num){ - struct node *parent = NULL; - -#ifdef CONFIG_NUMA - int node = cpu_to_node(num); - if (node_online(node)) - parent = &node_devices[node].node; -#endif /* CONFIG_NUMA */ - +int arch_register_cpu(int num) +{ /* * CPU0 cannot be offlined due to several * restrictions and assumptions in kernel. This basically @@ -50,21 +43,13 @@ int arch_register_cpu(int num){ if (!num) cpu_devices[num].cpu.no_control = 1; - return register_cpu(&cpu_devices[num].cpu, num, parent); + return register_cpu(&cpu_devices[num].cpu, num); } #ifdef CONFIG_HOTPLUG_CPU void arch_unregister_cpu(int num) { - struct node *parent = NULL; - -#ifdef CONFIG_NUMA - int node = cpu_to_node(num); - if (node_online(node)) - parent = &node_devices[node].node; -#endif /* CONFIG_NUMA */ - - return unregister_cpu(&cpu_devices[num].cpu, parent); + return unregister_cpu(&cpu_devices[num].cpu); } EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); @@ -74,16 +59,13 @@ EXPORT_SYMBOL(arch_unregister_cpu); #ifdef CONFIG_NUMA #include <linux/mmzone.h> -#include <asm/node.h> - -struct i386_node node_devices[MAX_NUMNODES]; static int __init topology_init(void) { int i; for_each_online_node(i) - arch_register_node(i); + register_one_node(i); for_each_present_cpu(i) arch_register_cpu(i); diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S index 3b62baa6a37..1a36d26e15e 100644 --- a/arch/i386/kernel/vsyscall-sysenter.S +++ b/arch/i386/kernel/vsyscall-sysenter.S @@ -42,10 +42,10 @@ __kernel_vsyscall: /* 7: align return point with nop's to make disassembly easier */ .space 7,0x90 - /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */ + /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ jmp .Lenter_kernel /* 16: System call normal return point is here! */ - .globl SYSENTER_RETURN /* Symbol used by entry.S. */ + .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ SYSENTER_RETURN: pop %ebp .Lpop_ebp: diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S index 98699ca6e52..e26975fc68b 100644 --- a/arch/i386/kernel/vsyscall.lds.S +++ b/arch/i386/kernel/vsyscall.lds.S @@ -7,7 +7,7 @@ SECTIONS { - . = VSYSCALL_BASE + SIZEOF_HEADERS; + . = VDSO_PRELINK + SIZEOF_HEADERS; .hash : { *(.hash) } :text .dynsym : { *(.dynsym) } @@ -20,7 +20,7 @@ SECTIONS For the layouts to match, we need to skip more than enough space for the dynamic symbol table et al. If this amount is insufficient, ld -shared will barf. Just increase it here. */ - . = VSYSCALL_BASE + 0x400; + . 
= VDSO_PRELINK + 0x400; .text : { *(.text) } :text =0x90909090 .note : { *(.note.*) } :text :note diff --git a/arch/i386/mach-voyager/setup.c b/arch/i386/mach-voyager/setup.c index 0e225054e22..defc6ebbd56 100644 --- a/arch/i386/mach-voyager/setup.c +++ b/arch/i386/mach-voyager/setup.c @@ -5,10 +5,10 @@ #include <linux/config.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <asm/acpi.h> #include <asm/arch_hooks.h> #include <asm/voyager.h> #include <asm/e820.h> +#include <asm/io.h> #include <asm/setup.h> void __init pre_intr_init_hook(void) @@ -27,8 +27,7 @@ void __init intr_init_hook(void) smp_intr_init(); #endif - if (!acpi_ioapic) - setup_irq(2, &irq2); + setup_irq(2, &irq2); } void __init pre_setup_arch_hook(void) diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index bf19513f0ce..f84b16e007f 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/poison.h> #include <linux/bootmem.h> #include <linux/slab.h> #include <linux/proc_fs.h> @@ -654,7 +655,7 @@ void __init mem_init(void) */ #ifdef CONFIG_MEMORY_HOTPLUG #ifndef CONFIG_NEED_MULTIPLE_NODES -int add_memory(u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdata = &contig_page_data; struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; @@ -753,7 +754,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)addr, 0xcc, PAGE_SIZE); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 0887b34bc59..353a836ed63 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -229,8 +229,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) if (PageHighMem(page)) return; if (!enable) - mutex_debug_check_no_locks_freed(page_address(page), - numpages * PAGE_SIZE); + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); /* the return value is ignored - the calls cannot fail, * large pages are disabled at boot time. 
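The i386 and x86_64 entry-path hunks in this series (and the matching do_IRQ() changes) replace the old "vector - 256" value stored in orig_eax/orig_rax with the bitwise complement of the vector. The stand-alone, user-space sketch below is only an illustration of that encoding, not kernel code: it checks that ~vector recovers losslessly through a single '~' in the handler and that the stored value stays negative, which is presumably what lets the return path keep telling hardware interrupts apart from system-call numbers without masking with 0xff.

        /* encoding-roundtrip.c: illustrative only, not part of the patch */
        #include <assert.h>
        #include <stdio.h>

        int main(void)
        {
                for (int vector = 0; vector < 256; vector++) {
                        long orig_eax = ~vector;  /* what the interrupt stub now pushes */
                        int irq = ~orig_eax;      /* what do_IRQ() now recovers */

                        assert(irq == vector);    /* lossless round trip */
                        assert(orig_eax < 0);     /* still distinct from a syscall number */
                }
                printf("~vector encoding round-trips for all 256 vectors\n");
                return 0;
        }

The x86_64 smp_invalidate_interrupt() hunk later in the series relies on the same inverse, computing the sender as ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START instead of adding 256 back.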
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 18318749884..a56df7bf022 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -374,6 +374,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool y depends on NEED_MULTIPLE_NODES +config HAVE_ARCH_NODEDATA_EXTENSION + def_bool y + depends on NUMA + config IA32_SUPPORT bool "Support for Linux/x86 binaries" help diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index 859fb37ff49..303a9afcf2a 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -959,7 +959,7 @@ remove_palinfo_proc_entries(unsigned int hcpu) } } -static int palinfo_cpu_callback(struct notifier_block *nfb, +static int __cpuinit palinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -978,7 +978,7 @@ static int palinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block palinfo_cpu_notifier = +static struct notifier_block __cpuinitdata palinfo_cpu_notifier = { .notifier_call = palinfo_cpu_callback, .priority = 0, diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index 663a186ad19..9065f0f01ba 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -572,7 +572,7 @@ static struct file_operations salinfo_data_fops = { }; #ifdef CONFIG_HOTPLUG_CPU -static int +static int __devinit salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned int i, cpu = (unsigned long)hcpu; @@ -673,9 +673,7 @@ salinfo_init(void) salinfo_timer.function = &salinfo_timeout; add_timer(&salinfo_timer); -#ifdef CONFIG_HOTPLUG_CPU - register_cpu_notifier(&salinfo_cpu_notifier); -#endif + register_hotcpu_notifier(&salinfo_cpu_notifier); return 0; } diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 879edb51d1e..5511d9c6c70 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -26,19 +26,10 @@ #include <asm/numa.h> #include <asm/cpu.h> -#ifdef CONFIG_NUMA -static struct node *sysfs_nodes; -#endif static struct ia64_cpu *sysfs_cpus; int arch_register_cpu(int num) { - struct node *parent = NULL; - -#ifdef CONFIG_NUMA - parent = &sysfs_nodes[cpu_to_node(num)]; -#endif /* CONFIG_NUMA */ - #if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU) /* * If CPEI cannot be re-targetted, and this is @@ -48,21 +39,14 @@ int arch_register_cpu(int num) sysfs_cpus[num].cpu.no_control = 1; #endif - return register_cpu(&sysfs_cpus[num].cpu, num, parent); + return register_cpu(&sysfs_cpus[num].cpu, num); } #ifdef CONFIG_HOTPLUG_CPU void arch_unregister_cpu(int num) { - struct node *parent = NULL; - -#ifdef CONFIG_NUMA - int node = cpu_to_node(num); - parent = &sysfs_nodes[node]; -#endif /* CONFIG_NUMA */ - - return unregister_cpu(&sysfs_cpus[num].cpu, parent); + return unregister_cpu(&sysfs_cpus[num].cpu); } EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); @@ -74,17 +58,11 @@ static int __init topology_init(void) int i, err = 0; #ifdef CONFIG_NUMA - sysfs_nodes = kzalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL); - if (!sysfs_nodes) { - err = -ENOMEM; - goto out; - } - /* * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? 
*/ for_each_online_node(i) { - if ((err = register_node(&sysfs_nodes[i], i, 0))) + if ((err = register_one_node(i))) goto out; } #endif @@ -426,7 +404,7 @@ static int __cpuinit cache_remove_dev(struct sys_device * sys_dev) * When a cpu is hot-plugged, do a check and initiate * cache kobject if necessary */ -static int cache_cpu_callback(struct notifier_block *nfb, +static int __cpuinit cache_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -444,7 +422,7 @@ static int cache_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block cache_cpu_notifier = +static struct notifier_block __cpuinitdata cache_cpu_notifier = { .notifier_call = cache_cpu_callback }; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index b6bcc9fa360..525b082eb66 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -33,7 +33,6 @@ */ struct early_node_data { struct ia64_node_data *node_data; - pg_data_t *pgdat; unsigned long pernode_addr; unsigned long pernode_size; struct bootmem_data bootmem_data; @@ -46,6 +45,8 @@ struct early_node_data { static struct early_node_data mem_data[MAX_NUMNODES] __initdata; static nodemask_t memory_less_mask __initdata; +static pg_data_t *pgdat_list[MAX_NUMNODES]; + /* * To prevent cache aliasing effects, align per-node structures so that they * start at addresses that are strided by node number. @@ -99,7 +100,7 @@ static int __init build_node_maps(unsigned long start, unsigned long len, * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been * called yet. Note that node 0 will also count all non-existent cpus. */ -static int __init early_nr_cpus_node(int node) +static int __meminit early_nr_cpus_node(int node) { int cpu, n = 0; @@ -114,7 +115,7 @@ static int __init early_nr_cpus_node(int node) * compute_pernodesize - compute size of pernode data * @node: the node id. 
*/ -static unsigned long __init compute_pernodesize(int node) +static unsigned long __meminit compute_pernodesize(int node) { unsigned long pernodesize = 0, cpus; @@ -175,13 +176,13 @@ static void __init fill_pernode(int node, unsigned long pernode, pernode += PERCPU_PAGE_SIZE * cpus; pernode += node * L1_CACHE_BYTES; - mem_data[node].pgdat = __va(pernode); + pgdat_list[node] = __va(pernode); pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); mem_data[node].node_data = __va(pernode); pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); - mem_data[node].pgdat->bdata = bdp; + pgdat_list[node]->bdata = bdp; pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); cpu_data = per_cpu_node_setup(cpu_data, node); @@ -268,7 +269,7 @@ static int __init find_pernode_space(unsigned long start, unsigned long len, static int __init free_node_bootmem(unsigned long start, unsigned long len, int node) { - free_bootmem_node(mem_data[node].pgdat, start, len); + free_bootmem_node(pgdat_list[node], start, len); return 0; } @@ -287,7 +288,7 @@ static void __init reserve_pernode_space(void) int node; for_each_online_node(node) { - pg_data_t *pdp = mem_data[node].pgdat; + pg_data_t *pdp = pgdat_list[node]; if (node_isset(node, memory_less_mask)) continue; @@ -307,6 +308,17 @@ static void __init reserve_pernode_space(void) } } +static void __meminit scatter_node_data(void) +{ + pg_data_t **dst; + int node; + + for_each_online_node(node) { + dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs; + memcpy(dst, pgdat_list, sizeof(pgdat_list)); + } +} + /** * initialize_pernode_data - fixup per-cpu & per-node pointers * @@ -317,17 +329,10 @@ static void __init reserve_pernode_space(void) */ static void __init initialize_pernode_data(void) { - pg_data_t *pgdat_list[MAX_NUMNODES]; int cpu, node; - for_each_online_node(node) - pgdat_list[node] = mem_data[node].pgdat; + scatter_node_data(); - /* Copy the pg_data_t list to each node and init the node field */ - for_each_online_node(node) { - memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list, - sizeof(pgdat_list)); - } #ifdef CONFIG_SMP /* Set the node_data pointer for each per-cpu struct */ for (cpu = 0; cpu < NR_CPUS; cpu++) { @@ -372,7 +377,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize) if (bestnode == -1) bestnode = anynode; - ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, pernodesize, + ptr = __alloc_bootmem_node(pgdat_list[bestnode], pernodesize, PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); return ptr; @@ -476,7 +481,7 @@ void __init find_memory(void) pernodesize = mem_data[node].pernode_size; map = pernode + pernodesize; - init_bootmem_node(mem_data[node].pgdat, + init_bootmem_node(pgdat_list[node], map>>PAGE_SHIFT, bdp->node_boot_start>>PAGE_SHIFT, bdp->node_low_pfn); @@ -786,3 +791,21 @@ void __init paging_init(void) zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } + +pg_data_t *arch_alloc_nodedata(int nid) +{ + unsigned long size = compute_pernodesize(nid); + + return kzalloc(size, GFP_KERNEL); +} + +void arch_free_nodedata(pg_data_t *pgdat) +{ + kfree(pgdat); +} + +void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat) +{ + pgdat_list[update_node] = update_pgdat; + scatter_node_data(); +} diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 11f08001f8c..38306e98f04 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -652,7 +652,7 @@ void online_page(struct page *page) num_physpages++; } -int add_memory(u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size) { 
pg_data_t *pgdat; struct zone *zone; @@ -660,7 +660,7 @@ int add_memory(u64 start, u64 size) unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(0); + pgdat = NODE_DATA(nid); zone = pgdat->node_zones + ZONE_NORMAL; ret = __add_pages(zone, start_pfn, nr_pages); @@ -671,7 +671,6 @@ int add_memory(u64 start, u64 size) return ret; } -EXPORT_SYMBOL_GPL(add_memory); int remove_memory(u64 start, u64 size) { diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index dc8e2b69671..677c6c0fd66 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -27,7 +27,7 @@ static void unregister_intr_pda(struct sn_irq_info *sn_irq_info); int sn_force_interrupt_flag = 1; extern int sn_ioif_inited; struct list_head **sn_irq_lh; -static spinlock_t sn_irq_info_lock = SPIN_LOCK_UNLOCKED; /* non-IRQ lock */ +static DEFINE_SPINLOCK(sn_irq_info_lock); /* non-IRQ lock */ u64 sn_intr_alloc(nasid_t local_nasid, int local_widget, struct sn_irq_info *sn_irq_info, diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c index 3cd3c2988a4..1ff483c8a4c 100644 --- a/arch/m32r/kernel/setup.c +++ b/arch/m32r/kernel/setup.c @@ -275,7 +275,7 @@ static int __init topology_init(void) int i; for_each_present_cpu(i) - register_cpu(&cpu_devices[i], i, NULL); + register_cpu(&cpu_devices[i], i); return 0; } diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 298f82fe844..9096a5ea422 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -446,7 +446,7 @@ static int __init topology_init(void) int ret; for_each_present_cpu(cpu) { - ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu, NULL); + ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu); if (ret) printk(KERN_WARNING "topology_init: register_cpu %d " "failed (%d)\n", cpu, ret); diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 2e8e52c135e..70cf09afdf5 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -367,7 +367,7 @@ void mipsmt_prepare_cpus(void) dvpe(); dmt(); - freeIPIq.lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&freeIPIq.lock); /* * We probably don't have as many VPEs as we do SMP "CPUs", @@ -375,7 +375,7 @@ void mipsmt_prepare_cpus(void) */ for (i=0; i<NR_CPUS; i++) { IPIQ[i].head = IPIQ[i].tail = NULL; - IPIQ[i].lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&IPIQ[i].lock); IPIQ[i].depth = 0; ipi_timer_latch[i] = 0; } diff --git a/arch/parisc/kernel/topology.c b/arch/parisc/kernel/topology.c index 3ba040050e4..068b20d822e 100644 --- a/arch/parisc/kernel/topology.c +++ b/arch/parisc/kernel/topology.c @@ -26,11 +26,10 @@ static struct cpu cpu_devices[NR_CPUS] __read_mostly; static int __init topology_init(void) { - struct node *parent = NULL; int num; for_each_present_cpu(num) { - register_cpu(&cpu_devices[num], num, parent); + register_cpu(&cpu_devices[num], num); } return 0; } diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index e5a44812441..0932a62a1c9 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -215,7 +215,7 @@ int __init ppc_init(void) /* register CPU devices */ for_each_possible_cpu(i) - register_cpu(&cpu_devices[i], i, NULL); + register_cpu(&cpu_devices[i], i); /* call platform init */ if (ppc_md.init != NULL) { diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 5bc2585c803..4662b580efa 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -279,7 +279,7 @@ static void unregister_cpu_online(unsigned int cpu) } #endif /* 
CONFIG_HOTPLUG_CPU */ -static int sysfs_cpu_notify(struct notifier_block *self, +static int __devinit sysfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; @@ -297,30 +297,19 @@ static int sysfs_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block sysfs_cpu_nb = { +static struct notifier_block __devinitdata sysfs_cpu_nb = { .notifier_call = sysfs_cpu_notify, }; /* NUMA stuff */ #ifdef CONFIG_NUMA -static struct node node_devices[MAX_NUMNODES]; - static void register_nodes(void) { int i; - for (i = 0; i < MAX_NUMNODES; i++) { - if (node_online(i)) { - int p_node = parent_node(i); - struct node *parent = NULL; - - if (p_node != i) - parent = &node_devices[p_node]; - - register_node(&node_devices[i], i, parent); - } - } + for (i = 0; i < MAX_NUMNODES; i++) + register_one_node(i); } int sysfs_add_device_to_node(struct sys_device *dev, int nid) @@ -359,23 +348,13 @@ static SYSDEV_ATTR(physical_id, 0444, show_physical_id, NULL); static int __init topology_init(void) { int cpu; - struct node *parent = NULL; register_nodes(); - register_cpu_notifier(&sysfs_cpu_nb); for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); -#ifdef CONFIG_NUMA - /* The node to which a cpu belongs can't be known - * until the cpu is made present. - */ - parent = NULL; - if (cpu_present(cpu)) - parent = &node_devices[cpu_to_node(cpu)]; -#endif /* * For now, we just see if the system supports making * the RTAS calls for CPU hotplug. But, there may be a @@ -387,7 +366,7 @@ static int __init topology_init(void) c->no_control = 1; if (cpu_online(cpu) || (c->no_control == 0)) { - register_cpu(c, cpu, parent); + register_cpu(c, cpu); sysdev_create_file(&c->sysdev, &attr_physical_id); } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 9e30f968c18..d454caada26 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -41,6 +41,7 @@ #include <linux/idr.h> #include <linux/nodemask.h> #include <linux/module.h> +#include <linux/poison.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -90,7 +91,7 @@ void free_initmem(void) addr = (unsigned long)__init_begin; for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { - memset((void *)addr, 0xcc, PAGE_SIZE); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); free_page(addr); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 69f3b9a20be..089d939a0b3 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -114,15 +114,20 @@ void online_page(struct page *page) num_physpages++; } -int __devinit add_memory(u64 start, u64 size) +#ifdef CONFIG_NUMA +int memory_add_physaddr_to_nid(u64 start) +{ + return hot_add_scn_to_nid(start); +} +#endif + +int __devinit arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdata; struct zone *zone; - int nid; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - nid = hot_add_scn_to_nid(start); pgdata = NODE_DATA(nid); start = (unsigned long)__va(start); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index aa98cb3b59d..fbe23933f73 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -334,7 +334,7 @@ out: return nid; } -static int cpu_numa_callback(struct notifier_block *nfb, +static int __cpuinit cpu_numa_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -609,14 +609,15 @@ 
static void __init *careful_allocation(int nid, unsigned long size, return (void *)ret; } +static struct notifier_block __cpuinitdata ppc64_numa_nb = { + .notifier_call = cpu_numa_callback, + .priority = 1 /* Must run before sched domains notifier. */ +}; + void __init do_init_bootmem(void) { int nid; unsigned int i; - static struct notifier_block ppc64_numa_nb = { - .notifier_call = cpu_numa_callback, - .priority = 1 /* Must run before sched domains notifier. */ - }; min_low_pfn = 0; max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c index 3068b429b03..a656d810a44 100644 --- a/arch/powerpc/platforms/cell/spufs/switch.c +++ b/arch/powerpc/platforms/cell/spufs/switch.c @@ -2203,7 +2203,7 @@ void spu_init_csa(struct spu_state *csa) memset(lscsa, 0, sizeof(struct spu_lscsa)); csa->lscsa = lscsa; - csa->register_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&csa->register_lock); /* Set LS pages reserved to allow for user-space mapping. */ for (p = lscsa->ls; p < lscsa->ls + LS_SIZE; p += PAGE_SIZE) diff --git a/arch/powerpc/platforms/powermac/pfunc_core.c b/arch/powerpc/platforms/powermac/pfunc_core.c index 047f954a89e..93e7505debc 100644 --- a/arch/powerpc/platforms/powermac/pfunc_core.c +++ b/arch/powerpc/platforms/powermac/pfunc_core.c @@ -546,7 +546,7 @@ struct pmf_device { }; static LIST_HEAD(pmf_devices); -static spinlock_t pmf_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(pmf_lock); static DEFINE_MUTEX(pmf_irq_mutex); static void pmf_release_device(struct kref *kref) diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c index 8f2d12935b9..45ccc687e57 100644 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -35,7 +35,7 @@ */ /* EEH event workqueue setup. 
*/ -static spinlock_t eeh_eventlist_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(eeh_eventlist_lock); LIST_HEAD(eeh_eventlist); static void eeh_thread_launcher(void *); DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); diff --git a/arch/powerpc/sysdev/mmio_nvram.c b/arch/powerpc/sysdev/mmio_nvram.c index 74e0d31a355..615350d46b5 100644 --- a/arch/powerpc/sysdev/mmio_nvram.c +++ b/arch/powerpc/sysdev/mmio_nvram.c @@ -32,7 +32,7 @@ static void __iomem *mmio_nvram_start; static long mmio_nvram_len; -static spinlock_t mmio_nvram_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(mmio_nvram_lock); static ssize_t mmio_nvram_read(char *buf, size_t count, loff_t *index) { diff --git a/arch/ppc/kernel/setup.c b/arch/ppc/kernel/setup.c index 1f79e84ab46..4b4607d89bf 100644 --- a/arch/ppc/kernel/setup.c +++ b/arch/ppc/kernel/setup.c @@ -475,7 +475,7 @@ int __init ppc_init(void) /* register CPU devices */ for_each_possible_cpu(i) - register_cpu(&cpu_devices[i], i, NULL); + register_cpu(&cpu_devices[i], i); /* call platform init */ if (ppc_md.init != NULL) { diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index 9a22434a580..54d35c13090 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -652,7 +652,7 @@ appldata_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block appldata_nb = { +static struct notifier_block __devinitdata appldata_nb = { .notifier_call = appldata_cpu_notify, }; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 343120c9223..8e03219eea7 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -869,7 +869,7 @@ static int __init topology_init(void) int ret; for_each_possible_cpu(cpu) { - ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu, NULL); + ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu); if (ret) printk(KERN_WARNING "topology_init: register_cpu %d " "failed (%d)\n", cpu, ret); diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index bb229ef030f..9af22116c9a 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -402,7 +402,7 @@ static int __init topology_init(void) int cpu_id; for_each_possible_cpu(cpu_id) - register_cpu(&cpu[cpu_id], cpu_id, NULL); + register_cpu(&cpu[cpu_id], cpu_id); return 0; } diff --git a/arch/sh64/kernel/setup.c b/arch/sh64/kernel/setup.c index d2711c9c9d1..da98d8dbcf9 100644 --- a/arch/sh64/kernel/setup.c +++ b/arch/sh64/kernel/setup.c @@ -309,7 +309,7 @@ static struct cpu cpu[1]; static int __init topology_init(void) { - return register_cpu(cpu, 0, NULL); + return register_cpu(cpu, 0); } subsys_initcall(topology_init); diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c index a6a7d816834..116d9632002 100644 --- a/arch/sparc64/kernel/setup.c +++ b/arch/sparc64/kernel/setup.c @@ -537,7 +537,7 @@ static int __init topology_init(void) for_each_possible_cpu(i) { struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL); if (p) { - register_cpu(p, i, NULL); + register_cpu(p, i); err = 0; } } diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 5c2bcf354ce..cb75a27adb5 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -18,6 +18,7 @@ #include <linux/initrd.h> #include <linux/swap.h> #include <linux/pagemap.h> +#include <linux/poison.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kprobes.h> @@ -1520,7 +1521,7 @@ void free_initmem(void) page = (addr + ((unsigned long) __va(kern_base)) - ((unsigned long) KERNBASE)); - 
memset((void *)addr, 0xcc, PAGE_SIZE); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); p = virt_to_page(page); ClearPageReserved(p); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 7290e72b9a3..22cac4487b5 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -588,7 +588,7 @@ END(common_interrupt) */ .macro apicinterrupt num,func INTR_FRAME - pushq $\num-256 + pushq $~(\num) CFI_ADJUST_CFA_OFFSET 8 interrupt \func jmp ret_from_intr diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 59518d4d435..3be0a7e4bf0 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -115,8 +115,8 @@ skip: */ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { - /* high bits used in ret_from_ code */ - unsigned irq = regs->orig_rax & 0xff; + /* high bit used in ret_from_ code */ + unsigned irq = ~regs->orig_rax; exit_idle(); irq_enter(); diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index acd5816b1a6..88845674c66 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -629,7 +629,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) #endif /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int +static __cpuinit int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -647,7 +647,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block mce_cpu_notifier = { +static struct notifier_block __cpuinitdata mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index acee4bc3f6f..5a1c0a3bf87 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -135,10 +135,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) cpu = smp_processor_id(); /* - * orig_rax contains the interrupt vector - 256. + * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. */ - sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START; + sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; f = &per_cpu(flush_state, sender); if (!cpu_isset(cpu, f->flush_cpumask)) diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 4e9755179ec..540c0ccbccc 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -455,10 +455,12 @@ cpumask_t cpu_coregroup_map(int cpu) struct cpuinfo_x86 *c = cpu_data + cpu; /* * For perf, we return last level cache shared map. 
- * TBD: when power saving sched policy is added, we will return - * cpu_core_map when power saving policy is enabled + * And for power savings, we return cpu_core_map */ - return c->llc_shared_map; + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; } /* representing cpus for which sibling maps can be computed */ diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 02add1d1dfa..95bd232ff0c 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -23,6 +23,7 @@ #include <linux/bootmem.h> #include <linux/proc_fs.h> #include <linux/pci.h> +#include <linux/poison.h> #include <linux/dma-mapping.h> #include <linux/module.h> #include <linux/memory_hotplug.h> @@ -506,8 +507,6 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) /* * Memory hotplug specific functions */ -#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE) - void online_page(struct page *page) { ClearPageReserved(page); @@ -517,31 +516,17 @@ void online_page(struct page *page) num_physpages++; } -#ifndef CONFIG_MEMORY_HOTPLUG +#ifdef CONFIG_MEMORY_HOTPLUG /* - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, - * just online the pages. + * XXX: memory_add_physaddr_to_nid() is to find node id from physical address + * via probe interface of sysfs. If acpi notifies hot-add event, then it + * can tell node id by searching dsdt. But, probe interface doesn't have + * node id. So, return 0 as node id at this time. */ -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +#ifdef CONFIG_NUMA +int memory_add_physaddr_to_nid(u64 start) { - int err = -EIO; - unsigned long pfn; - unsigned long total = 0, mem = 0; - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (pfn_valid(pfn)) { - online_page(pfn_to_page(pfn)); - err = 0; - mem++; - } - total++; - } - if (!err) { - z->spanned_pages += total; - z->present_pages += mem; - z->zone_pgdat->node_spanned_pages += total; - z->zone_pgdat->node_present_pages += mem; - } - return err; + return 0; } #endif @@ -549,9 +534,9 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ -int add_memory(u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size) { - struct pglist_data *pgdat = NODE_DATA(0); + struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -568,7 +553,7 @@ error: printk("%s: Problem encountered in __add_pages!\n", __func__); return ret; } -EXPORT_SYMBOL_GPL(add_memory); +EXPORT_SYMBOL_GPL(arch_add_memory); int remove_memory(u64 start, u64 size) { @@ -576,7 +561,33 @@ int remove_memory(u64 start, u64 size) } EXPORT_SYMBOL_GPL(remove_memory); -#endif +#else /* CONFIG_MEMORY_HOTPLUG */ +/* + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, + * just online the pages. 
+ */ +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +{ + int err = -EIO; + unsigned long pfn; + unsigned long total = 0, mem = 0; + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { + if (pfn_valid(pfn)) { + online_page(pfn_to_page(pfn)); + err = 0; + mem++; + } + total++; + } + if (!err) { + z->spanned_pages += total; + z->present_pages += mem; + z->zone_pgdat->node_spanned_pages += total; + z->zone_pgdat->node_present_pages += mem; + } + return err; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -650,7 +661,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } @@ -658,7 +670,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { - memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); + memset(__initdata_begin, POISON_FREE_INITDATA, + __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 937d81f62f4..fe14909f45e 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -29,7 +29,7 @@ extern volatile unsigned long wall_jiffies; -spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 225d64d73f0..27e409089a7 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -461,7 +461,7 @@ void show_code(unsigned int *pc) } } -spinlock_t die_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * regs, long err) { diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index c04422a502d..eee03a3876a 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3403,7 +3403,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action, } -static struct notifier_block blk_cpu_notifier = { +static struct notifier_block __devinitdata blk_cpu_notifier = { .notifier_call = blk_cpu_notify, }; @@ -3541,9 +3541,7 @@ int __init blk_dev_init(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); -#ifdef CONFIG_HOTPLUG_CPU - register_cpu_notifier(&blk_cpu_notifier); -#endif + register_hotcpu_notifier(&blk_cpu_notifier); blk_max_low_pfn = max_low_pfn; blk_max_pfn = max_pfn; diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 94b8d820c51..610d2cc02cf 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -328,7 +328,7 @@ config ACPI_CONTAINER config ACPI_HOTPLUG_MEMORY tristate "Memory Hotplug" depends on ACPI - depends on MEMORY_HOTPLUG || X86_64 + depends on MEMORY_HOTPLUG default n help This driver adds supports for ACPI Memory Hotplug. 
This driver diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index e0a95ba7237..1012284ff4f 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -57,6 +57,7 @@ MODULE_LICENSE("GPL"); static int acpi_memory_device_add(struct acpi_device *device); static int acpi_memory_device_remove(struct acpi_device *device, int type); +static int acpi_memory_device_start(struct acpi_device *device); static struct acpi_driver acpi_memory_device_driver = { .name = ACPI_MEMORY_DEVICE_DRIVER_NAME, @@ -65,48 +66,79 @@ static struct acpi_driver acpi_memory_device_driver = { .ops = { .add = acpi_memory_device_add, .remove = acpi_memory_device_remove, + .start = acpi_memory_device_start, }, }; +struct acpi_memory_info { + struct list_head list; + u64 start_addr; /* Memory Range start physical addr */ + u64 length; /* Memory Range length */ + unsigned short caching; /* memory cache attribute */ + unsigned short write_protect; /* memory read/write attribute */ + unsigned int enabled:1; +}; + struct acpi_memory_device { acpi_handle handle; unsigned int state; /* State of the memory device */ - unsigned short caching; /* memory cache attribute */ - unsigned short write_protect; /* memory read/write attribute */ - u64 start_addr; /* Memory Range start physical addr */ - u64 length; /* Memory Range length */ + struct list_head res_list; }; +static acpi_status +acpi_memory_get_resource(struct acpi_resource *resource, void *context) +{ + struct acpi_memory_device *mem_device = context; + struct acpi_resource_address64 address64; + struct acpi_memory_info *info, *new; + acpi_status status; + + status = acpi_resource_to_address64(resource, &address64); + if (ACPI_FAILURE(status) || + (address64.resource_type != ACPI_MEMORY_RANGE)) + return AE_OK; + + list_for_each_entry(info, &mem_device->res_list, list) { + /* Can we combine the resource range information? 
*/ + if ((info->caching == address64.info.mem.caching) && + (info->write_protect == address64.info.mem.write_protect) && + (info->start_addr + info->length == address64.minimum)) { + info->length += address64.address_length; + return AE_OK; + } + } + + new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); + if (!new) + return AE_ERROR; + + INIT_LIST_HEAD(&new->list); + new->caching = address64.info.mem.caching; + new->write_protect = address64.info.mem.write_protect; + new->start_addr = address64.minimum; + new->length = address64.address_length; + list_add_tail(&new->list, &mem_device->res_list); + + return AE_OK; +} + static int acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) { acpi_status status; - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - struct acpi_resource *resource = NULL; - struct acpi_resource_address64 address64; + struct acpi_memory_info *info, *n; ACPI_FUNCTION_TRACE("acpi_memory_get_device_resources"); - /* Get the range from the _CRS */ - status = acpi_get_current_resources(mem_device->handle, &buffer); - if (ACPI_FAILURE(status)) - return_VALUE(-EINVAL); - - resource = (struct acpi_resource *)buffer.pointer; - status = acpi_resource_to_address64(resource, &address64); - if (ACPI_SUCCESS(status)) { - if (address64.resource_type == ACPI_MEMORY_RANGE) { - /* Populate the structure */ - mem_device->caching = address64.info.mem.caching; - mem_device->write_protect = - address64.info.mem.write_protect; - mem_device->start_addr = address64.minimum; - mem_device->length = address64.address_length; - } + status = acpi_walk_resources(mem_device->handle, METHOD_NAME__CRS, + acpi_memory_get_resource, mem_device); + if (ACPI_FAILURE(status)) { + list_for_each_entry_safe(info, n, &mem_device->res_list, list) + kfree(info); + return -EINVAL; } - acpi_os_free(buffer.pointer); - return_VALUE(0); + return 0; } static int @@ -181,7 +213,9 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device) static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) { - int result; + int result, num_enabled = 0; + struct acpi_memory_info *info; + int node; ACPI_FUNCTION_TRACE("acpi_memory_enable_device"); @@ -194,15 +228,35 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) return result; } + node = acpi_get_node(mem_device->handle); /* * Tell the VM there is more memory here... * Note: Assume that this function returns zero on success + * We don't have memory-hot-add rollback function,now. + * (i.e. memory-hot-remove function) */ - result = add_memory(mem_device->start_addr, mem_device->length); - if (result) { + list_for_each_entry(info, &mem_device->res_list, list) { + u64 start_pfn, end_pfn; + + start_pfn = info->start_addr >> PAGE_SHIFT; + end_pfn = (info->start_addr + info->length - 1) >> PAGE_SHIFT; + + if (pfn_valid(start_pfn) || pfn_valid(end_pfn)) { + /* already enabled. 
try next area */ + num_enabled++; + continue; + } + + result = add_memory(node, info->start_addr, info->length); + if (result) + continue; + info->enabled = 1; + num_enabled++; + } + if (!num_enabled) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n")); mem_device->state = MEMORY_INVALID_STATE; - return result; + return -EINVAL; } return result; @@ -246,8 +300,7 @@ static int acpi_memory_powerdown_device(struct acpi_memory_device *mem_device) static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) { int result; - u64 start = mem_device->start_addr; - u64 len = mem_device->length; + struct acpi_memory_info *info, *n; ACPI_FUNCTION_TRACE("acpi_memory_disable_device"); @@ -255,10 +308,13 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) * Ask the VM to offline this memory range. * Note: Assume that this function returns zero on success */ - result = remove_memory(start, len); - if (result) { - ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n")); - return_VALUE(result); + list_for_each_entry_safe(info, n, &mem_device->res_list, list) { + if (info->enabled) { + result = remove_memory(info->start_addr, info->length); + if (result) + return result; + } + kfree(info); } /* Power-off and eject the device */ @@ -356,6 +412,7 @@ static int acpi_memory_device_add(struct acpi_device *device) return_VALUE(-ENOMEM); memset(mem_device, 0, sizeof(struct acpi_memory_device)); + INIT_LIST_HEAD(&mem_device->res_list); mem_device->handle = device->handle; sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); @@ -391,6 +448,25 @@ static int acpi_memory_device_remove(struct acpi_device *device, int type) return_VALUE(0); } +static int acpi_memory_device_start (struct acpi_device *device) +{ + struct acpi_memory_device *mem_device; + int result = 0; + + ACPI_FUNCTION_TRACE("acpi_memory_device_start"); + + mem_device = acpi_driver_data(device); + + if (!acpi_memory_check_device(mem_device)) { + /* call add_memory func */ + result = acpi_memory_enable_device(mem_device); + if (result) + ACPI_DEBUG_PRINT((ACPI_DB_ERROR, + "Error in acpi_memory_enable_device\n")); + } + return_VALUE(result); +} + /* * Helper function to check for memory device */ diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index e2c1a16078c..13d6d5bdea2 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -254,5 +254,18 @@ int acpi_get_pxm(acpi_handle h) } while (ACPI_SUCCESS(status)); return -1; } - EXPORT_SYMBOL(acpi_get_pxm); + +int acpi_get_node(acpi_handle *handle) +{ + int pxm, node = -1; + + ACPI_FUNCTION_TRACE("acpi_get_node"); + + pxm = acpi_get_pxm(handle); + if (pxm >= 0) + node = acpi_map_pxm_to_node(pxm); + + return_VALUE(node); +} +EXPORT_SYMBOL(acpi_get_node); diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index f2eeaf9dc56..1bca86edf57 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -33,6 +33,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/pci.h> +#include <linux/poison.h> #include <linux/errno.h> #include <linux/atm.h> #include <linux/atmdev.h> @@ -754,7 +755,7 @@ static void process_txdone_queue (struct fs_dev *dev, struct queue *q) fs_kfree_skb (skb); fs_dprintk (FS_DEBUG_ALLOC, "Free trans-d: %p\n", td); - memset (td, 0x12, sizeof (struct FS_BPENTRY)); + memset (td, ATM_POISON_FREE, sizeof(struct FS_BPENTRY)); kfree (td); break; default: diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 
dd712b24ec9..4bef76a2f3f 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -8,6 +8,7 @@ #include <linux/cpu.h> #include <linux/topology.h> #include <linux/device.h> +#include <linux/node.h> #include "base.h" @@ -57,13 +58,12 @@ static void __devinit register_cpu_control(struct cpu *cpu) { sysdev_create_file(&cpu->sysdev, &attr_online); } -void unregister_cpu(struct cpu *cpu, struct node *root) +void unregister_cpu(struct cpu *cpu) { int logical_cpu = cpu->sysdev.id; - if (root) - sysfs_remove_link(&root->sysdev.kobj, - kobject_name(&cpu->sysdev.kobj)); + unregister_cpu_under_node(logical_cpu, cpu_to_node(logical_cpu)); + sysdev_remove_file(&cpu->sysdev, &attr_online); sysdev_unregister(&cpu->sysdev); @@ -109,23 +109,21 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL); * * Initialize and register the CPU device. */ -int __devinit register_cpu(struct cpu *cpu, int num, struct node *root) +int __devinit register_cpu(struct cpu *cpu, int num) { int error; - cpu->node_id = cpu_to_node(num); cpu->sysdev.id = num; cpu->sysdev.cls = &cpu_sysdev_class; error = sysdev_register(&cpu->sysdev); - if (!error && root) - error = sysfs_create_link(&root->sysdev.kobj, - &cpu->sysdev.kobj, - kobject_name(&cpu->sysdev.kobj)); + if (!error && !cpu->no_control) register_cpu_control(cpu); if (!error) cpu_sys_devices[num] = &cpu->sysdev; + if (!error) + register_cpu_under_node(num, cpu_to_node(num)); #ifdef CONFIG_KEXEC if (!error) @@ -145,5 +143,13 @@ EXPORT_SYMBOL_GPL(get_cpu_sysdev); int __init cpu_dev_init(void) { - return sysdev_class_register(&cpu_sysdev_class); + int err; + + err = sysdev_class_register(&cpu_sysdev_class); +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + if (!err) + err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class); +#endif + + return err; } diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c index e2f64f91ed0..33c5cce1560 100644 --- a/drivers/base/dmapool.c +++ b/drivers/base/dmapool.c @@ -7,6 +7,7 @@ #include <linux/dmapool.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/poison.h> /* * Pool allocator ... 
wraps the dma_alloc_coherent page allocator, so @@ -35,8 +36,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ }; #define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000) -#define POOL_POISON_FREED 0xa7 /* !inuse */ -#define POOL_POISON_ALLOCATED 0xa9 /* !initted */ static DECLARE_MUTEX (pools_lock); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index dd547af4681..c6b7d9c4b65 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -306,11 +306,13 @@ static ssize_t memory_probe_store(struct class *class, const char *buf, size_t count) { u64 phys_addr; + int nid; int ret; phys_addr = simple_strtoull(buf, NULL, 0); - ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); + nid = memory_add_physaddr_to_nid(phys_addr); + ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); if (ret) count = ret; diff --git a/drivers/base/node.c b/drivers/base/node.c index c80c3aeed00..eae2bdc183b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -11,6 +11,7 @@ #include <linux/cpumask.h> #include <linux/topology.h> #include <linux/nodemask.h> +#include <linux/cpu.h> static struct sysdev_class node_class = { set_kset_name("node"), @@ -190,6 +191,66 @@ void unregister_node(struct node *node) sysdev_unregister(&node->sysdev); } +struct node node_devices[MAX_NUMNODES]; + +/* + * register cpu under node + */ +int register_cpu_under_node(unsigned int cpu, unsigned int nid) +{ + if (node_online(nid)) { + struct sys_device *obj = get_cpu_sysdev(cpu); + if (!obj) + return 0; + return sysfs_create_link(&node_devices[nid].sysdev.kobj, + &obj->kobj, + kobject_name(&obj->kobj)); + } + + return 0; +} + +int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) +{ + if (node_online(nid)) { + struct sys_device *obj = get_cpu_sysdev(cpu); + if (obj) + sysfs_remove_link(&node_devices[nid].sysdev.kobj, + kobject_name(&obj->kobj)); + } + return 0; +} + +int register_one_node(int nid) +{ + int error = 0; + int cpu; + + if (node_online(nid)) { + int p_node = parent_node(nid); + struct node *parent = NULL; + + if (p_node != nid) + parent = &node_devices[p_node]; + + error = register_node(&node_devices[nid], nid, parent); + + /* link cpu under this node */ + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == nid) + register_cpu_under_node(cpu, nid); + } + } + + return error; + +} + +void unregister_one_node(int nid) +{ + unregister_node(&node_devices[nid]); +} + static int __init register_node_type(void) { return sysdev_class_register(&node_class); diff --git a/drivers/base/topology.c b/drivers/base/topology.c index 8c52421cbc5..c2d62163238 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -107,7 +107,7 @@ static int __cpuinit topology_remove_dev(struct sys_device * sys_dev) return 0; } -static int topology_cpu_callback(struct notifier_block *nfb, +static int __cpuinit topology_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -125,7 +125,7 @@ static int topology_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block topology_cpu_notifier = +static struct notifier_block __cpuinitdata topology_cpu_notifier = { .notifier_call = topology_cpu_callback, }; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 3610c572955..410d70cb76f 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -939,12 +939,35 @@ config MWAVE config SCx200_GPIO tristate "NatSemi SCx200 GPIO Support" depends on SCx200 + select NSC_GPIO 
help Give userspace access to the GPIO pins on the National Semiconductor SCx200 processors. If compiled as a module, it will be called scx200_gpio. +config PC8736x_GPIO + tristate "NatSemi PC8736x GPIO Support" + depends on X86 + default SCx200_GPIO # mostly N + select NSC_GPIO # needed for support routines + help + Give userspace access to the GPIO pins on the National + Semiconductor PC-8736x (x=[03456]) SuperIO chip. The chip + has multiple functional units, including several managed by + the hwmon/pc87360 driver. Tested with the PC-87366. + + If compiled as a module, it will be called pc8736x_gpio. + +config NSC_GPIO + tristate "NatSemi Base GPIO Support" + # selected by SCx200_GPIO and PC8736x_GPIO + # what about 2 selectors differing: m != y + help + Common support used (and needed) by the scx200_gpio and + pc8736x_gpio drivers. If those drivers are built as + modules, this one will be too, and will be named nsc_gpio. + config CS5535_GPIO tristate "AMD CS5535/CS5536 GPIO (Geode Companion Device)" depends on X86_32 diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 524105597ea..6e0f4469d8b 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -82,6 +82,8 @@ obj-$(CONFIG_PPDEV) += ppdev.o obj-$(CONFIG_NWBUTTON) += nwbutton.o obj-$(CONFIG_NWFLASH) += nwflash.o obj-$(CONFIG_SCx200_GPIO) += scx200_gpio.o +obj-$(CONFIG_PC8736x_GPIO) += pc8736x_gpio.o +obj-$(CONFIG_NSC_GPIO) += nsc_gpio.o obj-$(CONFIG_CS5535_GPIO) += cs5535_gpio.o obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu.o obj-$(CONFIG_TANBAC_TB0219) += tb0219.o diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index cfa7922cb43..d73be4c2db8 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -329,9 +329,8 @@ static int __devinit agp_sgi_init(void) static void __devexit agp_sgi_cleanup(void) { - if (sgi_tioca_agp_bridges) - kfree(sgi_tioca_agp_bridges); - sgi_tioca_agp_bridges=NULL; + kfree(sgi_tioca_agp_bridges); + sgi_tioca_agp_bridges = NULL; } module_init(agp_sgi_init); diff --git a/drivers/char/drm/drm_memory_debug.h b/drivers/char/drm/drm_memory_debug.h index 6543b9a14c4..d117cc99719 100644 --- a/drivers/char/drm/drm_memory_debug.h +++ b/drivers/char/drm/drm_memory_debug.h @@ -43,7 +43,7 @@ typedef struct drm_mem_stats { unsigned long bytes_freed; } drm_mem_stats_t; -static spinlock_t drm_mem_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(drm_mem_lock); static unsigned long drm_ram_available = 0; /* In pages */ static unsigned long drm_ram_used = 0; static drm_mem_stats_t drm_mem_stats[] = diff --git a/drivers/char/drm/via_dmablit.c b/drivers/char/drm/via_dmablit.c index b7f17457b42..78a81a4a99c 100644 --- a/drivers/char/drm/via_dmablit.c +++ b/drivers/char/drm/via_dmablit.c @@ -557,7 +557,7 @@ via_init_dmablit(drm_device_t *dev) blitq->num_outstanding = 0; blitq->is_active = 0; blitq->aborting = 0; - blitq->blit_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&blitq->blit_lock); for (j=0; j<VIA_NUM_BLIT_SLOTS; ++j) { DRM_INIT_WAITQUEUE(blitq->blit_queue + j); } diff --git a/drivers/char/epca.c b/drivers/char/epca.c index 9cad8501d62..dc0602ae850 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -80,7 +80,7 @@ static int invalid_lilo_config; /* The ISA boards do window flipping into the same spaces so it's only sane with a single lock.
It's still pretty efficient */ -static spinlock_t epca_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(epca_lock); /* ----------------------------------------------------------------------- MAXBOARDS is typically 12, but ISA and EISA cards are restricted to diff --git a/drivers/char/hvcs.c b/drivers/char/hvcs.c index 8d97b391129..afa26b65dac 100644 --- a/drivers/char/hvcs.c +++ b/drivers/char/hvcs.c @@ -1320,11 +1320,12 @@ static struct tty_operations hvcs_ops = { static int hvcs_alloc_index_list(int n) { int i; + hvcs_index_list = kmalloc(n * sizeof(hvcs_index_count),GFP_KERNEL); if (!hvcs_index_list) return -ENOMEM; hvcs_index_count = n; - for(i = 0; i < hvcs_index_count; i++) + for (i = 0; i < hvcs_index_count; i++) hvcs_index_list[i] = -1; return 0; } @@ -1332,11 +1333,9 @@ static int hvcs_alloc_index_list(int n) static void hvcs_free_index_list(void) { /* Paranoia check to be thorough. */ - if (hvcs_index_list) { - kfree(hvcs_index_list); - hvcs_index_list = NULL; - hvcs_index_count = 0; - } + kfree(hvcs_index_list); + hvcs_index_list = NULL; + hvcs_index_count = 0; } static int __init hvcs_module_init(void) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index b03ddab1bef..83ed6ae466a 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -57,8 +57,7 @@ static int ipmi_init_msghandler(void); static int initialized = 0; #ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_ipmi_root = NULL; -EXPORT_SYMBOL(proc_ipmi_root); +static struct proc_dir_entry *proc_ipmi_root = NULL; #endif /* CONFIG_PROC_FS */ #define MAX_EVENTS_IN_QUEUE 25 diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 02a7dd7a8a5..101c14b9b26 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -809,7 +809,7 @@ static int ipmi_thread(void *data) /* do nothing */ } else if (smi_result == SI_SM_CALL_WITH_DELAY) - udelay(1); + schedule(); else schedule_timeout_interruptible(1); } diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c index f43c2e04ead..01247cccb89 100644 --- a/drivers/char/moxa.c +++ b/drivers/char/moxa.c @@ -301,7 +301,7 @@ static struct tty_operations moxa_ops = { .tiocmset = moxa_tiocmset, }; -static spinlock_t moxa_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(moxa_lock); #ifdef CONFIG_PCI static int moxa_get_PCI_conf(struct pci_dev *p, int board_type, moxa_board_conf * board) diff --git a/drivers/char/nsc_gpio.c b/drivers/char/nsc_gpio.c new file mode 100644 index 00000000000..5b91e4e2564 --- /dev/null +++ b/drivers/char/nsc_gpio.c @@ -0,0 +1,142 @@ +/* linux/drivers/char/nsc_gpio.c + + National Semiconductor common GPIO device-file/VFS methods. + Allows a user space process to control the GPIO pins. + + Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com> + Copyright (c) 2005 Jim Cromie <jim.cromie@gmail.com> +*/ + +#include <linux/config.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/nsc_gpio.h> +#include <linux/platform_device.h> +#include <asm/uaccess.h> +#include <asm/io.h> + +#define NAME "nsc_gpio" + +void nsc_gpio_dump(struct nsc_gpio_ops *amp, unsigned index) +{ + /* retrieve current config w/o changing it */ + u32 config = amp->gpio_config(index, ~0, 0); + + /* user requested via 'v' command, so its INFO */ + dev_info(amp->dev, "io%02u: 0x%04x %s %s %s %s %s %s %s\tio:%d/%d\n", + index, config, + (config & 1) ? 
"OE" : "TS", /* output-enabled/tristate */ + (config & 2) ? "PP" : "OD", /* push pull / open drain */ + (config & 4) ? "PUE" : "PUD", /* pull up enabled/disabled */ + (config & 8) ? "LOCKED" : "", /* locked / unlocked */ + (config & 16) ? "LEVEL" : "EDGE",/* level/edge input */ + (config & 32) ? "HI" : "LO", /* trigger on rise/fall edge */ + (config & 64) ? "DEBOUNCE" : "", /* debounce */ + + amp->gpio_get(index), amp->gpio_current(index)); +} + +ssize_t nsc_gpio_write(struct file *file, const char __user *data, + size_t len, loff_t *ppos) +{ + unsigned m = iminor(file->f_dentry->d_inode); + struct nsc_gpio_ops *amp = file->private_data; + struct device *dev = amp->dev; + size_t i; + int err = 0; + + for (i = 0; i < len; ++i) { + char c; + if (get_user(c, data + i)) + return -EFAULT; + switch (c) { + case '0': + amp->gpio_set(m, 0); + break; + case '1': + amp->gpio_set(m, 1); + break; + case 'O': + dev_dbg(dev, "GPIO%d output enabled\n", m); + amp->gpio_config(m, ~1, 1); + break; + case 'o': + dev_dbg(dev, "GPIO%d output disabled\n", m); + amp->gpio_config(m, ~1, 0); + break; + case 'T': + dev_dbg(dev, "GPIO%d output is push pull\n", + m); + amp->gpio_config(m, ~2, 2); + break; + case 't': + dev_dbg(dev, "GPIO%d output is open drain\n", + m); + amp->gpio_config(m, ~2, 0); + break; + case 'P': + dev_dbg(dev, "GPIO%d pull up enabled\n", m); + amp->gpio_config(m, ~4, 4); + break; + case 'p': + dev_dbg(dev, "GPIO%d pull up disabled\n", m); + amp->gpio_config(m, ~4, 0); + break; + case 'v': + /* View Current pin settings */ + amp->gpio_dump(amp, m); + break; + case '\n': + /* end of settings string, do nothing */ + break; + default: + dev_err(dev, "io%2d bad setting: chr<0x%2x>\n", + m, (int)c); + err++; + } + } + if (err) + return -EINVAL; /* full string handled, report error */ + + return len; +} + +ssize_t nsc_gpio_read(struct file *file, char __user * buf, + size_t len, loff_t * ppos) +{ + unsigned m = iminor(file->f_dentry->d_inode); + int value; + struct nsc_gpio_ops *amp = file->private_data; + + value = amp->gpio_get(m); + if (put_user(value ? '1' : '0', buf)) + return -EFAULT; + + return 1; +} + +/* common file-ops routines for both scx200_gpio and pc87360_gpio */ +EXPORT_SYMBOL(nsc_gpio_write); +EXPORT_SYMBOL(nsc_gpio_read); +EXPORT_SYMBOL(nsc_gpio_dump); + +static int __init nsc_gpio_init(void) +{ + printk(KERN_DEBUG NAME " initializing\n"); + return 0; +} + +static void __exit nsc_gpio_cleanup(void) +{ + printk(KERN_DEBUG NAME " cleanup\n"); +} + +module_init(nsc_gpio_init); +module_exit(nsc_gpio_cleanup); + +MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>"); +MODULE_DESCRIPTION("NatSemi GPIO Common Methods"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c new file mode 100644 index 00000000000..1c706ccfdbb --- /dev/null +++ b/drivers/char/pc8736x_gpio.c @@ -0,0 +1,340 @@ +/* linux/drivers/char/pc8736x_gpio.c + + National Semiconductor PC8736x GPIO driver. Allows a user space + process to play with the GPIO pins. 
+ + Copyright (c) 2005 Jim Cromie <jim.cromie@gmail.com> + + adapted from linux/drivers/char/scx200_gpio.c + Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>, +*/ + +#include <linux/config.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/ioport.h> +#include <linux/mutex.h> +#include <linux/nsc_gpio.h> +#include <linux/platform_device.h> +#include <asm/uaccess.h> + +#define DEVNAME "pc8736x_gpio" + +MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>"); +MODULE_DESCRIPTION("NatSemi PC-8736x GPIO Pin Driver"); +MODULE_LICENSE("GPL"); + +static int major; /* default to dynamic major */ +module_param(major, int, 0); +MODULE_PARM_DESC(major, "Major device number"); + +static DEFINE_MUTEX(pc8736x_gpio_config_lock); +static unsigned pc8736x_gpio_base; +static u8 pc8736x_gpio_shadow[4]; + +#define SIO_BASE1 0x2E /* 1st command-reg to check */ +#define SIO_BASE2 0x4E /* alt command-reg to check */ +#define SIO_BASE_OFFSET 0x20 + +#define SIO_SID 0x20 /* SuperI/O ID Register */ +#define SIO_SID_VALUE 0xe9 /* Expected value in SuperI/O ID Register */ + +#define SIO_CF1 0x21 /* chip config, bit0 is chip enable */ + +#define PC8736X_GPIO_SIZE 16 + +#define SIO_UNIT_SEL 0x7 /* unit select reg */ +#define SIO_UNIT_ACT 0x30 /* unit enable */ +#define SIO_GPIO_UNIT 0x7 /* unit number of GPIO */ +#define SIO_VLM_UNIT 0x0D +#define SIO_TMS_UNIT 0x0E + +/* config-space addrs to read/write each unit's runtime addr */ +#define SIO_BASE_HADDR 0x60 +#define SIO_BASE_LADDR 0x61 + +/* GPIO config-space pin-control addresses */ +#define SIO_GPIO_PIN_SELECT 0xF0 +#define SIO_GPIO_PIN_CONFIG 0xF1 +#define SIO_GPIO_PIN_EVENT 0xF2 + +static unsigned char superio_cmd = 0; +static unsigned char selected_device = 0xFF; /* bogus start val */ + +/* GPIO port runtime access, functionality */ +static int port_offset[] = { 0, 4, 8, 10 }; /* non-uniform offsets ! 
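(ports 0 and 1 each appear to span four registers: out, in, event-enable, event-status; ports 2 and 3 span only two, matching the hobbled event support noted below)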
*/ +/* static int event_capable[] = { 1, 1, 0, 0 }; ports 2,3 are hobbled */ + +#define PORT_OUT 0 +#define PORT_IN 1 +#define PORT_EVT_EN 2 +#define PORT_EVT_STST 3 + +static struct platform_device *pdev; /* use in dev_*() */ + +static inline void superio_outb(int addr, int val) +{ + outb_p(addr, superio_cmd); + outb_p(val, superio_cmd + 1); +} + +static inline int superio_inb(int addr) +{ + outb_p(addr, superio_cmd); + return inb_p(superio_cmd + 1); +} + +static int pc8736x_superio_present(void) +{ + /* try the 2 possible values, read a hardware reg to verify */ + superio_cmd = SIO_BASE1; + if (superio_inb(SIO_SID) == SIO_SID_VALUE) + return superio_cmd; + + superio_cmd = SIO_BASE2; + if (superio_inb(SIO_SID) == SIO_SID_VALUE) + return superio_cmd; + + return 0; +} + +static void device_select(unsigned devldn) +{ + superio_outb(SIO_UNIT_SEL, devldn); + selected_device = devldn; +} + +static void select_pin(unsigned iminor) +{ + /* select GPIO port/pin from device minor number */ + device_select(SIO_GPIO_UNIT); + superio_outb(SIO_GPIO_PIN_SELECT, + ((iminor << 1) & 0xF0) | (iminor & 0x7)); +} + +static inline u32 pc8736x_gpio_configure_fn(unsigned index, u32 mask, u32 bits, + u32 func_slct) +{ + u32 config, new_config; + + mutex_lock(&pc8736x_gpio_config_lock); + + device_select(SIO_GPIO_UNIT); + select_pin(index); + + /* read current config value */ + config = superio_inb(func_slct); + + /* set new config */ + new_config = (config & mask) | bits; + superio_outb(func_slct, new_config); + + mutex_unlock(&pc8736x_gpio_config_lock); + + return config; +} + +static u32 pc8736x_gpio_configure(unsigned index, u32 mask, u32 bits) +{ + return pc8736x_gpio_configure_fn(index, mask, bits, + SIO_GPIO_PIN_CONFIG); +} + +static int pc8736x_gpio_get(unsigned minor) +{ + int port, bit, val; + + port = minor >> 3; + bit = minor & 7; + val = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_IN); + val >>= bit; + val &= 1; + + dev_dbg(&pdev->dev, "_gpio_get(%d from %x bit %d) == val %d\n", + minor, pc8736x_gpio_base + port_offset[port] + PORT_IN, bit, + val); + + return val; +} + +static void pc8736x_gpio_set(unsigned minor, int val) +{ + int port, bit, curval; + + minor &= 0x1f; + port = minor >> 3; + bit = minor & 7; + curval = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_OUT); + + dev_dbg(&pdev->dev, "addr:%x cur:%x bit-pos:%d cur-bit:%x + new:%d -> bit-new:%d\n", + pc8736x_gpio_base + port_offset[port] + PORT_OUT, + curval, bit, (curval & ~(1 << bit)), val, (val << bit)); + + val = (curval & ~(1 << bit)) | (val << bit); + + dev_dbg(&pdev->dev, "gpio_set(minor:%d port:%d bit:%d)" + " %2x -> %2x\n", minor, port, bit, curval, val); + + outb_p(val, pc8736x_gpio_base + port_offset[port] + PORT_OUT); + + curval = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_OUT); + val = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_IN); + + dev_dbg(&pdev->dev, "wrote %x, read: %x\n", curval, val); + pc8736x_gpio_shadow[port] = val; +} + +static void pc8736x_gpio_set_high(unsigned index) +{ + pc8736x_gpio_set(index, 1); +} + +static void pc8736x_gpio_set_low(unsigned index) +{ + pc8736x_gpio_set(index, 0); +} + +static int pc8736x_gpio_current(unsigned minor) +{ + int port, bit; + minor &= 0x1f; + port = minor >> 3; + bit = minor & 7; + return ((pc8736x_gpio_shadow[port] >> bit) & 0x01); +} + +static void pc8736x_gpio_change(unsigned index) +{ + pc8736x_gpio_set(index, !pc8736x_gpio_current(index)); +} + +static struct nsc_gpio_ops pc8736x_access = { + .owner = THIS_MODULE, + .gpio_config = 
pc8736x_gpio_configure, + .gpio_dump = nsc_gpio_dump, + .gpio_get = pc8736x_gpio_get, + .gpio_set = pc8736x_gpio_set, + .gpio_set_high = pc8736x_gpio_set_high, + .gpio_set_low = pc8736x_gpio_set_low, + .gpio_change = pc8736x_gpio_change, + .gpio_current = pc8736x_gpio_current +}; + +static int pc8736x_gpio_open(struct inode *inode, struct file *file) +{ + unsigned m = iminor(inode); + file->private_data = &pc8736x_access; + + dev_dbg(&pdev->dev, "open %d\n", m); + + if (m > 63) + return -EINVAL; + return nonseekable_open(inode, file); +} + +static struct file_operations pc8736x_gpio_fops = { + .owner = THIS_MODULE, + .open = pc8736x_gpio_open, + .write = nsc_gpio_write, + .read = nsc_gpio_read, +}; + +static void __init pc8736x_init_shadow(void) +{ + int port; + + /* read the current values driven on the GPIO signals */ + for (port = 0; port < 4; ++port) + pc8736x_gpio_shadow[port] + = inb_p(pc8736x_gpio_base + port_offset[port] + + PORT_OUT); + +} + +static int __init pc8736x_gpio_init(void) +{ + int rc = 0; + + pdev = platform_device_alloc(DEVNAME, 0); + if (!pdev) + return -ENOMEM; + + rc = platform_device_add(pdev); + if (rc) { + rc = -ENODEV; + goto undo_platform_dev_alloc; + } + dev_info(&pdev->dev, "NatSemi pc8736x GPIO Driver Initializing\n"); + + if (!pc8736x_superio_present()) { + rc = -ENODEV; + dev_err(&pdev->dev, "no device found\n"); + goto undo_platform_dev_add; + } + pc8736x_access.dev = &pdev->dev; + + /* Verify that chip and it's GPIO unit are both enabled. + My BIOS does this, so I take minimum action here + */ + rc = superio_inb(SIO_CF1); + if (!(rc & 0x01)) { + rc = -ENODEV; + dev_err(&pdev->dev, "device not enabled\n"); + goto undo_platform_dev_add; + } + device_select(SIO_GPIO_UNIT); + if (!superio_inb(SIO_UNIT_ACT)) { + rc = -ENODEV; + dev_err(&pdev->dev, "GPIO unit not enabled\n"); + goto undo_platform_dev_add; + } + + /* read the GPIO unit base addr that chip responds to */ + pc8736x_gpio_base = (superio_inb(SIO_BASE_HADDR) << 8 + | superio_inb(SIO_BASE_LADDR)); + + if (!request_region(pc8736x_gpio_base, 16, DEVNAME)) { + rc = -ENODEV; + dev_err(&pdev->dev, "GPIO ioport %x busy\n", + pc8736x_gpio_base); + goto undo_platform_dev_add; + } + dev_info(&pdev->dev, "GPIO ioport %x reserved\n", pc8736x_gpio_base); + + rc = register_chrdev(major, DEVNAME, &pc8736x_gpio_fops); + if (rc < 0) { + dev_err(&pdev->dev, "register-chrdev failed: %d\n", rc); + goto undo_platform_dev_add; + } + if (!major) { + major = rc; + dev_dbg(&pdev->dev, "got dynamic major %d\n", major); + } + + pc8736x_init_shadow(); + return 0; + +undo_platform_dev_add: + platform_device_put(pdev); +undo_platform_dev_alloc: + kfree(pdev); + return rc; +} + +static void __exit pc8736x_gpio_cleanup(void) +{ + dev_dbg(&pdev->dev, " cleanup\n"); + + release_region(pc8736x_gpio_base, 16); + + unregister_chrdev(major, DEVNAME); +} + +EXPORT_SYMBOL(pc8736x_access); + +module_init(pc8736x_gpio_init); +module_exit(pc8736x_gpio_cleanup); diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c index 664a6e97eb1..5a280a33040 100644 --- a/drivers/char/scx200_gpio.c +++ b/drivers/char/scx200_gpio.c @@ -1,4 +1,4 @@ -/* linux/drivers/char/scx200_gpio.c +/* linux/drivers/char/scx200_gpio.c National Semiconductor SCx200 GPIO driver. Allows a user space process to play with the GPIO pins. 
@@ -6,17 +6,26 @@ Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com> */ #include <linux/config.h> +#include <linux/device.h> #include <linux/fs.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/platform_device.h> #include <asm/uaccess.h> #include <asm/io.h> +#include <linux/types.h> +#include <linux/cdev.h> + #include <linux/scx200_gpio.h> +#include <linux/nsc_gpio.h> #define NAME "scx200_gpio" +#define DEVNAME NAME + +static struct platform_device *pdev; MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>"); MODULE_DESCRIPTION("NatSemi SCx200 GPIO Pin Driver"); @@ -26,70 +35,23 @@ static int major = 0; /* default to dynamic major */ module_param(major, int, 0); MODULE_PARM_DESC(major, "Major device number"); -static ssize_t scx200_gpio_write(struct file *file, const char __user *data, - size_t len, loff_t *ppos) -{ - unsigned m = iminor(file->f_dentry->d_inode); - size_t i; - - for (i = 0; i < len; ++i) { - char c; - if (get_user(c, data+i)) - return -EFAULT; - switch (c) - { - case '0': - scx200_gpio_set(m, 0); - break; - case '1': - scx200_gpio_set(m, 1); - break; - case 'O': - printk(KERN_INFO NAME ": GPIO%d output enabled\n", m); - scx200_gpio_configure(m, ~1, 1); - break; - case 'o': - printk(KERN_INFO NAME ": GPIO%d output disabled\n", m); - scx200_gpio_configure(m, ~1, 0); - break; - case 'T': - printk(KERN_INFO NAME ": GPIO%d output is push pull\n", m); - scx200_gpio_configure(m, ~2, 2); - break; - case 't': - printk(KERN_INFO NAME ": GPIO%d output is open drain\n", m); - scx200_gpio_configure(m, ~2, 0); - break; - case 'P': - printk(KERN_INFO NAME ": GPIO%d pull up enabled\n", m); - scx200_gpio_configure(m, ~4, 4); - break; - case 'p': - printk(KERN_INFO NAME ": GPIO%d pull up disabled\n", m); - scx200_gpio_configure(m, ~4, 0); - break; - } - } - - return len; -} - -static ssize_t scx200_gpio_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - unsigned m = iminor(file->f_dentry->d_inode); - int value; - - value = scx200_gpio_get(m); - if (put_user(value ? 
'1' : '0', buf)) - return -EFAULT; - - return 1; -} +struct nsc_gpio_ops scx200_access = { + .owner = THIS_MODULE, + .gpio_config = scx200_gpio_configure, + .gpio_dump = nsc_gpio_dump, + .gpio_get = scx200_gpio_get, + .gpio_set = scx200_gpio_set, + .gpio_set_high = scx200_gpio_set_high, + .gpio_set_low = scx200_gpio_set_low, + .gpio_change = scx200_gpio_change, + .gpio_current = scx200_gpio_current +}; static int scx200_gpio_open(struct inode *inode, struct file *file) { unsigned m = iminor(inode); + file->private_data = &scx200_access; + if (m > 63) return -EINVAL; return nonseekable_open(inode, file); @@ -103,47 +65,81 @@ static int scx200_gpio_release(struct inode *inode, struct file *file) static struct file_operations scx200_gpio_fops = { .owner = THIS_MODULE, - .write = scx200_gpio_write, - .read = scx200_gpio_read, + .write = nsc_gpio_write, + .read = nsc_gpio_read, .open = scx200_gpio_open, .release = scx200_gpio_release, }; +struct cdev *scx200_devices; +static int num_pins = 32; + static int __init scx200_gpio_init(void) { - int r; - - printk(KERN_DEBUG NAME ": NatSemi SCx200 GPIO Driver\n"); + int rc, i; + dev_t dev = MKDEV(major, 0); if (!scx200_gpio_present()) { - printk(KERN_ERR NAME ": no SCx200 gpio pins available\n"); + printk(KERN_ERR NAME ": no SCx200 gpio present\n"); return -ENODEV; } - r = register_chrdev(major, NAME, &scx200_gpio_fops); - if (r < 0) { - printk(KERN_ERR NAME ": unable to register character device\n"); - return r; + /* support dev_dbg() with pdev->dev */ + pdev = platform_device_alloc(DEVNAME, 0); + if (!pdev) + return -ENOMEM; + + rc = platform_device_add(pdev); + if (rc) + goto undo_malloc; + + /* nsc_gpio uses dev_dbg(), so needs this */ + scx200_access.dev = &pdev->dev; + + if (major) + rc = register_chrdev_region(dev, num_pins, "scx200_gpio"); + else { + rc = alloc_chrdev_region(&dev, 0, num_pins, "scx200_gpio"); + major = MAJOR(dev); } - if (!major) { - major = r; - printk(KERN_DEBUG NAME ": got dynamic major %d\n", major); + if (rc < 0) { + dev_err(&pdev->dev, "SCx200 chrdev_region err: %d\n", rc); + goto undo_platform_device_add; + } + scx200_devices = kzalloc(num_pins * sizeof(struct cdev), GFP_KERNEL); + if (!scx200_devices) { + rc = -ENOMEM; + goto undo_chrdev_region; + } + for (i = 0; i < num_pins; i++) { + struct cdev *cdev = &scx200_devices[i]; + cdev_init(cdev, &scx200_gpio_fops); + cdev->owner = THIS_MODULE; + rc = cdev_add(cdev, MKDEV(major, i), 1); + /* tolerate 'minor' errors */ + if (rc) + dev_err(&pdev->dev, "Error %d on minor %d", rc, i); } - return 0; + return 0; /* succeed */ + +undo_chrdev_region: + unregister_chrdev_region(dev, num_pins); +undo_platform_device_add: + platform_device_put(pdev); +undo_malloc: + kfree(pdev); + return rc; } static void __exit scx200_gpio_cleanup(void) { - unregister_chrdev(major, NAME); + kfree(scx200_devices); + unregister_chrdev_region(MKDEV(major, 0), num_pins); + platform_device_put(pdev); + platform_device_unregister(pdev); + /* kfree(pdev); */ } module_init(scx200_gpio_init); module_exit(scx200_gpio_cleanup); - -/* - Local variables: - compile-command: "make -k -C ../.. 
SUBDIRS=drivers/char modules" - c-basic-offset: 8 - End: -*/ diff --git a/drivers/char/specialix.c b/drivers/char/specialix.c index 1b5330299e3..d2d6b01dcd0 100644 --- a/drivers/char/specialix.c +++ b/drivers/char/specialix.c @@ -2477,7 +2477,7 @@ static int __init specialix_init(void) #endif for (i = 0; i < SX_NBOARD; i++) - sx_board[i].lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&sx_board[i].lock); if (sx_init_drivers()) { func_exit(); diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c index a9c5a7230f8..bf361a5ba70 100644 --- a/drivers/char/stallion.c +++ b/drivers/char/stallion.c @@ -141,15 +141,6 @@ static char *stl_drvversion = "5.6.0"; static struct tty_driver *stl_serial; /* - * We will need to allocate a temporary write buffer for chars that - * come direct from user space. The problem is that a copy from user - * space might cause a page fault (typically on a system that is - * swapping!). All ports will share one buffer - since if the system - * is already swapping a shared buffer won't make things any worse. - */ -static char *stl_tmpwritebuf; - -/* * Define a local default termios struct. All ports will be created * with this termios initially. Basically all it defines is a raw port * at 9600, 8 data bits, 1 stop bit. @@ -363,6 +354,14 @@ static unsigned char stl_vecmap[] = { }; /* + * Lock ordering is that you may not take stallion_lock holding + * brd_lock. + */ + +static spinlock_t brd_lock; /* Guard the board mapping */ +static spinlock_t stallion_lock; /* Guard the tty driver */ + +/* * Set up enable and disable macros for the ECH boards. They require * the secondary io address space to be activated and deactivated. * This way all ECH boards can share their secondary io region. @@ -725,17 +724,7 @@ static struct class *stallion_class; static int __init stallion_module_init(void) { - unsigned long flags; - -#ifdef DEBUG - printk("init_module()\n"); -#endif - - save_flags(flags); - cli(); stl_init(); - restore_flags(flags); - return 0; } @@ -746,7 +735,6 @@ static void __exit stallion_module_exit(void) stlbrd_t *brdp; stlpanel_t *panelp; stlport_t *portp; - unsigned long flags; int i, j, k; #ifdef DEBUG @@ -756,9 +744,6 @@ static void __exit stallion_module_exit(void) printk(KERN_INFO "Unloading %s: version %s\n", stl_drvtitle, stl_drvversion); - save_flags(flags); - cli(); - /* * Free up all allocated resources used by the ports. This includes * memory and interrupts. 
As part of this process we will also do @@ -770,7 +755,6 @@ static void __exit stallion_module_exit(void) if (i) { printk("STALLION: failed to un-register tty driver, " "errno=%d\n", -i); - restore_flags(flags); return; } for (i = 0; i < 4; i++) { @@ -783,8 +767,6 @@ static void __exit stallion_module_exit(void) "errno=%d\n", -i); class_destroy(stallion_class); - kfree(stl_tmpwritebuf); - for (i = 0; (i < stl_nrbrds); i++) { if ((brdp = stl_brds[i]) == (stlbrd_t *) NULL) continue; @@ -814,8 +796,6 @@ static void __exit stallion_module_exit(void) kfree(brdp); stl_brds[i] = (stlbrd_t *) NULL; } - - restore_flags(flags); } module_init(stallion_module_init); @@ -948,7 +928,7 @@ static stlbrd_t *stl_allocbrd(void) brdp = kzalloc(sizeof(stlbrd_t), GFP_KERNEL); if (!brdp) { - printk("STALLION: failed to allocate memory (size=%d)\n", + printk("STALLION: failed to allocate memory (size=%Zd)\n", sizeof(stlbrd_t)); return NULL; } @@ -1066,16 +1046,17 @@ static int stl_waitcarrier(stlport_t *portp, struct file *filp) rc = 0; doclocal = 0; + spin_lock_irqsave(&stallion_lock, flags); + if (portp->tty->termios->c_cflag & CLOCAL) doclocal++; - save_flags(flags); - cli(); portp->openwaitcnt++; if (! tty_hung_up_p(filp)) portp->refcount--; for (;;) { + /* Takes brd_lock internally */ stl_setsignals(portp, 1, 1); if (tty_hung_up_p(filp) || ((portp->flags & ASYNC_INITIALIZED) == 0)) { @@ -1093,13 +1074,14 @@ static int stl_waitcarrier(stlport_t *portp, struct file *filp) rc = -ERESTARTSYS; break; } + /* FIXME */ interruptible_sleep_on(&portp->open_wait); } if (! tty_hung_up_p(filp)) portp->refcount++; portp->openwaitcnt--; - restore_flags(flags); + spin_unlock_irqrestore(&stallion_lock, flags); return rc; } @@ -1119,16 +1101,15 @@ static void stl_close(struct tty_struct *tty, struct file *filp) if (portp == (stlport_t *) NULL) return; - save_flags(flags); - cli(); + spin_lock_irqsave(&stallion_lock, flags); if (tty_hung_up_p(filp)) { - restore_flags(flags); + spin_unlock_irqrestore(&stallion_lock, flags); return; } if ((tty->count == 1) && (portp->refcount != 1)) portp->refcount = 1; if (portp->refcount-- > 1) { - restore_flags(flags); + spin_unlock_irqrestore(&stallion_lock, flags); return; } @@ -1142,11 +1123,18 @@ static void stl_close(struct tty_struct *tty, struct file *filp) * (The sc26198 has no "end-of-data" interrupt only empty FIFO) */ tty->closing = 1; + + spin_unlock_irqrestore(&stallion_lock, flags); + if (portp->closing_wait != ASYNC_CLOSING_WAIT_NONE) tty_wait_until_sent(tty, portp->closing_wait); stl_waituntilsent(tty, (HZ / 2)); + + spin_lock_irqsave(&stallion_lock, flags); portp->flags &= ~ASYNC_INITIALIZED; + spin_unlock_irqrestore(&stallion_lock, flags); + stl_disableintrs(portp); if (tty->termios->c_cflag & HUPCL) stl_setsignals(portp, 0, 0); @@ -1173,7 +1161,6 @@ static void stl_close(struct tty_struct *tty, struct file *filp) portp->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING); wake_up_interruptible(&portp->close_wait); - restore_flags(flags); } /*****************************************************************************/ @@ -1195,9 +1182,6 @@ static int stl_write(struct tty_struct *tty, const unsigned char *buf, int count (int) tty, (int) buf, count); #endif - if ((tty == (struct tty_struct *) NULL) || - (stl_tmpwritebuf == (char *) NULL)) - return 0; portp = tty->driver_data; if (portp == (stlport_t *) NULL) return 0; @@ -1302,11 +1286,6 @@ static void stl_flushchars(struct tty_struct *tty) if (portp->tx.buf == (char *) NULL) return; -#if 0 - if (tty->stopped || tty->hw_stopped || 
- (portp->tx.head == portp->tx.tail)) - return; -#endif stl_startrxtx(portp, -1, 1); } @@ -1977,12 +1956,14 @@ static int stl_eiointr(stlbrd_t *brdp) unsigned int iobase; int handled = 0; + spin_lock(&brd_lock); panelp = brdp->panels[0]; iobase = panelp->iobase; while (inb(brdp->iostatus) & EIO_INTRPEND) { handled = 1; (* panelp->isr)(panelp, iobase); } + spin_unlock(&brd_lock); return handled; } @@ -2168,7 +2149,7 @@ static int __init stl_initports(stlbrd_t *brdp, stlpanel_t *panelp) portp = kzalloc(sizeof(stlport_t), GFP_KERNEL); if (!portp) { printk("STALLION: failed to allocate memory " - "(size=%d)\n", sizeof(stlport_t)); + "(size=%Zd)\n", sizeof(stlport_t)); break; } @@ -2304,7 +2285,7 @@ static inline int stl_initeio(stlbrd_t *brdp) panelp = kzalloc(sizeof(stlpanel_t), GFP_KERNEL); if (!panelp) { printk(KERN_WARNING "STALLION: failed to allocate memory " - "(size=%d)\n", sizeof(stlpanel_t)); + "(size=%Zd)\n", sizeof(stlpanel_t)); return -ENOMEM; } @@ -2478,7 +2459,7 @@ static inline int stl_initech(stlbrd_t *brdp) panelp = kzalloc(sizeof(stlpanel_t), GFP_KERNEL); if (!panelp) { printk("STALLION: failed to allocate memory " - "(size=%d)\n", sizeof(stlpanel_t)); + "(size=%Zd)\n", sizeof(stlpanel_t)); break; } panelp->magic = STL_PANELMAGIC; @@ -2879,8 +2860,7 @@ static int stl_getportstats(stlport_t *portp, comstats_t __user *cp) portp->stats.lflags = 0; portp->stats.rxbuffered = 0; - save_flags(flags); - cli(); + spin_lock_irqsave(&stallion_lock, flags); if (portp->tty != (struct tty_struct *) NULL) { if (portp->tty->driver_data == portp) { portp->stats.ttystate = portp->tty->flags; @@ -2894,7 +2874,7 @@ static int stl_getportstats(stlport_t *portp, comstats_t __user *cp) } } } - restore_flags(flags); + spin_unlock_irqrestore(&stallion_lock, flags); head = portp->tx.head; tail = portp->tx.tail; @@ -3056,14 +3036,6 @@ static int __init stl_init(void) return -1; /* - * Allocate a temporary write buffer. - */ - stl_tmpwritebuf = kmalloc(STL_TXBUFSIZE, GFP_KERNEL); - if (!stl_tmpwritebuf) - printk("STALLION: failed to allocate memory (size=%d)\n", - STL_TXBUFSIZE); - -/* * Set up a character driver for per board stuff. This is mainly used * to do stats ioctls on the ports. */ @@ -3147,11 +3119,13 @@ static int stl_cd1400panelinit(stlbrd_t *brdp, stlpanel_t *panelp) unsigned int gfrcr; int chipmask, i, j; int nrchips, uartaddr, ioaddr; + unsigned long flags; #ifdef DEBUG printk("stl_panelinit(brdp=%x,panelp=%x)\n", (int) brdp, (int) panelp); #endif + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(panelp->brdnr, panelp->pagenr); /* @@ -3189,6 +3163,7 @@ static int stl_cd1400panelinit(stlbrd_t *brdp, stlpanel_t *panelp) } BRDDISABLE(panelp->brdnr); + spin_unlock_irqrestore(&brd_lock, flags); return chipmask; } @@ -3200,6 +3175,7 @@ static int stl_cd1400panelinit(stlbrd_t *brdp, stlpanel_t *panelp) static void stl_cd1400portinit(stlbrd_t *brdp, stlpanel_t *panelp, stlport_t *portp) { + unsigned long flags; #ifdef DEBUG printk("stl_cd1400portinit(brdp=%x,panelp=%x,portp=%x)\n", (int) brdp, (int) panelp, (int) portp); @@ -3209,6 +3185,7 @@ static void stl_cd1400portinit(stlbrd_t *brdp, stlpanel_t *panelp, stlport_t *po (portp == (stlport_t *) NULL)) return; + spin_lock_irqsave(&brd_lock, flags); portp->ioaddr = panelp->iobase + (((brdp->brdtype == BRD_ECHPCI) || (portp->portnr < 8)) ? 
0 : EREG_BANKSIZE); portp->uartaddr = (portp->portnr & 0x04) << 5; @@ -3219,6 +3196,7 @@ static void stl_cd1400portinit(stlbrd_t *brdp, stlpanel_t *panelp, stlport_t *po stl_cd1400setreg(portp, LIVR, (portp->portnr << 3)); portp->hwid = stl_cd1400getreg(portp, GFRCR); BRDDISABLE(portp->brdnr); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3428,8 +3406,7 @@ static void stl_cd1400setport(stlport_t *portp, struct termios *tiosp) tiosp->c_cc[VSTART], tiosp->c_cc[VSTOP]); #endif - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x3)); srer = stl_cd1400getreg(portp, SRER); @@ -3466,7 +3443,7 @@ static void stl_cd1400setport(stlport_t *portp, struct termios *tiosp) portp->sigs &= ~TIOCM_CD; stl_cd1400setreg(portp, SRER, ((srer & ~sreroff) | sreron)); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3492,8 +3469,7 @@ static void stl_cd1400setsignals(stlport_t *portp, int dtr, int rts) if (rts > 0) msvr2 = MSVR2_RTS; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); if (rts >= 0) @@ -3501,7 +3477,7 @@ static void stl_cd1400setsignals(stlport_t *portp, int dtr, int rts) if (dtr >= 0) stl_cd1400setreg(portp, MSVR1, msvr1); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3520,14 +3496,13 @@ static int stl_cd1400getsignals(stlport_t *portp) printk("stl_cd1400getsignals(portp=%x)\n", (int) portp); #endif - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); msvr1 = stl_cd1400getreg(portp, MSVR1); msvr2 = stl_cd1400getreg(portp, MSVR2); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); sigs = 0; sigs |= (msvr1 & MSVR1_DCD) ? 
TIOCM_CD : 0; @@ -3569,15 +3544,14 @@ static void stl_cd1400enablerxtx(stlport_t *portp, int rx, int tx) else if (rx > 0) ccr |= CCR_RXENABLE; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); stl_cd1400ccrwait(portp); stl_cd1400setreg(portp, CCR, ccr); stl_cd1400ccrwait(portp); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3609,8 +3583,7 @@ static void stl_cd1400startrxtx(stlport_t *portp, int rx, int tx) else if (rx > 0) sreron |= SRER_RXDATA; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); stl_cd1400setreg(portp, SRER, @@ -3618,7 +3591,7 @@ static void stl_cd1400startrxtx(stlport_t *portp, int rx, int tx) BRDDISABLE(portp->brdnr); if (tx > 0) set_bit(ASYI_TXBUSY, &portp->istate); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3634,13 +3607,12 @@ static void stl_cd1400disableintrs(stlport_t *portp) #ifdef DEBUG printk("stl_cd1400disableintrs(portp=%x)\n", (int) portp); #endif - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); stl_cd1400setreg(portp, SRER, 0); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3653,8 +3625,7 @@ static void stl_cd1400sendbreak(stlport_t *portp, int len) printk("stl_cd1400sendbreak(portp=%x,len=%d)\n", (int) portp, len); #endif - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); stl_cd1400setreg(portp, SRER, @@ -3664,7 +3635,7 @@ static void stl_cd1400sendbreak(stlport_t *portp, int len) portp->brklen = len; if (len == 1) portp->stats.txbreaks++; - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3688,8 +3659,7 @@ static void stl_cd1400flowctrl(stlport_t *portp, int state) if (tty == (struct tty_struct *) NULL) return; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); @@ -3729,7 +3699,7 @@ static void stl_cd1400flowctrl(stlport_t *portp, int state) } BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3753,8 +3723,7 @@ static void stl_cd1400sendflow(stlport_t *portp, int state) if (tty == (struct tty_struct *) NULL) return; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); if (state) { @@ -3769,7 +3738,7 @@ static void stl_cd1400sendflow(stlport_t *portp, int state) stl_cd1400ccrwait(portp); } BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3785,8 +3754,7 @@ static void 
stl_cd1400flush(stlport_t *portp) if (portp == (stlport_t *) NULL) return; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03)); stl_cd1400ccrwait(portp); @@ -3794,7 +3762,7 @@ static void stl_cd1400flush(stlport_t *portp) stl_cd1400ccrwait(portp); portp->tx.tail = portp->tx.head; BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -3833,6 +3801,7 @@ static void stl_cd1400eiointr(stlpanel_t *panelp, unsigned int iobase) (int) panelp, iobase); #endif + spin_lock(&brd_lock); outb(SVRR, iobase); svrtype = inb(iobase + EREG_DATA); if (panelp->nrports > 4) { @@ -3846,6 +3815,8 @@ static void stl_cd1400eiointr(stlpanel_t *panelp, unsigned int iobase) stl_cd1400txisr(panelp, iobase); else if (svrtype & SVRR_MDM) stl_cd1400mdmisr(panelp, iobase); + + spin_unlock(&brd_lock); } /*****************************************************************************/ @@ -4433,8 +4404,7 @@ static void stl_sc26198setport(stlport_t *portp, struct termios *tiosp) tiosp->c_cc[VSTART], tiosp->c_cc[VSTOP]); #endif - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_sc26198setreg(portp, IMR, 0); stl_sc26198updatereg(portp, MR0, mr0); @@ -4461,7 +4431,7 @@ static void stl_sc26198setport(stlport_t *portp, struct termios *tiosp) portp->imr = (portp->imr & ~imroff) | imron; stl_sc26198setreg(portp, IMR, portp->imr); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4491,13 +4461,12 @@ static void stl_sc26198setsignals(stlport_t *portp, int dtr, int rts) else if (rts > 0) iopioron |= IPR_RTS; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_sc26198setreg(portp, IOPIOR, ((stl_sc26198getreg(portp, IOPIOR) & ~iopioroff) | iopioron)); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4516,12 +4485,11 @@ static int stl_sc26198getsignals(stlport_t *portp) printk("stl_sc26198getsignals(portp=%x)\n", (int) portp); #endif - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); ipr = stl_sc26198getreg(portp, IPR); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); sigs = 0; sigs |= (ipr & IPR_DCD) ? 
0 : TIOCM_CD; @@ -4558,13 +4526,12 @@ static void stl_sc26198enablerxtx(stlport_t *portp, int rx, int tx) else if (rx > 0) ccr |= CR_RXENABLE; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_sc26198setreg(portp, SCCR, ccr); BRDDISABLE(portp->brdnr); portp->crenable = ccr; - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4593,15 +4560,14 @@ static void stl_sc26198startrxtx(stlport_t *portp, int rx, int tx) else if (rx > 0) imr |= IR_RXRDY | IR_RXBREAK | IR_RXWATCHDOG; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_sc26198setreg(portp, IMR, imr); BRDDISABLE(portp->brdnr); portp->imr = imr; if (tx > 0) set_bit(ASYI_TXBUSY, &portp->istate); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4618,13 +4584,12 @@ static void stl_sc26198disableintrs(stlport_t *portp) printk("stl_sc26198disableintrs(portp=%x)\n", (int) portp); #endif - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); portp->imr = 0; stl_sc26198setreg(portp, IMR, 0); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4637,8 +4602,7 @@ static void stl_sc26198sendbreak(stlport_t *portp, int len) printk("stl_sc26198sendbreak(portp=%x,len=%d)\n", (int) portp, len); #endif - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); if (len == 1) { stl_sc26198setreg(portp, SCCR, CR_TXSTARTBREAK); @@ -4647,7 +4611,7 @@ static void stl_sc26198sendbreak(stlport_t *portp, int len) stl_sc26198setreg(portp, SCCR, CR_TXSTOPBREAK); } BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4672,8 +4636,7 @@ static void stl_sc26198flowctrl(stlport_t *portp, int state) if (tty == (struct tty_struct *) NULL) return; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); if (state) { @@ -4719,7 +4682,7 @@ static void stl_sc26198flowctrl(stlport_t *portp, int state) } BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4744,8 +4707,7 @@ static void stl_sc26198sendflow(stlport_t *portp, int state) if (tty == (struct tty_struct *) NULL) return; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); if (state) { mr0 = stl_sc26198getreg(portp, MR0); @@ -4765,7 +4727,7 @@ static void stl_sc26198sendflow(stlport_t *portp, int state) stl_sc26198setreg(portp, MR0, mr0); } BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4781,14 +4743,13 @@ static void stl_sc26198flush(stlport_t *portp) if (portp == (stlport_t *) NULL) return; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); stl_sc26198setreg(portp, SCCR, CR_TXRESET); stl_sc26198setreg(portp, SCCR, 
portp->crenable); BRDDISABLE(portp->brdnr); portp->tx.tail = portp->tx.head; - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); } /*****************************************************************************/ @@ -4815,12 +4776,11 @@ static int stl_sc26198datastate(stlport_t *portp) if (test_bit(ASYI_TXBUSY, &portp->istate)) return 1; - save_flags(flags); - cli(); + spin_lock_irqsave(&brd_lock, flags); BRDENABLE(portp->brdnr, portp->pagenr); sr = stl_sc26198getreg(portp, SR); BRDDISABLE(portp->brdnr); - restore_flags(flags); + spin_unlock_irqrestore(&brd_lock, flags); return (sr & SR_TXEMPTY) ? 0 : 1; } @@ -4878,6 +4838,8 @@ static void stl_sc26198intr(stlpanel_t *panelp, unsigned int iobase) stlport_t *portp; unsigned int iack; + spin_lock(&brd_lock); + /* * Work around bug in sc26198 chip... Cannot have A6 address * line of UART high, else iack will be returned as 0. @@ -4893,6 +4855,8 @@ static void stl_sc26198intr(stlpanel_t *panelp, unsigned int iobase) stl_sc26198txisr(portp); else stl_sc26198otherisr(portp, iack); + + spin_unlock(&brd_lock); } /*****************************************************************************/ diff --git a/drivers/char/sx.c b/drivers/char/sx.c index 3b474723027..76b9107f7f8 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -2320,7 +2320,7 @@ static int sx_init_portstructs (int nboards, int nports) #ifdef NEW_WRITE_LOCKING port->gs.port_write_mutex = MUTEX; #endif - port->gs.driver_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&port->gs.driver_lock); /* * Initializing wait queue */ diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 8b2a5996986..bd74e82d8a7 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2621,10 +2621,9 @@ int tty_ioctl(struct inode * inode, struct file * file, tty->driver->break_ctl(tty, 0); return 0; case TCSBRK: /* SVID version: non-zero arg --> no break */ - /* - * XXX is the above comment correct, or the - * code below correct? Is this ioctl used at - * all by anyone? + /* non-zero arg means wait for all output data + * to be sent (performed above) but don't send break. + * This is used by the tcdrain() termios function. 
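+ * For illustration (assuming the usual glibc mapping, which is not shown in this patch): tcdrain(fd) is issued as ioctl(fd, TCSBRK, 1), so it takes the nonzero-arg path here and no break is sent.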
*/ if (!arg) return send_break(tty, 250); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 44d1eca83a7..35e0b9ceecf 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1497,6 +1497,7 @@ int cpufreq_update_policy(unsigned int cpu) } EXPORT_SYMBOL(cpufreq_update_policy); +#ifdef CONFIG_HOTPLUG_CPU static int cpufreq_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1532,10 +1533,11 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block cpufreq_cpu_notifier = +static struct notifier_block __cpuinitdata cpufreq_cpu_notifier = { .notifier_call = cpufreq_cpu_callback, }; +#endif /* CONFIG_HOTPLUG_CPU */ /********************************************************************* * REGISTER / UNREGISTER CPUFREQ DRIVER * @@ -1596,7 +1598,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) } if (!ret) { - register_cpu_notifier(&cpufreq_cpu_notifier); + register_hotcpu_notifier(&cpufreq_cpu_notifier); dprintk("driver %s up and running\n", driver_data->name); cpufreq_debug_enable_ratelimit(); } @@ -1628,7 +1630,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) dprintk("unregistering driver %s\n", driver->name); sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver); - unregister_cpu_notifier(&cpufreq_cpu_notifier); + unregister_hotcpu_notifier(&cpufreq_cpu_notifier); spin_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver = NULL; diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index c576c0b3f45..145061b8472 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -350,7 +350,7 @@ __init cpufreq_stats_init(void) return ret; } - register_cpu_notifier(&cpufreq_stat_cpu_notifier); + register_hotcpu_notifier(&cpufreq_stat_cpu_notifier); lock_cpu_hotplug(); for_each_online_cpu(cpu) { cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_ONLINE, @@ -368,7 +368,7 @@ __exit cpufreq_stats_exit(void) CPUFREQ_POLICY_NOTIFIER); cpufreq_unregister_notifier(¬ifier_trans_block, CPUFREQ_TRANSITION_NOTIFIER); - unregister_cpu_notifier(&cpufreq_stat_cpu_notifier); + unregister_hotcpu_notifier(&cpufreq_stat_cpu_notifier); lock_cpu_hotplug(); for_each_online_cpu(cpu) { cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_DEAD, diff --git a/drivers/input/input.c b/drivers/input/input.c index de2e7546b49..a90486f5e49 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -998,12 +998,13 @@ void input_unregister_device(struct input_dev *dev) sysfs_remove_group(&dev->cdev.kobj, &input_dev_caps_attr_group); sysfs_remove_group(&dev->cdev.kobj, &input_dev_id_attr_group); sysfs_remove_group(&dev->cdev.kobj, &input_dev_attr_group); - class_device_unregister(&dev->cdev); mutex_lock(&dev->mutex); dev->name = dev->phys = dev->uniq = NULL; mutex_unlock(&dev->mutex); + class_device_unregister(&dev->cdev); + input_wakeup_procfs_readers(); } EXPORT_SYMBOL(input_unregister_device); diff --git a/drivers/isdn/gigaset/common.c b/drivers/isdn/gigaset/common.c index acb7e265678..2a56bf33a67 100644 --- a/drivers/isdn/gigaset/common.c +++ b/drivers/isdn/gigaset/common.c @@ -981,7 +981,7 @@ exit: EXPORT_SYMBOL_GPL(gigaset_stop); static LIST_HEAD(drivers); -static spinlock_t driver_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(driver_lock); struct cardstate *gigaset_get_cs_by_id(int id) { diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index 
2ac90242d26..433389daedb 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -82,7 +82,7 @@ isdn_tty_try_read(modem_info * info, struct sk_buff *skb) int l = skb->len; unsigned char *dp = skb->data; while (--l) { - if (*skb->data == DLE) + if (*dp == DLE) tty_insert_flip_char(tty, DLE, 0); tty_insert_flip_char(tty, *dp++, 0); } diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index fe6541326c7..9b015f9af35 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -18,7 +18,7 @@ #include <linux/leds.h> #include "leds.h" -rwlock_t leds_list_lock = RW_LOCK_UNLOCKED; +DEFINE_RWLOCK(leds_list_lock); LIST_HEAD(leds_list); EXPORT_SYMBOL_GPL(leds_list); diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c index 5e2cd8be119..1b1ce652396 100644 --- a/drivers/leds/led-triggers.c +++ b/drivers/leds/led-triggers.c @@ -26,7 +26,7 @@ /* * Nests outside led_cdev->trigger_lock */ -static rwlock_t triggers_list_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(triggers_list_lock); static LIST_HEAD(trigger_list); ssize_t led_trigger_store(struct class_device *dev, const char *buf, diff --git a/drivers/message/fusion/mptfc.c b/drivers/message/fusion/mptfc.c index 74714e5bcf0..3ff8378ea66 100644 --- a/drivers/message/fusion/mptfc.c +++ b/drivers/message/fusion/mptfc.c @@ -305,10 +305,8 @@ mptfc_GetFcDevPage0(MPT_ADAPTER *ioc, int ioc_port, } out: - if (pp0_array) - kfree(pp0_array); - if (p0_array) - kfree(p0_array); + kfree(pp0_array); + kfree(p0_array); return rc; } diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index af6ec553ff7..85689ab46cb 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -1378,8 +1378,7 @@ mptsas_probe_hba_phys(MPT_ADAPTER *ioc) return 0; out_free_port_info: - if (hba) - kfree(hba); + kfree(hba); out: return error; } diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c index febbdd4e060..c74e5460f83 100644 --- a/drivers/message/i2o/iop.c +++ b/drivers/message/i2o/iop.c @@ -1239,7 +1239,6 @@ EXPORT_SYMBOL(i2o_cntxt_list_remove); EXPORT_SYMBOL(i2o_cntxt_list_get_ptr); #endif EXPORT_SYMBOL(i2o_msg_get_wait); -EXPORT_SYMBOL(i2o_msg_nop); EXPORT_SYMBOL(i2o_find_iop); EXPORT_SYMBOL(i2o_iop_find_device); EXPORT_SYMBOL(i2o_event_register); diff --git a/drivers/misc/ibmasm/module.c b/drivers/misc/ibmasm/module.c index 1fdf03fd2da..9706cc19134 100644 --- a/drivers/misc/ibmasm/module.c +++ b/drivers/misc/ibmasm/module.c @@ -85,7 +85,7 @@ static int __devinit ibmasm_init_one(struct pci_dev *pdev, const struct pci_devi } memset(sp, 0, sizeof(struct service_processor)); - sp->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&sp->lock); INIT_LIST_HEAD(&sp->command_queue); pci_set_drvdata(pdev, (void *)sp); diff --git a/drivers/net/fs_enet/fs_enet-mii.c b/drivers/net/fs_enet/fs_enet-mii.c index c6770377ef8..0cd07150bf4 100644 --- a/drivers/net/fs_enet/fs_enet-mii.c +++ b/drivers/net/fs_enet/fs_enet-mii.c @@ -431,8 +431,7 @@ static struct fs_enet_mii_bus *create_bus(const struct fs_mii_bus_info *bi) return bus; err: - if (bus) - kfree(bus); + kfree(bus); return ERR_PTR(ret); } diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c index 081a8999666..a8a8f975432 100644 --- a/drivers/net/wireless/ipw2200.c +++ b/drivers/net/wireless/ipw2200.c @@ -1229,12 +1229,6 @@ static struct ipw_fw_error *ipw_alloc_error_log(struct ipw_priv *priv) return error; } -static void ipw_free_error_log(struct ipw_fw_error *error) -{ - if (error) - kfree(error); -} 
- static ssize_t show_event_log(struct device *d, struct device_attribute *attr, char *buf) { @@ -1296,10 +1290,9 @@ static ssize_t clear_error(struct device *d, const char *buf, size_t count) { struct ipw_priv *priv = dev_get_drvdata(d); - if (priv->error) { - ipw_free_error_log(priv->error); - priv->error = NULL; - } + + kfree(priv->error); + priv->error = NULL; return count; } @@ -1970,8 +1963,7 @@ static void ipw_irq_tasklet(struct ipw_priv *priv) struct ipw_fw_error *error = ipw_alloc_error_log(priv); ipw_dump_error_log(priv, error); - if (error) - ipw_free_error_log(error); + kfree(error); } #endif } else { @@ -11693,10 +11685,8 @@ static void ipw_pci_remove(struct pci_dev *pdev) } } - if (priv->error) { - ipw_free_error_log(priv->error); - priv->error = NULL; - } + kfree(priv->error); + priv->error = NULL; #ifdef CONFIG_IPW2200_PROMISCUOUS ipw_prom_free(priv); diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c index 0e07d953511..d0f68ab8f04 100644 --- a/drivers/pcmcia/m8xx_pcmcia.c +++ b/drivers/pcmcia/m8xx_pcmcia.c @@ -157,7 +157,7 @@ MODULE_LICENSE("Dual MPL/GPL"); static int pcmcia_schlvl = PCMCIA_SCHLVL; -static spinlock_t events_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(events_lock); #define PCMCIA_SOCKET_KEY_5V 1 @@ -644,7 +644,7 @@ static struct platform_device m8xx_device = { }; static u32 pending_events[PCMCIA_SOCKETS_NO]; -static spinlock_t pending_event_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(pending_event_lock); static irqreturn_t m8xx_interrupt(int irq, void *dev, struct pt_regs *regs) { diff --git a/drivers/rapidio/rio-access.c b/drivers/rapidio/rio-access.c index b9fab2ae3a3..8b56bbdd011 100644 --- a/drivers/rapidio/rio-access.c +++ b/drivers/rapidio/rio-access.c @@ -17,8 +17,8 @@ * These interrupt-safe spinlocks protect all accesses to RIO * configuration space and doorbell access. */ -static spinlock_t rio_config_lock = SPIN_LOCK_UNLOCKED; -static spinlock_t rio_doorbell_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(rio_config_lock); +static DEFINE_SPINLOCK(rio_doorbell_lock); /* * Wrappers for all RIO configuration access functions. 
They just check diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 5396beec30d..1cb61a761cb 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -94,7 +94,9 @@ exit_kfree: kfree(rtc); exit_idr: + mutex_lock(&idr_lock); idr_remove(&rtc_idr, id); + mutex_unlock(&idr_lock); exit: dev_err(dev, "rtc core: unable to register %s, err = %d\n", diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index ecafbad41a2..762521a1419 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -226,7 +226,7 @@ static int ds1553_rtc_ioctl(struct device *dev, unsigned int cmd, struct rtc_plat_data *pdata = platform_get_drvdata(pdev); if (pdata->irq < 0) - return -ENOIOCTLCMD; + return -ENOIOCTLCMD; /* fall back into rtc-dev's emulation */ switch (cmd) { case RTC_AIE_OFF: pdata->irqen &= ~RTC_AF; diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index ab486fbc828..9cd1cb304bb 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -45,7 +45,7 @@ static unsigned long rtc_freq = 1024; static struct rtc_time rtc_alarm; -static spinlock_t sa1100_rtc_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(sa1100_rtc_lock); static int rtc_update_alarm(struct rtc_time *alrm) { diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index 33e029207e2..4b9291dd444 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -93,7 +93,7 @@ static void __iomem *rtc2_base; static unsigned long epoch = 1970; /* Jan 1 1970 00:00:00 */ -static spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(rtc_lock); static char rtc_name[] = "RTC"; static unsigned long periodic_frequency; static unsigned long periodic_count; diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index 2d946b6ca07..2d8af709947 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -89,7 +89,7 @@ struct eerbuffer { }; static LIST_HEAD(bufferlist); -static spinlock_t bufferlock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(bufferlock); static DECLARE_WAIT_QUEUE_HEAD(dasd_eer_read_wait_queue); /* diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c index 6c66877be2b..855ce9a9d94 100644 --- a/drivers/scsi/libata-core.c +++ b/drivers/scsi/libata-core.c @@ -5733,7 +5733,7 @@ module_init(ata_init); module_exit(ata_exit); static unsigned long ratelimit_time; -static spinlock_t ata_ratelimit_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(ata_ratelimit_lock); int ata_ratelimit(void) { diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c index 93d18a74c40..2915bca691e 100644 --- a/drivers/scsi/libata-scsi.c +++ b/drivers/scsi/libata-scsi.c @@ -222,9 +222,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) && copy_to_user(arg + sizeof(args), argbuf, argsize)) rc = -EFAULT; error: - if (argbuf) - kfree(argbuf); - + kfree(argbuf); return rc; } diff --git a/drivers/sn/ioc3.c b/drivers/sn/ioc3.c index 501316b198e..ed946311d3a 100644 --- a/drivers/sn/ioc3.c +++ b/drivers/sn/ioc3.c @@ -26,7 +26,7 @@ static DECLARE_RWSEM(ioc3_devices_rwsem); static struct ioc3_submodule *ioc3_submodules[IOC3_MAX_SUBMODULES]; static struct ioc3_submodule *ioc3_ethernet; -static rwlock_t ioc3_submodules_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(ioc3_submodules_lock); /* NIC probing code */ diff --git a/drivers/video/au1100fb.c b/drivers/video/au1100fb.c index d63c3f48585..9ef68cd83bb 100644 --- a/drivers/video/au1100fb.c +++ b/drivers/video/au1100fb.c @@ -743,8 +743,7 @@ void __exit 
au1100fb_cleanup(void) { driver_unregister(&au1100fb_driver); - if (drv_info.opt_mode) - kfree(drv_info.opt_mode); + kfree(drv_info.opt_mode); } module_init(au1100fb_init); diff --git a/drivers/video/backlight/hp680_bl.c b/drivers/video/backlight/hp680_bl.c index a71e984c93d..ffc72ae3ada 100644 --- a/drivers/video/backlight/hp680_bl.c +++ b/drivers/video/backlight/hp680_bl.c @@ -27,7 +27,7 @@ static int hp680bl_suspended; static int current_intensity = 0; -static spinlock_t bl_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(bl_lock); static struct backlight_device *hp680_backlight_device; static void hp680bl_send_intensity(struct backlight_device *bd) diff --git a/fs/buffer.c b/fs/buffer.c index 373bb6292bd..f23bb647db4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -564,7 +564,7 @@ still_busy: * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ -void end_buffer_async_write(struct buffer_head *bh, int uptodate) +static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; unsigned long flags; @@ -3166,7 +3166,6 @@ EXPORT_SYMBOL(block_sync_page); EXPORT_SYMBOL(block_truncate_page); EXPORT_SYMBOL(block_write_full_page); EXPORT_SYMBOL(cont_prepare_write); -EXPORT_SYMBOL(end_buffer_async_write); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 7f96b5cb678..8c9b28dff11 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -34,6 +34,7 @@ #include <linux/suspend.h> #include <linux/pagemap.h> #include <linux/kthread.h> +#include <linux/poison.h> #include <linux/proc_fs.h> #include <asm/uaccess.h> @@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh) { #ifdef CONFIG_JBD_DEBUG atomic_dec(&nr_journal_heads); - memset(jh, 0x5b, sizeof(*jh)); + memset(jh, JBD_POISON_FREE, sizeof(*jh)); #endif kmem_cache_free(journal_head_cache, jh); } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 402005c35ab..8ca9707be6c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -909,7 +909,7 @@ int __init nfs_init_directcache(void) * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures * */ -void __exit nfs_destroy_directcache(void) +void nfs_destroy_directcache(void) { if (kmem_cache_destroy(nfs_direct_cachep)) printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n"); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 51bc88b662f..c5b916605fb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1132,7 +1132,7 @@ static int __init nfs_init_inodecache(void) return 0; } -static void __exit nfs_destroy_inodecache(void) +static void nfs_destroy_inodecache(void) { if (kmem_cache_destroy(nfs_inode_cachep)) printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n"); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index bd2815e2dec..4fe51c1292b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -31,15 +31,15 @@ extern struct svc_version nfs4_callback_version1; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); -extern void __exit nfs_destroy_nfspagecache(void); +extern void nfs_destroy_nfspagecache(void); extern int __init nfs_init_readpagecache(void); -extern void __exit nfs_destroy_readpagecache(void); +extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); -extern void __exit nfs_destroy_writepagecache(void); +extern void 
nfs_destroy_writepagecache(void); #ifdef CONFIG_NFS_DIRECTIO extern int __init nfs_init_directcache(void); -extern void __exit nfs_destroy_directcache(void); +extern void nfs_destroy_directcache(void); #else #define nfs_init_directcache() (0) #define nfs_destroy_directcache() do {} while(0) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ef9429643eb..d89f6fb3b3a 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -390,7 +390,7 @@ int __init nfs_init_nfspagecache(void) return 0; } -void __exit nfs_destroy_nfspagecache(void) +void nfs_destroy_nfspagecache(void) { if (kmem_cache_destroy(nfs_page_cachep)) printk(KERN_INFO "nfs_page: not all structures were freed\n"); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 41c2ffee24f..32cf3773af0 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -711,7 +711,7 @@ int __init nfs_init_readpagecache(void) return 0; } -void __exit nfs_destroy_readpagecache(void) +void nfs_destroy_readpagecache(void) { mempool_destroy(nfs_rdata_mempool); if (kmem_cache_destroy(nfs_rdata_cachep)) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b383fdd3a15..8fccb9cb173 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1551,7 +1551,7 @@ int __init nfs_init_writepagecache(void) return 0; } -void __exit nfs_destroy_writepagecache(void) +void nfs_destroy_writepagecache(void) { mempool_destroy(nfs_commit_mempool); mempool_destroy(nfs_wdata_mempool); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1630b5670dc..7c7d01672d3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags); */ /* recall_lock protects the del_recall_lru */ -static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(recall_lock); static struct list_head del_recall_lru; static void diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 21f38accd03..1d26cfcd9f8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem); * multiple hb threads are watching multiple regions. A node is live * whenever any of the threads sees activity from the node in its region. 
*/ -static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(o2hb_live_lock); static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0f60cc0d398..1591eb37a72 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -108,7 +108,7 @@ ##args); \ } while (0) -static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(o2net_handler_lock); static struct rb_root o2net_handler_tree = RB_ROOT; static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index ba27c5c5e95..b8c23f7ba67 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -88,7 +88,7 @@ out_free: * */ -spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(dlm_domain_lock); LIST_HEAD(dlm_domains); static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index d6f89577e25..5ca57ec650c 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -53,7 +53,7 @@ #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" -static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(dlm_cookie_lock); static u64 dlm_next_cookie = 1; static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index da399013516..29b2845f370 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -98,8 +98,8 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); static u64 dlm_get_next_mig_cookie(void); -static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; -static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(dlm_reco_state_lock); +static DEFINE_SPINLOCK(dlm_mig_cookie_lock); static u64 dlm_mig_cookie = 1; static u64 dlm_get_next_mig_cookie(void) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 64cd52860c8..4acd37286bd 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type, mlog_exit_void(); } -static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, struct ocfs2_dlm_debug *dlm_debug) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3fe8781c22c..910a601b2e9 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -49,7 +49,7 @@ #include "buffer_head_io.h" -spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(trans_inc_lock); static int ocfs2_force_read_journal(struct inode *inode); static int ocfs2_recover_node(struct ocfs2_super *osb, diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index ee42765a855..cf70fe2075b 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -988,9 +988,7 @@ int ocfs2_request_mount_vote(struct ocfs2_super *osb) } bail: - if (request) - kfree(request); - + kfree(request); return status; } @@ -1021,9 +1019,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb) } bail: - if (request) - kfree(request); - + kfree(request); return status; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0137ec4c136..0a163a4f776 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -122,6 +122,11 @@ struct mem_size_stats unsigned long 
private_dirty; }; +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { struct proc_maps_private *priv = m->private; @@ -158,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats pad_len_spaces(m, len); seq_path(m, file->f_vfsmnt, file->f_dentry, "\n"); } else { - if (mm) { - if (vma->vm_start <= mm->start_brk && + const char *name = arch_vma_name(vma); + if (!name) { + if (mm) { + if (vma->vm_start <= mm->start_brk && vma->vm_end >= mm->brk) { - pad_len_spaces(m, len); - seq_puts(m, "[heap]"); - } else { - if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { - - pad_len_spaces(m, len); - seq_puts(m, "[stack]"); + name = "[heap]"; + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + name = "[stack]"; } + } else { + name = "[vdso]"; } - } else { + } + if (name) { pad_len_spaces(m, len); - seq_puts(m, "[vdso]"); + seq_puts(m, name); } } seq_putc(m, '\n'); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index f2dbdf5a876..259bd196099 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -605,39 +605,12 @@ static void ufs_set_inode_ops(struct inode *inode) ufs_get_inode_dev(inode->i_sb, UFS_I(inode))); } -void ufs_read_inode (struct inode * inode) +static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) { struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_inode * ufs_inode; - struct ufs2_inode *ufs2_inode; - struct buffer_head * bh; + struct super_block *sb = inode->i_sb; mode_t mode; unsigned i; - unsigned flags; - - UFSD("ENTER, ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - flags = UFS_SB(sb)->s_flags; - - if (inode->i_ino < UFS_ROOTINO || - inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { - ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); - goto bad_inode; - } - - bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); - if (!bh) { - ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); - goto bad_inode; - } - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) - goto ufs2_inode; - - ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino)); /* * Copy data to the in-core inode. 
@@ -661,14 +634,11 @@ void ufs_read_inode (struct inode * inode) inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat) */ - inode->i_version++; ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); - ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; - ufsi->i_dir_start_lookup = 0; + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) @@ -677,24 +647,16 @@ void ufs_read_inode (struct inode * inode) for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i]; } - ufsi->i_osync = 0; - - ufs_set_inode_ops(inode); - - brelse (bh); - - UFSD("EXIT\n"); - return; +} -bad_inode: - make_bad_inode(inode); - return; +static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + mode_t mode; + unsigned i; -ufs2_inode : UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); - - ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino)); - /* * Copy data to the in-core inode. */ @@ -717,26 +679,64 @@ ufs2_inode : inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); - inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/ - - inode->i_version++; ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen); /* ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); */ - ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift; if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) ufsi->i_u1.u2_i_data[i] = ufs2_inode->ui_u2.ui_addr.ui_db[i]; - } - else { + } else { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i]; } +} + +void ufs_read_inode(struct inode * inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct buffer_head * bh; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + if (inode->i_ino < UFS_ROOTINO || + inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n", + inode->i_ino); + goto bad_inode; + } + + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n", + inode->i_ino); + goto bad_inode; + } + if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + ufs2_read_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data; + + ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/ + inode->i_version++; + ufsi->i_lastfrag = + (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; + 
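+	/* The remaining per-inode state below is (re)initialized identically for UFS1 and UFS2. */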
ufsi->i_dir_start_lookup = 0; ufsi->i_osync = 0; ufs_set_inode_ops(inode); @@ -745,6 +745,9 @@ ufs2_inode : UFSD("EXIT\n"); return; + +bad_inode: + make_bad_inode(inode); } static int ufs_update_inode(struct inode * inode, int do_sync) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 10dbf203c62..ed7579beb6b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1721,15 +1721,14 @@ xfs_mount_log_sbunit( * is present to prevent thrashing). */ +#ifdef CONFIG_HOTPLUG_CPU /* * hot-plug CPU notifier support. * - * We cannot use the hotcpu_register() function because it does - * not allow notifier instances. We need a notifier per filesystem - * as we need to be able to identify the filesystem to balance - * the counters out. This is achieved by having a notifier block - * embedded in the xfs_mount_t and doing pointer magic to get the - * mount pointer from the notifier block address. + * We need a notifier per filesystem as we need to be able to identify + * the filesystem to balance the counters out. This is achieved by + * having a notifier block embedded in the xfs_mount_t and doing pointer + * magic to get the mount pointer from the notifier block address. */ STATIC int xfs_icsb_cpu_notify( @@ -1779,6 +1778,7 @@ xfs_icsb_cpu_notify( return NOTIFY_OK; } +#endif /* CONFIG_HOTPLUG_CPU */ int xfs_icsb_init_counters( @@ -1791,9 +1791,11 @@ xfs_icsb_init_counters( if (mp->m_sb_cnts == NULL) return -ENOMEM; +#ifdef CONFIG_HOTPLUG_CPU mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; mp->m_icsb_notifier.priority = 0; - register_cpu_notifier(&mp->m_icsb_notifier); + register_hotcpu_notifier(&mp->m_icsb_notifier); +#endif /* CONFIG_HOTPLUG_CPU */ for_each_online_cpu(i) { cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); @@ -1812,7 +1814,7 @@ xfs_icsb_destroy_counters( xfs_mount_t *mp) { if (mp->m_sb_cnts) { - unregister_cpu_notifier(&mp->m_icsb_notifier); + unregister_hotcpu_notifier(&mp->m_icsb_notifier); free_percpu(mp->m_sb_cnts); } } diff --git a/include/asm-alpha/core_t2.h b/include/asm-alpha/core_t2.h index dba70c62a16..457c34b6eb0 100644 --- a/include/asm-alpha/core_t2.h +++ b/include/asm-alpha/core_t2.h @@ -435,7 +435,7 @@ static inline void t2_outl(u32 b, unsigned long addr) set_hae(msb); \ } -static spinlock_t t2_hae_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(t2_hae_lock); __EXTERN_INLINE u8 t2_readb(const volatile void __iomem *xaddr) { diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 845cb67ad8e..8ceab7bcd8b 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -51,4 +51,10 @@ __ret; \ }) +#ifdef CONFIG_SMP +# define WARN_ON_SMP(x) WARN_ON(x) +#else +# define WARN_ON_SMP(x) do { } while (0) +#endif + #endif diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h index e7252c216ca..b1bc7b1b64b 100644 --- a/include/asm-i386/cpu.h +++ b/include/asm-i386/cpu.h @@ -7,8 +7,6 @@ #include <linux/nodemask.h> #include <linux/percpu.h> -#include <asm/node.h> - struct i386_cpu { struct cpu cpu; }; diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h index 4153d80e4d2..1eac92cb5b1 100644 --- a/include/asm-i386/elf.h +++ b/include/asm-i386/elf.h @@ -10,6 +10,7 @@ #include <asm/processor.h> #include <asm/system.h> /* for savesegment */ #include <asm/auxvec.h> +#include <asm/desc.h> #include <linux/utsname.h> @@ -129,15 +130,41 @@ extern int dump_task_extended_fpu (struct task_struct *, struct user_fxsr_struct #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs) #define 
ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs) -#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL)) -#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE) -#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) +#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) +#define VDSO_BASE ((unsigned long)current->mm->context.vdso) + +#ifdef CONFIG_COMPAT_VDSO +# define VDSO_COMPAT_BASE VDSO_HIGH_BASE +# define VDSO_PRELINK VDSO_HIGH_BASE +#else +# define VDSO_COMPAT_BASE VDSO_BASE +# define VDSO_PRELINK 0 +#endif + +#define VDSO_COMPAT_SYM(x) \ + (VDSO_COMPAT_BASE + (unsigned long)(x) - VDSO_PRELINK) + +#define VDSO_SYM(x) \ + (VDSO_BASE + (unsigned long)(x) - VDSO_PRELINK) + +#define VDSO_HIGH_EHDR ((const struct elfhdr *) VDSO_HIGH_BASE) +#define VDSO_EHDR ((const struct elfhdr *) VDSO_COMPAT_BASE) + extern void __kernel_vsyscall; +#define VDSO_ENTRY VDSO_SYM(&__kernel_vsyscall) + +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES +struct linux_binprm; +extern int arch_setup_additional_pages(struct linux_binprm *bprm, + int executable_stack); + +extern unsigned int vdso_enabled; + #define ARCH_DLINFO \ -do { \ - NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ +do if (vdso_enabled) { \ + NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_COMPAT_BASE); \ } while (0) /* @@ -148,15 +175,15 @@ do { \ * Dumping its extra ELF program headers includes all the other information * a debugger needs to easily find how the vsyscall DSO was being used. */ -#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) +#define ELF_CORE_EXTRA_PHDRS (VDSO_HIGH_EHDR->e_phnum) #define ELF_CORE_WRITE_EXTRA_PHDRS \ do { \ const struct elf_phdr *const vsyscall_phdrs = \ - (const struct elf_phdr *) (VSYSCALL_BASE \ - + VSYSCALL_EHDR->e_phoff); \ + (const struct elf_phdr *) (VDSO_HIGH_BASE \ + + VDSO_HIGH_EHDR->e_phoff); \ int i; \ Elf32_Off ofs = 0; \ - for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ + for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) { \ struct elf_phdr phdr = vsyscall_phdrs[i]; \ if (phdr.p_type == PT_LOAD) { \ BUG_ON(ofs != 0); \ @@ -174,10 +201,10 @@ do { \ #define ELF_CORE_WRITE_EXTRA_DATA \ do { \ const struct elf_phdr *const vsyscall_phdrs = \ - (const struct elf_phdr *) (VSYSCALL_BASE \ - + VSYSCALL_EHDR->e_phoff); \ + (const struct elf_phdr *) (VDSO_HIGH_BASE \ + + VDSO_HIGH_EHDR->e_phoff); \ int i; \ - for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ + for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) { \ if (vsyscall_phdrs[i].p_type == PT_LOAD) \ DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \ PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h index f7e068f4d2f..a48cc3f7ccc 100644 --- a/include/asm-i386/fixmap.h +++ b/include/asm-i386/fixmap.h @@ -51,7 +51,7 @@ */ enum fixed_addresses { FIX_HOLE, - FIX_VSYSCALL, + FIX_VDSO, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -115,14 +115,6 @@ extern void __set_fixmap (enum fixed_addresses idx, #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) -/* - * This is the range that is readable by user mode, and things - * acting like user mode such as get_user_pages. 
- */ -#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL)) -#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) - - extern void __this_fixmap_does_not_exist(void); /* diff --git a/include/asm-i386/mmu.h b/include/asm-i386/mmu.h index f431a0b86d4..8358dd3df7a 100644 --- a/include/asm-i386/mmu.h +++ b/include/asm-i386/mmu.h @@ -12,6 +12,7 @@ typedef struct { int size; struct semaphore sem; void *ldt; + void *vdso; } mm_context_t; #endif diff --git a/include/asm-i386/node.h b/include/asm-i386/node.h deleted file mode 100644 index e13c6ffa72a..00000000000 --- a/include/asm-i386/node.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _ASM_I386_NODE_H_ -#define _ASM_I386_NODE_H_ - -#include <linux/device.h> -#include <linux/mmzone.h> -#include <linux/node.h> -#include <linux/topology.h> -#include <linux/nodemask.h> - -struct i386_node { - struct node node; -}; -extern struct i386_node node_devices[MAX_NUMNODES]; - -static inline int arch_register_node(int num){ - int p_node; - struct node *parent = NULL; - - if (!node_online(num)) - return 0; - p_node = parent_node(num); - - if (p_node != num) - parent = &node_devices[p_node].node; - - return register_node(&node_devices[num].node, num, parent); -} - -#endif /* _ASM_I386_NODE_H_ */ diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index e3a552fa553..f5bf544c729 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -96,6 +96,8 @@ typedef struct { unsigned long pgprot; } pgprot_t; #ifndef __ASSEMBLY__ +struct vm_area_struct; + /* * This much address space is reserved for vmalloc() and iomap() * as well as fixmap mappings. @@ -139,6 +141,7 @@ extern int page_is_ram(unsigned long pagenr); #include <asm-generic/memory_model.h> #include <asm-generic/page.h> +#define __HAVE_ARCH_GATE_AREA 1 #endif /* __KERNEL__ */ #endif /* _I386_PAGE_H */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 55ea992da32..b32346d62e1 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -71,8 +71,12 @@ struct cpuinfo_x86 { cpumask_t llc_shared_map; /* cpus sharing the last level cache */ #endif unsigned char x86_max_cores; /* cpuid returned max cores value */ - unsigned char booted_cores; /* number of cores as seen by OS */ unsigned char apicid; +#ifdef CONFIG_SMP + unsigned char booted_cores; /* number of cores as seen by OS */ + __u8 phys_proc_id; /* Physical processor id. 
*/ + __u8 cpu_core_id; /* Core id */ +#endif } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -104,8 +108,6 @@ extern struct cpuinfo_x86 cpu_data[]; #define current_cpu_data boot_cpu_data #endif -extern int phys_proc_id[NR_CPUS]; -extern int cpu_core_id[NR_CPUS]; extern int cpu_llc_id[NR_CPUS]; extern char ignore_fpu_irq; diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index fdbc7f422ea..2833fa2c0dd 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -37,6 +37,7 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + void *sysenter_return; struct restart_block restart_block; unsigned long previous_esp; /* ESP of the previous stack in case @@ -83,17 +84,15 @@ struct thread_info { #define init_stack (init_thread_union.stack) +/* how to get the current stack pointer from C */ +register unsigned long current_stack_pointer asm("esp") __attribute_used__; + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { - struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); - return ti; + return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } -/* how to get the current stack pointer from C */ -register unsigned long current_stack_pointer asm("esp") __attribute_used__; - /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE #define alloc_thread_info(tsk) \ diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index b94e5eeef91..6adbd9b1ae8 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -28,10 +28,8 @@ #define _ASM_I386_TOPOLOGY_H #ifdef CONFIG_X86_HT -#define topology_physical_package_id(cpu) \ - (phys_proc_id[cpu] == BAD_APICID ? -1 : phys_proc_id[cpu]) -#define topology_core_id(cpu) \ - (cpu_core_id[cpu] == BAD_APICID ? 0 : cpu_core_id[cpu]) +#define topology_physical_package_id(cpu) (cpu_data[cpu].phys_proc_id) +#define topology_core_id(cpu) (cpu_data[cpu].cpu_core_id) #define topology_core_siblings(cpu) (cpu_core_map[cpu]) #define topology_thread_siblings(cpu) (cpu_sibling_map[cpu]) #endif @@ -114,4 +112,9 @@ extern unsigned long node_remap_size[]; extern cpumask_t cpu_coregroup_map(int cpu); +#ifdef CONFIG_SMP +#define mc_capable() (boot_cpu_data.x86_max_cores > 1) +#define smt_capable() (smp_num_siblings > 1) +#endif + #endif /* _ASM_I386_TOPOLOGY_H */ diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index d480f2e3821..69f0f1df672 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -78,8 +78,8 @@ static inline int arch_unw_user_mode(const struct unwind_frame_info *info) return user_mode_vm(&info->regs); #else return info->regs.eip < PAGE_OFFSET - || (info->regs.eip >= __fix_to_virt(FIX_VSYSCALL) - && info->regs.eip < __fix_to_virt(FIX_VSYSCALL) + PAGE_SIZE) + || (info->regs.eip >= __fix_to_virt(FIX_VDSO) + && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE) || info->regs.esp < PAGE_OFFSET; #endif } diff --git a/include/asm-ia64/nodedata.h b/include/asm-ia64/nodedata.h index a140310bf84..2fb337b0e9b 100644 --- a/include/asm-ia64/nodedata.h +++ b/include/asm-ia64/nodedata.h @@ -46,6 +46,18 @@ struct ia64_node_data { */ #define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid]) +/* + * LOCAL_DATA_ADDR - This is to calculate the address of other node's + * "local_node_data" at hot-plug phase. The local_node_data + * is pointed by per_cpu_page. 
Kernel usually use it for + * just executing cpu. However, when new node is hot-added, + * the addresses of local data for other nodes are necessary + * to update all of them. + */ +#define LOCAL_DATA_ADDR(pgdat) \ + ((struct ia64_node_data *)((u64)(pgdat) + \ + L1_CACHE_ALIGN(sizeof(struct pglist_data)))) + #endif /* CONFIG_NUMA */ #endif /* _ASM_IA64_NODEDATA_H */ diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index 616b5ed2aa7..937c2125752 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h @@ -112,6 +112,7 @@ void build_cpu_to_node_map(void); #define topology_core_id(cpu) (cpu_data(cpu)->core_id) #define topology_core_siblings(cpu) (cpu_core_map[cpu]) #define topology_thread_siblings(cpu) (cpu_sibling_map[cpu]) +#define smt_capable() (smp_num_siblings > 1) #endif #include <asm-generic/topology.h> diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h index 92f3e5507d2..bbc3844b086 100644 --- a/include/asm-powerpc/topology.h +++ b/include/asm-powerpc/topology.h @@ -93,5 +93,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev, #endif /* CONFIG_NUMA */ +#ifdef CONFIG_SMP +#include <asm/cputable.h> +#define smt_capable() (cpu_has_feature(CPU_FTR_SMT)) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TOPOLOGY_H */ diff --git a/include/asm-sparc64/topology.h b/include/asm-sparc64/topology.h index 0e234e201bd..98a6c613589 100644 --- a/include/asm-sparc64/topology.h +++ b/include/asm-sparc64/topology.h @@ -1,6 +1,9 @@ #ifndef _ASM_SPARC64_TOPOLOGY_H #define _ASM_SPARC64_TOPOLOGY_H +#include <asm/spitfire.h> +#define smt_capable() (tlb_type == hypervisor) + #include <asm-generic/topology.h> #endif /* _ASM_SPARC64_TOPOLOGY_H */ diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h index 1b2ac55d320..93187746278 100644 --- a/include/asm-x86_64/hw_irq.h +++ b/include/asm-x86_64/hw_irq.h @@ -124,7 +124,7 @@ asmlinkage void IRQ_NAME(nr); \ __asm__( \ "\n.p2align\n" \ "IRQ" #nr "_interrupt:\n\t" \ - "push $" #nr "-256 ; " \ + "push $~(" #nr ") ; " \ "jmp common_interrupt"); #if defined(CONFIG_X86_IO_APIC) diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index c4e46e7fa7b..6e7a2e976b0 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -59,6 +59,8 @@ extern int __node_distance(int, int); #define topology_core_id(cpu) (cpu_data[cpu].cpu_core_id) #define topology_core_siblings(cpu) (cpu_core_map[cpu]) #define topology_thread_siblings(cpu) (cpu_sibling_map[cpu]) +#define mc_capable() (boot_cpu_data.x86_max_cores > 1) +#define smt_capable() (smp_num_siblings > 1) #endif #include <asm-generic/topology.h> diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 90d6df1551e..88b5dfd8ee1 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -528,12 +528,18 @@ static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; } #ifdef CONFIG_ACPI_NUMA int acpi_get_pxm(acpi_handle handle); +int acpi_get_node(acpi_handle *handle); #else static inline int acpi_get_pxm(acpi_handle handle) { return 0; } +static inline int acpi_get_node(acpi_handle *handle) +{ + return 0; +} #endif +extern int acpi_paddr_to_node(u64 start_addr, u64 size); extern int pnpacpi_disabled; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index fb7e9b7ccbe..737e407d0cd 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -149,7 +149,6 @@ void create_empty_buffers(struct page *, unsigned long, 
unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); -void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); @@ -214,6 +213,7 @@ int nobh_truncate_page(struct address_space *, loff_t); int nobh_writepage(struct page *page, get_block_t *get_block, struct writeback_control *wbc); +void buffer_init(void); /* * inline definitions diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 08d50c53aab..a3caf6866ba 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -31,17 +31,23 @@ struct cpu { struct sys_device sysdev; }; -extern int register_cpu(struct cpu *, int, struct node *); +extern int register_cpu(struct cpu *cpu, int num); extern struct sys_device *get_cpu_sysdev(unsigned cpu); #ifdef CONFIG_HOTPLUG_CPU -extern void unregister_cpu(struct cpu *, struct node *); +extern void unregister_cpu(struct cpu *cpu); #endif struct notifier_block; #ifdef CONFIG_SMP /* Need to know about CPUs going up/down? */ extern int register_cpu_notifier(struct notifier_block *nb); +#ifdef CONFIG_HOTPLUG_CPU extern void unregister_cpu_notifier(struct notifier_block *nb); +#else +static inline void unregister_cpu_notifier(struct notifier_block *nb) +{ +} +#endif extern int current_in_cpu_hotplug(void); int cpu_up(unsigned int cpu); @@ -73,6 +79,8 @@ extern int lock_cpu_hotplug_interruptible(void); { .notifier_call = fn, .priority = pri }; \ register_cpu_notifier(&fn##_nb); \ } +#define register_hotcpu_notifier(nb) register_cpu_notifier(nb) +#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) int cpu_down(unsigned int cpu); #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #else @@ -80,6 +88,8 @@ int cpu_down(unsigned int cpu); #define unlock_cpu_hotplug() do { } while (0) #define lock_cpu_hotplug_interruptible() 0 #define hotcpu_notifier(fn, pri) +#define register_hotcpu_notifier(nb) +#define unregister_hotcpu_notifier(nb) /* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */ static inline int cpu_is_offline(int cpu) { return 0; } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 78b236ca04f..272010a6078 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -20,7 +20,7 @@ */ #ifndef DMAENGINE_H #define DMAENGINE_H -#include <linux/config.h> + #ifdef CONFIG_DMA_ENGINE #include <linux/device.h> diff --git a/include/linux/futex.h b/include/linux/futex.h index 966a5b3da43..34c3a215f2c 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -12,6 +12,9 @@ #define FUTEX_REQUEUE 3 #define FUTEX_CMP_REQUEUE 4 #define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 /* * Support for robust futexes: the kernel cleans up held futexes at @@ -90,18 +93,21 @@ struct robust_list_head { */ #define ROBUST_LIST_LIMIT 2048 -long do_futex(unsigned long uaddr, int op, int val, - unsigned long timeout, unsigned long uaddr2, int val2, - int val3); +long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, + u32 __user *uaddr2, u32 val2, u32 val3); extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr); #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); +extern void exit_pi_state_list(struct task_struct *curr); #else static inline void exit_robust_list(struct task_struct *curr) { } +static 
inline void exit_pi_state_list(struct task_struct *curr) +{ +} #endif #define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e127ef7e8da..3a256957fb5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -87,6 +87,7 @@ extern struct group_info init_groups; .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ @@ -122,6 +123,8 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ + .pi_lock = SPIN_LOCK_UNLOCKED, \ + INIT_RT_MUTEXES(tsk) \ } diff --git a/include/linux/ioport.h b/include/linux/ioport.h index cd6bd001ba4..edfc733b157 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -105,6 +105,9 @@ extern int allocate_resource(struct resource *root, struct resource *new, int adjust_resource(struct resource *res, unsigned long start, unsigned long size); +/* get registered SYSTEM_RAM resources in specified area */ +extern int find_next_system_ram(struct resource *res); + /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name)) #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name)) diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 5653b2f23b6..d09fbeabf1d 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -210,11 +210,7 @@ struct kernel_ipmi_msg #include <linux/list.h> #include <linux/module.h> #include <linux/device.h> - -#ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> -extern struct proc_dir_entry *proc_ipmi_root; -#endif /* CONFIG_PROC_FS */ /* Opaque type for a IPMI message user. One of these is needed to send and receive messages. */ diff --git a/include/linux/list.h b/include/linux/list.h index 37ca31b21bb..6b74adf5297 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -4,18 +4,11 @@ #ifdef __KERNEL__ #include <linux/stddef.h> +#include <linux/poison.h> #include <linux/prefetch.h> #include <asm/system.h> /* - * These are non-NULL pointers that will result in page faults - * under normal circumstances, used to verify that nobody uses - * non-initialized list entries. - */ -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) - -/* * Simple doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 91120638617..218501cfaeb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -63,6 +63,76 @@ extern int online_pages(unsigned long, unsigned long); /* reasonably generic interface to expand the physical pages in a zone */ extern int __add_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); + +#ifdef CONFIG_NUMA +extern int memory_add_physaddr_to_nid(u64 start); +#else +static inline int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +#endif + +#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION +/* + * For supporting node-hotadd, we have to allocate a new pgdat. + * + * If an arch has generic style NODE_DATA(), + * node_data[nid] = kzalloc() works well. But it depends on the architecture. + * + * In general, generic_alloc_nodedata() is used. 
+ * Now, arch_free_nodedata() is just defined for error path of node_hot_add. + * + */ +extern pg_data_t *arch_alloc_nodedata(int nid); +extern void arch_free_nodedata(pg_data_t *pgdat); +extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); + +#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + +#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) +#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) + +#ifdef CONFIG_NUMA +/* + * If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat. + * XXX: kmalloc_node() can't work well to get new node's memory at this time. + * Because, pgdat for the new node is not allocated/initialized yet itself. + * To use new node's memory, more consideration will be necessary. + */ +#define generic_alloc_nodedata(nid) \ +({ \ + kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ +}) +/* + * This definition is just for error path in node hotadd. + * For node hotremove, we have to replace this. + */ +#define generic_free_nodedata(pgdat) kfree(pgdat) + +extern pg_data_t *node_data[]; +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ + node_data[nid] = pgdat; +} + +#else /* !CONFIG_NUMA */ + +/* never called */ +static inline pg_data_t *generic_alloc_nodedata(int nid) +{ + BUG(); + return NULL; +} +static inline void generic_free_nodedata(pg_data_t *pgdat) +{ +} +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + #else /* ! CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off @@ -99,7 +169,8 @@ static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, return -ENOSYS; } -extern int add_memory(u64 start, u64 size); +extern int add_memory(int nid, u64 start, u64 size); +extern int arch_add_memory(int nid, u64 start, u64 size); extern int remove_memory(u64 start, u64 size); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index a929ea197e4..c41a1299b8c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1030,13 +1030,20 @@ static inline void vm_stat_account(struct mm_struct *mm, } #endif /* CONFIG_PROC_FS */ +static inline void +debug_check_no_locks_freed(const void *from, unsigned long len) +{ + mutex_debug_check_no_locks_freed(from, len); + rt_mutex_debug_check_no_locks_freed(from, len); +} + #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) { if (!PageHighMem(page) && !enable) - mutex_debug_check_no_locks_freed(page_address(page), - numpages * PAGE_SIZE); + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); } #endif @@ -1065,5 +1072,7 @@ void drop_slab(void); extern int randomize_va_space; #endif +const char *arch_vma_name(struct vm_area_struct *vma); + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/node.h b/include/linux/node.h index 254dc3de650..81dcec84cd8 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -26,8 +26,25 @@ struct node { struct sys_device sysdev; }; +extern struct node node_devices[]; + extern int register_node(struct node *, int, struct node *); extern void unregister_node(struct node *node); +extern int register_one_node(int nid); +extern void unregister_one_node(int nid); +#ifdef CONFIG_NUMA +extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); +extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); +#else +static inline int 
register_cpu_under_node(unsigned int cpu, unsigned int nid) +{ + return 0; +} +static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) +{ + return 0; +} +#endif #define to_node(sys_device) container_of(sys_device, struct node, sysdev) diff --git a/include/linux/nsc_gpio.h b/include/linux/nsc_gpio.h new file mode 100644 index 00000000000..135742cfada --- /dev/null +++ b/include/linux/nsc_gpio.h @@ -0,0 +1,42 @@ +/** + nsc_gpio.c + + National Semiconductor GPIO common access methods. + + struct nsc_gpio_ops abstracts the low-level access + operations for the GPIO units on 2 NSC chip families; the GEODE + integrated CPU, and the PC-8736[03456] integrated PC-peripheral + chips. + + The GPIO units on these chips have the same pin architecture, but + the access methods differ. Thus, scx200_gpio and pc8736x_gpio + implement their own versions of these routines; and use the common + file-operations routines implemented in nsc_gpio module. + + Copyright (c) 2005 Jim Cromie <jim.cromie@gmail.com> + + NB: this work was tested on the Geode SC-1100 and PC-87366 chips. + NSC sold the GEODE line to AMD, and the PC-8736x line to Winbond. +*/ + +struct nsc_gpio_ops { + struct module* owner; + u32 (*gpio_config) (unsigned iminor, u32 mask, u32 bits); + void (*gpio_dump) (struct nsc_gpio_ops *amp, unsigned iminor); + int (*gpio_get) (unsigned iminor); + void (*gpio_set) (unsigned iminor, int state); + void (*gpio_set_high)(unsigned iminor); + void (*gpio_set_low) (unsigned iminor); + void (*gpio_change) (unsigned iminor); + int (*gpio_current) (unsigned iminor); + struct device* dev; /* for dev_dbg() support, set in init */ +}; + +extern ssize_t nsc_gpio_write(struct file *file, const char __user *data, + size_t len, loff_t *ppos); + +extern ssize_t nsc_gpio_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos); + +extern void nsc_gpio_dump(struct nsc_gpio_ops *amp, unsigned index); + diff --git a/include/linux/plist.h b/include/linux/plist.h new file mode 100644 index 00000000000..3404faef542 --- /dev/null +++ b/include/linux/plist.h @@ -0,0 +1,247 @@ +/* + * Descending-priority-sorted double-linked list + * + * (C) 2002-2003 Intel Corp + * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. + * + * 2001-2005 (c) MontaVista Software, Inc. + * Daniel Walker <dwalker@mvista.com> + * + * (C) 2005 Thomas Gleixner <tglx@linutronix.de> + * + * Simplifications of the original code by + * Oleg Nesterov <oleg@tv-sign.ru> + * + * Licensed under the FSF's GNU Public License v2 or later. + * + * Based on simple lists (include/linux/list.h). + * + * This is a priority-sorted list of nodes; each node has a + * priority from INT_MIN (highest) to INT_MAX (lowest). + * + * Addition is O(K), removal is O(1), change of priority of a node is + * O(K) and K is the number of RT priority levels used in the system. + * (1 <= K <= 99) + * + * This list is really a list of lists: + * + * - The tier 1 list is the prio_list, different priority nodes. + * + * - The tier 2 list is the node_list, serialized nodes. 
+ * + * Simple ASCII art explanation: + * + * |HEAD | + * | | + * |prio_list.prev|<------------------------------------| + * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-| + * |10 | |10| |21| |21| |21| |40| (prio) + * | | | | | | | | | | | | + * | | | | | | | | | | | | + * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-| + * |node_list.prev|<------------------------------------| + * + * The nodes on the prio_list list are sorted by priority to simplify + * the insertion of new nodes. There are no nodes with duplicate + * priorites on the list. + * + * The nodes on the node_list is ordered by priority and can contain + * entries which have the same priority. Those entries are ordered + * FIFO + * + * Addition means: look for the prio_list node in the prio_list + * for the priority of the node and insert it before the node_list + * entry of the next prio_list node. If it is the first node of + * that priority, add it to the prio_list in the right position and + * insert it into the serialized node_list list + * + * Removal means remove it from the node_list and remove it from + * the prio_list if the node_list list_head is non empty. In case + * of removal from the prio_list it must be checked whether other + * entries of the same priority are on the list or not. If there + * is another entry of the same priority then this entry has to + * replace the removed entry on the prio_list. If the entry which + * is removed is the only entry of this priority then a simple + * remove from both list is sufficient. + * + * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX + * is lowest priority. + * + * No locking is done, up to the caller. + * + */ +#ifndef _LINUX_PLIST_H_ +#define _LINUX_PLIST_H_ + +#include <linux/list.h> +#include <linux/spinlock_types.h> + +struct plist_head { + struct list_head prio_list; + struct list_head node_list; +#ifdef CONFIG_DEBUG_PI_LIST + spinlock_t *lock; +#endif +}; + +struct plist_node { + int prio; + struct plist_head plist; +}; + +#ifdef CONFIG_DEBUG_PI_LIST +# define PLIST_HEAD_LOCK_INIT(_lock) .lock = _lock +#else +# define PLIST_HEAD_LOCK_INIT(_lock) +#endif + +/** + * #PLIST_HEAD_INIT - static struct plist_head initializer + * + * @head: struct plist_head variable name + */ +#define PLIST_HEAD_INIT(head, _lock) \ +{ \ + .prio_list = LIST_HEAD_INIT((head).prio_list), \ + .node_list = LIST_HEAD_INIT((head).node_list), \ + PLIST_HEAD_LOCK_INIT(&(_lock)) \ +} + +/** + * #PLIST_NODE_INIT - static struct plist_node initializer + * + * @node: struct plist_node variable name + * @__prio: initial node priority + */ +#define PLIST_NODE_INIT(node, __prio) \ +{ \ + .prio = (__prio), \ + .plist = PLIST_HEAD_INIT((node).plist, NULL), \ +} + +/** + * plist_head_init - dynamic struct plist_head initializer + * + * @head: &struct plist_head pointer + */ +static inline void +plist_head_init(struct plist_head *head, spinlock_t *lock) +{ + INIT_LIST_HEAD(&head->prio_list); + INIT_LIST_HEAD(&head->node_list); +#ifdef CONFIG_DEBUG_PI_LIST + head->lock = lock; +#endif +} + +/** + * plist_node_init - Dynamic struct plist_node initializer + * + * @node: &struct plist_node pointer + * @prio: initial node priority + */ +static inline void plist_node_init(struct plist_node *node, int prio) +{ + node->prio = prio; + plist_head_init(&node->plist, NULL); +} + +extern void plist_add(struct plist_node *node, struct plist_head *head); +extern void plist_del(struct plist_node *node, struct plist_head *head); + +/** + * plist_for_each - iterate over the plist + * + * 
@pos1: the type * to use as a loop counter. + * @head: the head for your list. + */ +#define plist_for_each(pos, head) \ + list_for_each_entry(pos, &(head)->node_list, plist.node_list) + +/** + * plist_for_each_entry_safe - iterate over a plist of given type safe + * against removal of list entry + * + * @pos1: the type * to use as a loop counter. + * @n1: another type * to use as temporary storage + * @head: the head for your list. + */ +#define plist_for_each_safe(pos, n, head) \ + list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list) + +/** + * plist_for_each_entry - iterate over list of given type + * + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define plist_for_each_entry(pos, head, mem) \ + list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list) + +/** + * plist_for_each_entry_safe - iterate over list of given type safe against + * removal of list entry + * + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @m: the name of the list_struct within the struct. + */ +#define plist_for_each_entry_safe(pos, n, head, m) \ + list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list) + +/** + * plist_head_empty - return !0 if a plist_head is empty + * + * @head: &struct plist_head pointer + */ +static inline int plist_head_empty(const struct plist_head *head) +{ + return list_empty(&head->node_list); +} + +/** + * plist_node_empty - return !0 if plist_node is not on a list + * + * @node: &struct plist_node pointer + */ +static inline int plist_node_empty(const struct plist_node *node) +{ + return plist_head_empty(&node->plist); +} + +/* All functions below assume the plist_head is not empty. */ + +/** + * plist_first_entry - get the struct for the first entry + * + * @ptr: the &struct plist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#ifdef CONFIG_DEBUG_PI_LIST +# define plist_first_entry(head, type, member) \ +({ \ + WARN_ON(plist_head_empty(head)); \ + container_of(plist_first(head), type, member); \ +}) +#else +# define plist_first_entry(head, type, member) \ + container_of(plist_first(head), type, member) +#endif + +/** + * plist_first - return the first node (and thus, highest priority) + * + * @head: the &struct plist_head pointer + * + * Assumes the plist is _not_ empty. + */ +static inline struct plist_node* plist_first(const struct plist_head *head) +{ + return list_entry(head->node_list.next, + struct plist_node, plist.node_list); +} + +#endif diff --git a/include/linux/poison.h b/include/linux/poison.h new file mode 100644 index 00000000000..a5347c02432 --- /dev/null +++ b/include/linux/poison.h @@ -0,0 +1,58 @@ +#ifndef _LINUX_POISON_H +#define _LINUX_POISON_H + +/********** include/linux/list.h **********/ +/* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +/********** mm/slab.c **********/ +/* + * Magic nums for obj red zoning. + * Placed in the first word before and the first word after an obj. 
+ */ +#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ +#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ + +/* ...and for poisoning */ +#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ +#define POISON_FREE 0x6b /* for use-after-free poisoning */ +#define POISON_END 0xa5 /* end-byte of poisoning */ + +/********** arch/$ARCH/mm/init.c **********/ +#define POISON_FREE_INITMEM 0xcc + +/********** arch/x86_64/mm/init.c **********/ +#define POISON_FREE_INITDATA 0xba + +/********** arch/ia64/hp/common/sba_iommu.c **********/ +/* + * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a + * value of "SBAIOMMU POISON\0" for spill-over poisoning. + */ + +/********** fs/jbd/journal.c **********/ +#define JBD_POISON_FREE 0x5b + +/********** drivers/base/dmapool.c **********/ +#define POOL_POISON_FREED 0xa7 /* !inuse */ +#define POOL_POISON_ALLOCATED 0xa9 /* !initted */ + +/********** drivers/atm/ **********/ +#define ATM_POISON_FREE 0x12 + +/********** kernel/mutexes **********/ +#define MUTEX_DEBUG_INIT 0x11 +#define MUTEX_DEBUG_FREE 0x22 + +/********** security/ **********/ +#define KEY_DESTROY 0xbd + +/********** sound/oss/ **********/ +#define OSS_POISON_FREE 0xAB + +#endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6312758393b..48dfe00070c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -258,6 +258,7 @@ extern void rcu_init(void); extern void rcu_check_callbacks(int cpu, int user); extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); /* Exported interfaces */ extern void FASTCALL(call_rcu(struct rcu_head *head, diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h new file mode 100644 index 00000000000..fa4a3b82ba7 --- /dev/null +++ b/include/linux/rtmutex.h @@ -0,0 +1,117 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains the public data structure and API definitions. 
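As an aside on poison.h above: the byte patterns are typically applied and checked along these lines. This is a sketch modelled loosely on mm/slab.c's object poisoning; the helper names are hypothetical.

    #include <linux/poison.h>
    #include <linux/string.h>

    /* Fill a freed object so that later writes to it become detectable. */
    static void poison_free_object(void *obj, size_t size)
    {
        memset(obj, POISON_FREE, size - 1);
        ((unsigned char *)obj)[size - 1] = POISON_END;
    }

    /* Return 0 if the poison pattern was overwritten (use after free). */
    static int poison_intact(const void *obj, size_t size)
    {
        const unsigned char *p = obj;
        size_t i;

        for (i = 0; i < size - 1; i++)
            if (p[i] != POISON_FREE)
                return 0;
        return p[size - 1] == POISON_END;
    }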
+ */ + +#ifndef __LINUX_RT_MUTEX_H +#define __LINUX_RT_MUTEX_H + +#include <linux/linkage.h> +#include <linux/plist.h> +#include <linux/spinlock_types.h> + +/* + * The rt_mutex structure + * + * @wait_lock: spinlock to protect the structure + * @wait_list: pilist head to enqueue waiters in priority order + * @owner: the mutex owner + */ +struct rt_mutex { + spinlock_t wait_lock; + struct plist_head wait_list; + struct task_struct *owner; +#ifdef CONFIG_DEBUG_RT_MUTEXES + int save_state; + struct list_head held_list_entry; + unsigned long acquire_ip; + const char *name, *file; + int line; + void *magic; +#endif +}; + +struct rt_mutex_waiter; +struct hrtimer_sleeper; + +#ifdef CONFIG_DEBUG_RT_MUTEXES + extern int rt_mutex_debug_check_no_locks_freed(const void *from, + unsigned long len); + extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task); +#else + static inline int rt_mutex_debug_check_no_locks_freed(const void *from, + unsigned long len) + { + return 0; + } +# define rt_mutex_debug_check_no_locks_held(task) do { } while (0) +#endif + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ + , .name = #mutexname, .file = __FILE__, .line = __LINE__ +# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __FUNCTION__) + extern void rt_mutex_debug_task_free(struct task_struct *tsk); +#else +# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) +# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL) +# define rt_mutex_debug_task_free(t) do { } while (0) +#endif + +#define __RT_MUTEX_INITIALIZER(mutexname) \ + { .wait_lock = SPIN_LOCK_UNLOCKED \ + , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ + , .owner = NULL \ + __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + +#define DEFINE_RT_MUTEX(mutexname) \ + struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) + +/*** + * rt_mutex_is_locked - is the mutex locked + * @lock: the mutex to be queried + * + * Returns 1 if the mutex is locked, 0 if unlocked. 
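A short, hedged usage sketch for the API declared in this header. The lock name and critical sections are illustrative; rt_mutex_lock() may sleep, so all of this is process context only.

    #include <linux/rtmutex.h>

    static DEFINE_RT_MUTEX(conf_lock);

    static void update_config(void)
    {
        rt_mutex_lock(&conf_lock);
        /* critical section: if a higher-priority task blocks on
         * conf_lock now, our priority is boosted until we unlock */
        rt_mutex_unlock(&conf_lock);

        /* non-blocking attempt */
        if (rt_mutex_trylock(&conf_lock)) {
            /* ... */
            rt_mutex_unlock(&conf_lock);
        }

        /* interruptible variant; the second argument enables
         * deadlock detection */
        if (!rt_mutex_lock_interruptible(&conf_lock, 0)) {
            /* ... */
            rt_mutex_unlock(&conf_lock);
        }
    }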
+ */ +static inline int rt_mutex_is_locked(struct rt_mutex *lock) +{ + return lock->owner != NULL; +} + +extern void __rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void rt_mutex_destroy(struct rt_mutex *lock); + +extern void rt_mutex_lock(struct rt_mutex *lock); +extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock); +extern int rt_mutex_timed_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *timeout, + int detect_deadlock); + +extern int rt_mutex_trylock(struct rt_mutex *lock); + +extern void rt_mutex_unlock(struct rt_mutex *lock); + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define INIT_RT_MUTEX_DEBUG(tsk) \ + .held_list_head = LIST_HEAD_INIT(tsk.held_list_head), \ + .held_list_lock = SPIN_LOCK_UNLOCKED +#else +# define INIT_RT_MUTEX_DEBUG(tsk) +#endif + +#ifdef CONFIG_RT_MUTEXES +# define INIT_RT_MUTEXES(tsk) \ + .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ + INIT_RT_MUTEX_DEBUG(tsk) +#else +# define INIT_RT_MUTEXES(tsk) +#endif + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 122a25c1b99..821f0481ebe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -73,6 +73,7 @@ struct sched_param { #include <linux/seccomp.h> #include <linux/rcupdate.h> #include <linux/futex.h> +#include <linux/rtmutex.h> #include <linux/time.h> #include <linux/param.h> @@ -83,6 +84,7 @@ struct sched_param { #include <asm/processor.h> struct exec_domain; +struct futex_pi_state; /* * List of flags we want to share for kernel threads, @@ -123,6 +125,7 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long weighted_cpuload(const int cpu); /* @@ -494,8 +497,11 @@ struct signal_struct { #define MAX_PRIO (MAX_RT_PRIO + 40) -#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +#define has_rt_policy(p) \ + unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH) /* * Some day this will be a full-fledged user tracking system.. @@ -558,9 +564,9 @@ enum idle_type /* * sched-domains (multiprocessor balancing) declarations: */ -#ifdef CONFIG_SMP #define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ +#ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ #define SD_BALANCE_EXEC 4 /* Balance on exec */ @@ -569,6 +575,11 @@ enum idle_type #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ +#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ + +#define BALANCE_FOR_POWER ((sched_mc_power_savings || sched_smt_power_savings) \ + ? 
SD_POWERSAVINGS_BALANCE : 0) + struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -638,7 +649,7 @@ struct sched_domain { #endif }; -extern void partition_sched_domains(cpumask_t *partition1, +extern int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2); /* @@ -713,10 +724,13 @@ struct task_struct { int lock_depth; /* BKL lock depth */ -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#ifdef CONFIG_SMP +#ifdef __ARCH_WANT_UNLOCKED_CTXSW int oncpu; #endif - int prio, static_prio; +#endif + int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; struct list_head run_list; prio_array_t *array; @@ -843,6 +857,20 @@ struct task_struct { /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ spinlock_t alloc_lock; + /* Protection of the PI data structures: */ + spinlock_t pi_lock; + +#ifdef CONFIG_RT_MUTEXES + /* PI waiters blocked on a rt_mutex held by this task */ + struct plist_head pi_waiters; + /* Deadlock detection and priority inheritance handling */ + struct rt_mutex_waiter *pi_blocked_on; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spinlock_t held_list_lock; + struct list_head held_list_head; +# endif +#endif + #ifdef CONFIG_DEBUG_MUTEXES /* mutex deadlock detection */ struct mutex_waiter *blocked_on; @@ -888,6 +916,8 @@ struct task_struct { #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; @@ -955,6 +985,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ +#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ /* * Only the _current_ task can read/write to tsk->flags, but other @@ -1009,6 +1040,19 @@ static inline void idle_task_exit(void) {} #endif extern void sched_idle_next(void); + +#ifdef CONFIG_RT_MUTEXES +extern int rt_mutex_getprio(task_t *p); +extern void rt_mutex_setprio(task_t *p, int prio); +extern void rt_mutex_adjust_pi(task_t *p); +#else +static inline int rt_mutex_getprio(task_t *p) +{ + return p->normal_prio; +} +# define rt_mutex_adjust_pi(p) do { } while (0) +#endif + extern void set_user_nice(task_t *p, long nice); extern int task_prio(const task_t *p); extern int task_nice(const task_t *p); @@ -1408,6 +1452,11 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); +#include <linux/sysdev.h> +extern int sched_mc_power_savings, sched_smt_power_savings; +extern struct sysdev_attribute attr_sched_mc_power_savings, attr_sched_smt_power_savings; +extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls); + extern void normalize_rt_tasks(void); #ifdef CONFIG_PM diff --git a/include/linux/scx200.h b/include/linux/scx200.h index a22f9e173ad..693c0557e70 100644 --- a/include/linux/scx200.h +++ b/include/linux/scx200.h @@ -49,10 +49,3 @@ extern unsigned scx200_cb_base; #define SCx200_REV 0x3d /* Revision Register */ #define SCx200_CBA 0x3e /* Configuration Base Address Register */ #define SCx200_CBA_SCRATCH 0x64 /* Configuration Base Address Scratchpad */ - -/* - Local variables: - 
compile-command: "make -C ../.. bzImage modules" - c-basic-offset: 8 - End: -*/ diff --git a/include/linux/scx200_gpio.h b/include/linux/scx200_gpio.h index 30cdd648ba7..90dd069cc14 100644 --- a/include/linux/scx200_gpio.h +++ b/include/linux/scx200_gpio.h @@ -1,6 +1,6 @@ #include <linux/spinlock.h> -u32 scx200_gpio_configure(int index, u32 set, u32 clear); +u32 scx200_gpio_configure(unsigned index, u32 set, u32 clear); extern unsigned scx200_gpio_base; extern long scx200_gpio_shadow[2]; @@ -17,7 +17,7 @@ extern long scx200_gpio_shadow[2]; /* returns the value of the GPIO pin */ -static inline int scx200_gpio_get(int index) { +static inline int scx200_gpio_get(unsigned index) { __SCx200_GPIO_BANK; __SCx200_GPIO_IOADDR + 0x04; __SCx200_GPIO_INDEX; @@ -29,7 +29,7 @@ static inline int scx200_gpio_get(int index) { driven if the GPIO is configured as an output, it might not be the state of the GPIO right now if the GPIO is configured as an input) */ -static inline int scx200_gpio_current(int index) { +static inline int scx200_gpio_current(unsigned index) { __SCx200_GPIO_BANK; __SCx200_GPIO_INDEX; @@ -38,7 +38,7 @@ static inline int scx200_gpio_current(int index) { /* drive the GPIO signal high */ -static inline void scx200_gpio_set_high(int index) { +static inline void scx200_gpio_set_high(unsigned index) { __SCx200_GPIO_BANK; __SCx200_GPIO_IOADDR; __SCx200_GPIO_SHADOW; @@ -49,7 +49,7 @@ static inline void scx200_gpio_set_high(int index) { /* drive the GPIO signal low */ -static inline void scx200_gpio_set_low(int index) { +static inline void scx200_gpio_set_low(unsigned index) { __SCx200_GPIO_BANK; __SCx200_GPIO_IOADDR; __SCx200_GPIO_SHADOW; @@ -60,7 +60,7 @@ static inline void scx200_gpio_set_low(int index) { /* drive the GPIO signal to state */ -static inline void scx200_gpio_set(int index, int state) { +static inline void scx200_gpio_set(unsigned index, int state) { __SCx200_GPIO_BANK; __SCx200_GPIO_IOADDR; __SCx200_GPIO_SHADOW; @@ -73,7 +73,7 @@ static inline void scx200_gpio_set(int index, int state) { } /* toggle the GPIO signal */ -static inline void scx200_gpio_change(int index) { +static inline void scx200_gpio_change(unsigned index) { __SCx200_GPIO_BANK; __SCx200_GPIO_IOADDR; __SCx200_GPIO_SHADOW; @@ -87,10 +87,3 @@ static inline void scx200_gpio_change(int index) { #undef __SCx200_GPIO_SHADOW #undef __SCx200_GPIO_INDEX #undef __SCx200_GPIO_OUT - -/* - Local variables: - compile-command: "make -C ../.. 
bzImage modules" - c-basic-offset: 8 - End: -*/ diff --git a/include/linux/swap.h b/include/linux/swap.h index dc3f3aa0c83..c41e2d6d1ac 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -199,6 +199,8 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif +extern int kswapd_run(int nid); + #ifdef CONFIG_MMU /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 33785b79d54..008f04c5673 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -174,9 +174,9 @@ asmlinkage long sys_waitid(int which, pid_t pid, int options, struct rusage __user *ru); asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options); asmlinkage long sys_set_tid_address(int __user *tidptr); -asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, +asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec __user *utime, u32 __user *uaddr2, - int val3); + u32 val3); asmlinkage long sys_init_module(void __user *umod, unsigned long len, const char __user *uargs); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 349ef908a22..46e4d8f2771 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -149,6 +149,7 @@ enum KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ KERN_COMPAT_LOG=73, /* int: print compat layer messages */ + KERN_MAX_LOCK_DEPTH=74, }; @@ -189,6 +190,7 @@ enum VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ + VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ }; diff --git a/include/linux/topology.h b/include/linux/topology.h index a305ae2e44b..ec1eca85290 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -134,7 +134,8 @@ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE, \ + | SD_WAKE_AFFINE \ + | BALANCE_FOR_POWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/init/Kconfig b/init/Kconfig index df55b366560..f70f2fd273c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -339,9 +339,14 @@ config BASE_FULL kernel data structures. This saves memory on small machines, but may reduce performance. +config RT_MUTEXES + boolean + select PLIST + config FUTEX bool "Enable futex support" if EMBEDDED default y + select RT_MUTEXES help Disabling this option will cause the kernel to be built without support for "fast userspace mutexes". 
The resulting kernel may not diff --git a/init/main.c b/init/main.c index 80af1a52485..0d57f6ccb63 100644 --- a/init/main.c +++ b/init/main.c @@ -48,6 +48,7 @@ #include <linux/mempolicy.h> #include <linux/key.h> #include <linux/unwind.h> +#include <linux/buffer_head.h> #include <asm/io.h> #include <asm/bugs.h> @@ -80,7 +81,6 @@ extern void mca_init(void); extern void sbus_init(void); extern void sysctl_init(void); extern void signals_init(void); -extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); extern void prio_tree_init(void); diff --git a/kernel/Makefile b/kernel/Makefile index 752bd7d383a..82fb182f6f6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,6 +16,9 @@ obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o diff --git a/kernel/acct.c b/kernel/acct.c index 368c4f03fe0..126ca43d5d2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -521,6 +521,7 @@ static void do_acct_process(struct file *file) /** * acct_init_pacct - initialize a new pacct_struct + * @pacct: per-process accounting info struct to initialize */ void acct_init_pacct(struct pacct_struct *pacct) { @@ -576,7 +577,7 @@ void acct_collect(long exitcode, int group_dead) * * handles process accounting for an exiting task */ -void acct_process() +void acct_process(void) { struct file *file = NULL; diff --git a/kernel/audit.c b/kernel/audit.c index 7dfac7031bd..82443fb433e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -818,7 +818,7 @@ err: */ unsigned int audit_serial(void) { - static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(serial_lock); static unsigned int serial = 0; unsigned long flags; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9ebd96fda29..dc5e3f01efe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -658,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab) return; error_path: - if (ctx) - kfree(ctx); + kfree(ctx); audit_panic("error in audit_log_task_context"); return; } @@ -1367,7 +1366,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority - * @abs_timeout: Message timeout in absolute time + * @u_abs_timeout: Message timeout in absolute time * * Returns 0 for success or NULL context or < 0 on error. */ @@ -1409,8 +1408,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive * @mqdes: MQ descriptor * @msg_len: Message length - * @msg_prio: Message priority - * @abs_timeout: Message timeout in absolute time + * @u_msg_prio: Message priority + * @u_abs_timeout: Message timeout in absolute time * * Returns 0 for success or NULL context or < 0 on error. */ @@ -1558,7 +1557,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) - * @ipcp: in-kernel IPC permissions * * Returns 0 for success or NULL context or < 0 on error. */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 03dcd981846..70fbf2e8376 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -18,7 +18,7 @@ /* This protects CPUs going up and down... 
*/ static DEFINE_MUTEX(cpucontrol); -static BLOCKING_NOTIFIER_HEAD(cpu_chain); +static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU static struct task_struct *lock_cpu_hotplug_owner; @@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ -int register_cpu_notifier(struct notifier_block *nb) +int __cpuinit register_cpu_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&cpu_chain, nb); } + +#ifdef CONFIG_HOTPLUG_CPU + EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) @@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); -#ifdef CONFIG_HOTPLUG_CPU static inline void check_for_tasks(int cpu) { struct task_struct *p; diff --git a/kernel/exit.c b/kernel/exit.c index 304ef637be6..ab06b9f88f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -926,9 +926,18 @@ fastcall NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif /* + * This must happen late, after the PID is not + * hashed anymore: + */ + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + if (unlikely(current->pi_state_cache)) + kfree(current->pi_state_cache); + /* * If DEBUG_MUTEXES is on, make sure we are holding no locks: */ mutex_debug_check_no_locks_held(tsk); + rt_mutex_debug_check_no_locks_held(tsk); if (tsk->io_context) exit_io_context(); diff --git a/kernel/fork.c b/kernel/fork.c index 9b4e54ef022..628198a4f28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep; void free_task(struct task_struct *tsk) { free_thread_info(tsk->thread_info); + rt_mutex_debug_task_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) return current->pid; } +static inline void rt_mutex_init_task(struct task_struct *p) +{ +#ifdef CONFIG_RT_MUTEXES + spin_lock_init(&p->pi_lock); + plist_head_init(&p->pi_waiters, &p->pi_lock); + p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spin_lock_init(&p->held_list_lock); + INIT_LIST_HEAD(&p->held_list_head); +# endif +#endif +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags, mpol_fix_fork_child_flag(p); #endif + rt_mutex_init_task(p); + #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif @@ -1076,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags, #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; + /* * sigaltstack should be cleared when sharing the same VM */ diff --git a/kernel/futex.c b/kernel/futex.c index e1a380c77a5..6c91f938005 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -12,6 +12,10 @@ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. 
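The per-task pi_waiters list initialized in rt_mutex_init_task() above is what the new rt_mutex_getprio()/rt_mutex_setprio() hooks in sched.h consult. The boosting rule amounts to the sketch below; this is an approximation for orientation, not the code in kernel/rtmutex.c.

    #include <linux/kernel.h>
    #include <linux/plist.h>
    #include <linux/sched.h>

    /*
     * Effective priority of @p, assuming CONFIG_RT_MUTEXES: the smaller
     * (i.e. higher) of its own normal priority and the priority of the
     * top waiter blocked on any rt_mutex it currently owns.
     */
    static int effective_prio_sketch(struct task_struct *p)
    {
        if (plist_head_empty(&p->pi_waiters))
            return p->normal_prio;

        return min(p->normal_prio, plist_first(&p->pi_waiters)->prio);
    }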
@@ -46,6 +50,8 @@ #include <linux/signal.h> #include <asm/futex.h> +#include "rtmutex_common.h" + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -63,7 +69,7 @@ union futex_key { int offset; } shared; struct { - unsigned long uaddr; + unsigned long address; struct mm_struct *mm; int offset; } private; @@ -75,6 +81,27 @@ union futex_key { }; /* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex pi_mutex; + + struct task_struct *owner; + atomic_t refcount; + + union futex_key key; +}; + +/* * We use this hashed waitqueue instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared). * @@ -87,15 +114,19 @@ struct futex_q { struct list_head list; wait_queue_head_t waiters; - /* Which hash list lock to use. */ + /* Which hash list lock to use: */ spinlock_t *lock_ptr; - /* Key which the futex is hashed on. */ + /* Key which the futex is hashed on: */ union futex_key key; - /* For fd, sigio sent using these. */ + /* For fd, sigio sent using these: */ int fd; struct file *filp; + + /* Optional priority inheritance state: */ + struct futex_pi_state *pi_state; + struct task_struct *task; }; /* @@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) * * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. */ -static int get_futex_key(unsigned long uaddr, union futex_key *key) +static int get_futex_key(u32 __user *uaddr, union futex_key *key) { + unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct page *page; @@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * The futex address must be "naturally" aligned. */ - key->both.offset = uaddr % PAGE_SIZE; + key->both.offset = address % PAGE_SIZE; if (unlikely((key->both.offset % sizeof(u32)) != 0)) return -EINVAL; - uaddr -= key->both.offset; + address -= key->both.offset; /* * The futex is hashed differently depending on whether * it's in a shared or private mapping. So check vma first. */ - vma = find_extend_vma(mm, uaddr); + vma = find_extend_vma(mm, address); if (unlikely(!vma)) return -EFAULT; @@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) */ if (likely(!(vma->vm_flags & VM_MAYSHARE))) { key->private.mm = mm; - key->private.uaddr = uaddr; + key->private.address = address; return 0; } @@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) key->shared.inode = vma->vm_file->f_dentry->d_inode; key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) + key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff); return 0; } @@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) * from swap. But that's a lot of code to duplicate here * for a rare case, so we simply fetch the page. 
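 *
 * (Summary of the resulting key forms: a futex in a private mapping is
 * keyed on { current->mm, virtual address }, one in a shared mapping
 * on { inode, page offset }, with bit 0 of the offset set so the two
 * namespaces cannot collide; both variants also record the offset of
 * the u32 within its page.)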
*/ - err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); + err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); if (err >= 0) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key) } } -static inline int get_futex_value_locked(int *dest, int __user *from) +static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { int ret; inc_preempt_count(); - ret = __copy_from_user_inatomic(dest, from, sizeof(int)); + ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); dec_preempt_count(); return ret ? -EFAULT : 0; } /* + * Fault handling. Called with current->mm->mmap_sem held. + */ +static int futex_handle_fault(unsigned long address, int attempt) +{ + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + if (attempt >= 2 || !(vma = find_vma(mm, address)) || + vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) + return -EFAULT; + + switch (handle_mm_fault(mm, vma, address, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + return -EFAULT; + } + return 0; +} + +/* + * PI code: + */ +static int refill_pi_state_cache(void) +{ + struct futex_pi_state *pi_state; + + if (likely(current->pi_state_cache)) + return 0; + + pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); + + if (!pi_state) + return -ENOMEM; + + memset(pi_state, 0, sizeof(*pi_state)); + INIT_LIST_HEAD(&pi_state->list); + /* pi_mutex gets initialized later */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + + current->pi_state_cache = pi_state; + + return 0; +} + +static struct futex_pi_state * alloc_pi_state(void) +{ + struct futex_pi_state *pi_state = current->pi_state_cache; + + WARN_ON(!pi_state); + current->pi_state_cache = NULL; + + return pi_state; +} + +static void free_pi_state(struct futex_pi_state *pi_state) +{ + if (!atomic_dec_and_test(&pi_state->refcount)) + return; + + /* + * If pi_state->owner is NULL, the owner is most probably dying + * and has cleaned up the pi_state already + */ + if (pi_state->owner) { + spin_lock_irq(&pi_state->owner->pi_lock); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + } + + if (current->pi_state_cache) + kfree(pi_state); + else { + /* + * pi_state->list is already empty. + * clear pi_state->owner. + * refcount is at 0 - put it back to 1. + */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + current->pi_state_cache = pi_state; + } +} + +/* + * Look up the task based on what TID userspace gave us. + * We dont trust it. + */ +static struct task_struct * futex_find_get_task(pid_t pid) +{ + struct task_struct *p; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid)) { + p = NULL; + goto out_unlock; + } + if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { + p = NULL; + goto out_unlock; + } + get_task_struct(p); +out_unlock: + read_unlock(&tasklist_lock); + + return p; +} + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) 
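 *
 * (Note the lock ordering below: the hash-bucket lock nests outside
 * the per-task pi_lock, so pi_lock is dropped before hb->lock is
 * taken and re-acquired afterwards, which is why the list head is
 * re-checked before the entry is actually removed.)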
+ */ +void exit_pi_state_list(struct task_struct *curr) +{ + struct futex_hash_bucket *hb; + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; + union futex_key key; + + /* + * We are a ZOMBIE and nobody can enqueue itself on + * pi_state_list anymore, but we have to be careful + * versus waiters unqueueing themselfs + */ + spin_lock_irq(&curr->pi_lock); + while (!list_empty(head)) { + + next = head->next; + pi_state = list_entry(next, struct futex_pi_state, list); + key = pi_state->key; + spin_unlock_irq(&curr->pi_lock); + + hb = hash_futex(&key); + spin_lock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + if (head->next != next) { + spin_unlock(&hb->lock); + continue; + } + + list_del_init(&pi_state->list); + + WARN_ON(pi_state->owner != curr); + + pi_state->owner = NULL; + spin_unlock_irq(&curr->pi_lock); + + rt_mutex_unlock(&pi_state->pi_mutex); + + spin_unlock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + } + spin_unlock_irq(&curr->pi_lock); +} + +static int +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +{ + struct futex_pi_state *pi_state = NULL; + struct futex_q *this, *next; + struct list_head *head; + struct task_struct *p; + pid_t pid; + + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &me->key)) { + /* + * Another waiter already exists - bump up + * the refcount and return its pi_state: + */ + pi_state = this->pi_state; + atomic_inc(&pi_state->refcount); + me->pi_state = pi_state; + + return 0; + } + } + + /* + * We are the first waiter - try to look up the real owner and + * attach the new pi_state to it: + */ + pid = uval & FUTEX_TID_MASK; + p = futex_find_get_task(pid); + if (!p) + return -ESRCH; + + pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make 'p' + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = me->key; + + spin_lock_irq(&p->pi_lock); + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + me->pi_state = pi_state; + + return 0; +} + +/* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ @@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q) q->lock_ptr = NULL; } +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +{ + struct task_struct *new_owner; + struct futex_pi_state *pi_state = this->pi_state; + u32 curval, newval; + + if (!pi_state) + return -EINVAL; + + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + + /* + * This happens when we have stolen the lock and the original + * pending owner did not enqueue itself back on the rt_mutex. + * Thats not a tragedy. We know that way, that a lock waiter + * is on the fly. We make the futex_q waiter the pending owner. + */ + if (!new_owner) + new_owner = this->task; + + /* + * We pass it to the next owner. (The WAITERS bit is always + * kept enabled while there is PI state around. We must also + * preserve the owner died bit.) 
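 *
 * For orientation, the 32-bit futex word manipulated here is laid out
 * as defined in include/linux/futex.h (values quoted from memory and
 * therefore only illustrative):
 *
 *   FUTEX_WAITERS    0x80000000  - the kernel has waiters queued
 *   FUTEX_OWNER_DIED 0x40000000  - a previous owner died holding it
 *   FUTEX_TID_MASK   0x3fffffff  - TID of the current owner
 *
 * Handing the lock over thus preserves OWNER_DIED, keeps WAITERS set
 * and replaces only the TID bits, as the assignment below shows.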
+ */ + newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + + list_del_init(&pi_state->owner->pi_state_list); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + rt_mutex_unlock(&pi_state->pi_mutex); + + return 0; +} + +static int unlock_futex_pi(u32 __user *uaddr, u32 uval) +{ + u32 oldval; + + /* + * There is no waiter, so we unlock the futex. The owner died + * bit has not to be preserved here. We are the owner: + */ + inc_preempt_count(); + oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); + dec_preempt_count(); + + if (oldval == -EFAULT) + return oldval; + if (oldval != uval) + return -EAGAIN; + + return 0; +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(unsigned long uaddr, int nr_wake) +static int futex_wake(u32 __user *uaddr, int nr_wake) { - union futex_key key; - struct futex_hash_bucket *bh; - struct list_head *head; + struct futex_hash_bucket *hb; struct futex_q *this, *next; + struct list_head *head; + union futex_key key; int ret; down_read(¤t->mm->mmap_sem); @@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake) if (unlikely(ret != 0)) goto out; - bh = hash_futex(&key); - spin_lock(&bh->lock); - head = &bh->chain; + hb = hash_futex(&key); + spin_lock(&hb->lock); + head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { + if (this->pi_state) + return -EINVAL; wake_futex(this); if (++ret >= nr_wake) break; } } - spin_unlock(&bh->lock); + spin_unlock(&hb->lock); out: up_read(¤t->mm->mmap_sem); return ret; @@ -324,10 +648,12 @@ out: * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) +static int +futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, + int nr_wake, int nr_wake2, int op) { union futex_key key1, key2; - struct futex_hash_bucket *bh1, *bh2; + struct futex_hash_bucket *hb1, *hb2; struct list_head *head; struct futex_q *this, *next; int ret, op_ret, attempt = 0; @@ -342,27 +668,29 @@ retryfull: if (unlikely(ret != 0)) goto out; - bh1 = hash_futex(&key1); - bh2 = hash_futex(&key2); + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); retry: - if (bh1 < bh2) - spin_lock(&bh1->lock); - spin_lock(&bh2->lock); - if (bh1 > bh2) - spin_lock(&bh1->lock); + if (hb1 < hb2) + spin_lock(&hb1->lock); + spin_lock(&hb2->lock); + if (hb1 > hb2) + spin_lock(&hb1->lock); - op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); + op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - int dummy; + u32 dummy; - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); #ifndef CONFIG_MMU - /* we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking */ + /* + * we don't get EFAULT from MMU faults if we don't have an MMU, + * but we might get them from range checking + */ ret = op_ret; goto out; #endif @@ -372,47 +700,34 @@ retry: goto out; } - /* futex_atomic_op_inuser needs to both read and write + /* + * futex_atomic_op_inuser needs to both read and write * *(int __user *)uaddr2, but 
we can't modify it * non-atomically. Therefore, if get_user below is not * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. */ + * still holding the mmap_sem. + */ if (attempt++) { - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - - ret = -EFAULT; - if (attempt >= 2 || - !(vma = find_vma(mm, uaddr2)) || - vma->vm_start > uaddr2 || - !(vma->vm_flags & VM_WRITE)) - goto out; - - switch (handle_mm_fault(mm, vma, uaddr2, 1)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - default: + if (futex_handle_fault((unsigned long)uaddr2, + attempt)) goto out; - } goto retry; } - /* If we would have faulted, release mmap_sem, - * fault it in and start all over again. */ + /* + * If we would have faulted, release mmap_sem, + * fault it in and start all over again. + */ up_read(¤t->mm->mmap_sem); - ret = get_user(dummy, (int __user *)uaddr2); + ret = get_user(dummy, uaddr2); if (ret) return ret; goto retryfull; } - head = &bh1->chain; + head = &hb1->chain; list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { @@ -423,7 +738,7 @@ retry: } if (op_ret > 0) { - head = &bh2->chain; + head = &hb2->chain; op_ret = 0; list_for_each_entry_safe(this, next, head, list) { @@ -436,9 +751,9 @@ retry: ret += op_ret; } - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); out: up_read(¤t->mm->mmap_sem); return ret; @@ -448,11 +763,11 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, - int nr_wake, int nr_requeue, int *valp) +static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, + int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1, key2; - struct futex_hash_bucket *bh1, *bh2; + struct futex_hash_bucket *hb1, *hb2; struct list_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; @@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, if (unlikely(ret != 0)) goto out; - bh1 = hash_futex(&key1); - bh2 = hash_futex(&key2); + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); - if (bh1 < bh2) - spin_lock(&bh1->lock); - spin_lock(&bh2->lock); - if (bh1 > bh2) - spin_lock(&bh1->lock); + if (hb1 < hb2) + spin_lock(&hb1->lock); + spin_lock(&hb2->lock); + if (hb1 > hb2) + spin_lock(&hb1->lock); - if (likely(valp != NULL)) { - int curval; + if (likely(cmpval != NULL)) { + u32 curval; - ret = get_futex_value_locked(&curval, (int __user *)uaddr1); + ret = get_futex_value_locked(&curval, uaddr1); if (unlikely(ret)) { - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); - /* If we would have faulted, release mmap_sem, fault + /* + * If we would have faulted, release mmap_sem, fault * it in and start all over again. 
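 *
 * (The get_user() below is issued mainly for its side effect: with
 * mmap_sem dropped it may legitimately fault and page the futex word
 * in; the value read is discarded and the whole operation restarts
 * from the top, re-reading it under the proper locks.)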
*/ up_read(¤t->mm->mmap_sem); - ret = get_user(curval, (int __user *)uaddr1); + ret = get_user(curval, uaddr1); if (!ret) goto retry; return ret; } - if (curval != *valp) { + if (curval != *cmpval) { ret = -EAGAIN; goto out_unlock; } } - head1 = &bh1->chain; + head1 = &hb1->chain; list_for_each_entry_safe(this, next, head1, list) { if (!match_futex (&this->key, &key1)) continue; if (++ret <= nr_wake) { wake_futex(this); } else { - list_move_tail(&this->list, &bh2->chain); - this->lock_ptr = &bh2->lock; + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(head1 != &hb2->chain)) { + list_move_tail(&this->list, &hb2->chain); + this->lock_ptr = &hb2->lock; + } this->key = key2; get_key_refs(&key2); drop_count++; if (ret - nr_wake >= nr_requeue) break; - /* Make sure to stop if key1 == key2 */ - if (head1 == &bh2->chain && head1 != &next->list) - head1 = &this->list; } } out_unlock: - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); /* drop_key_refs() must be called outside the spinlocks. */ while (--drop_count >= 0) @@ -543,7 +862,7 @@ out: static inline struct futex_hash_bucket * queue_lock(struct futex_q *q, int fd, struct file *filp) { - struct futex_hash_bucket *bh; + struct futex_hash_bucket *hb; q->fd = fd; q->filp = filp; @@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) init_waitqueue_head(&q->waiters); get_key_refs(&q->key); - bh = hash_futex(&q->key); - q->lock_ptr = &bh->lock; + hb = hash_futex(&q->key); + q->lock_ptr = &hb->lock; - spin_lock(&bh->lock); - return bh; + spin_lock(&hb->lock); + return hb; } -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { - list_add_tail(&q->list, &bh->chain); - spin_unlock(&bh->lock); + list_add_tail(&q->list, &hb->chain); + q->task = current; + spin_unlock(&hb->lock); } static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) +queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { - spin_unlock(&bh->lock); + spin_unlock(&hb->lock); drop_key_refs(&q->key); } @@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) /* The key must be already stored in q->key. */ static void queue_me(struct futex_q *q, int fd, struct file *filp) { - struct futex_hash_bucket *bh; - bh = queue_lock(q, fd, filp); - __queue_me(q, bh); + struct futex_hash_bucket *hb; + + hb = queue_lock(q, fd, filp); + __queue_me(q, hb); } /* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { - int ret = 0; spinlock_t *lock_ptr; + int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ retry: @@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q) } WARN_ON(list_empty(&q->list)); list_del(&q->list); + + BUG_ON(q->pi_state); + spin_unlock(lock_ptr); ret = 1; } @@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q) return ret; } -static int futex_wait(unsigned long uaddr, int val, unsigned long time) +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock is held on entry and dropped here. 
+ */ +static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) { - DECLARE_WAITQUEUE(wait, current); - int ret, curval; + WARN_ON(list_empty(&q->list)); + list_del(&q->list); + + BUG_ON(!q->pi_state); + free_pi_state(q->pi_state); + q->pi_state = NULL; + + spin_unlock(&hb->lock); + + drop_key_refs(&q->key); +} + +static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) +{ + struct task_struct *curr = current; + DECLARE_WAITQUEUE(wait, curr); + struct futex_hash_bucket *hb; struct futex_q q; - struct futex_hash_bucket *bh; + u32 uval; + int ret; + q.pi_state = NULL; retry: - down_read(¤t->mm->mmap_sem); + down_read(&curr->mm->mmap_sem); ret = get_futex_key(uaddr, &q.key); if (unlikely(ret != 0)) goto out_release_sem; - bh = queue_lock(&q, -1, NULL); + hb = queue_lock(&q, -1, NULL); /* * Access the page AFTER the futex is queued. @@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) * We hold the mmap semaphore, so the mapping cannot have changed * since we looked it up in get_futex_key. */ - - ret = get_futex_value_locked(&curval, (int __user *)uaddr); + ret = get_futex_value_locked(&uval, uaddr); if (unlikely(ret)) { - queue_unlock(&q, bh); + queue_unlock(&q, hb); - /* If we would have faulted, release mmap_sem, fault it in and + /* + * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - up_read(¤t->mm->mmap_sem); + up_read(&curr->mm->mmap_sem); - ret = get_user(curval, (int __user *)uaddr); + ret = get_user(uval, uaddr); if (!ret) goto retry; return ret; } - if (curval != val) { - ret = -EWOULDBLOCK; - queue_unlock(&q, bh); - goto out_release_sem; - } + ret = -EWOULDBLOCK; + if (uval != val) + goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ - __queue_me(&q, bh); + __queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. - */ - up_read(¤t->mm->mmap_sem); + */ + up_read(&curr->mm->mmap_sem); /* * There might have been scheduling since the queue_me(), as we @@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) return 0; if (time == 0) return -ETIMEDOUT; - /* We expect signal_pending(current), but another thread may - * have handled it for us already. */ + /* + * We expect signal_pending(current), but another thread may + * have handled it for us already. + */ return -EINTR; + out_unlock_release_sem: + queue_unlock(&q, hb); + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; +} + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block, it does PI, etc. (Due to + * races the kernel might see a 0 value of the futex too.) + */ +static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, + struct hrtimer_sleeper *to) +{ + struct task_struct *curr = current; + struct futex_hash_bucket *hb; + u32 uval, newval, curval; + struct futex_q q; + int ret, attempt = 0; + + if (refill_pi_state_cache()) + return -ENOMEM; + + q.pi_state = NULL; + retry: + down_read(&curr->mm->mmap_sem); + + ret = get_futex_key(uaddr, &q.key); + if (unlikely(ret != 0)) + goto out_release_sem; + + hb = queue_lock(&q, -1, NULL); + + retry_locked: + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. 
+ */ + newval = current->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + + /* We own the lock already */ + if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { + if (!detect && 0) + force_sig(SIGKILL, current); + ret = -EDEADLK; + goto out_unlock_release_sem; + } + + /* + * Surprise - we got the lock. Just return + * to userspace: + */ + if (unlikely(!curval)) + goto out_unlock_release_sem; + + uval = curval; + newval = uval | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, &q); + + if (unlikely(ret)) { + /* + * There were no waiters and the owner task lookup + * failed. When the OWNER_DIED bit is set, then we + * know that this is a robust futex and we actually + * take the lock. This is safe as we are protected by + * the hash bucket lock. We also set the waiters bit + * unconditionally here, to simplify glibc handling of + * multiple tasks racing to acquire the lock and + * cleanup the problems which were left by the dead + * owner. + */ + if (curval & FUTEX_OWNER_DIED) { + uval = newval; + newval = current->pid | + FUTEX_OWNER_DIED | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + ret = 0; + } + goto out_unlock_release_sem; + } + + /* + * Only actually queue now that the atomic ops are done: + */ + __queue_me(&q, hb); + + /* + * Now the futex is queued and we have checked the data, we + * don't want to hold mmap_sem while we sleep. + */ + up_read(&curr->mm->mmap_sem); + + WARN_ON(!q.pi_state); + /* + * Block on the PI mutex: + */ + if (!trylock) + ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); + else { + ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + } + + down_read(&curr->mm->mmap_sem); + hb = queue_lock(&q, -1, NULL); + + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (!ret && q.pi_state->owner != curr) { + u32 newtid = current->pid | FUTEX_WAITERS; + + /* Owner died? */ + if (q.pi_state->owner != NULL) { + spin_lock_irq(&q.pi_state->owner->pi_lock); + list_del_init(&q.pi_state->list); + spin_unlock_irq(&q.pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + q.pi_state->owner = current; + + spin_lock_irq(¤t->pi_lock); + list_add(&q.pi_state->list, ¤t->pi_state_list); + spin_unlock_irq(¤t->pi_lock); + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + /* + * We own it, so we have to replace the pending owner + * TID. This must be atomic as we have preserve the + * owner died bit here. 
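 *
 * (The loop below just retries the cmpxchg with the freshly read
 * value until the new TID has been installed or the user access
 * faults.)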
+ */ + ret = get_user(uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + } + + if (!detect && ret == -EDEADLK && 0) + force_sig(SIGKILL, current); + + return ret; + + out_unlock_release_sem: + queue_unlock(&q, hb); + + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; + + uaddr_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. + */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) + goto out_unlock_release_sem; + + goto retry_locked; + } + + queue_unlock(&q, hb); + up_read(&curr->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + + return ret; +} + +/* + * Restart handler + */ +static long futex_lock_pi_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper timeout, *to = NULL; + int ret; + + restart->fn = do_no_restart_syscall; + + if (restart->arg2 || restart->arg3) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires.tv64 = ((u64)restart->arg1 << 32) | + (u64) restart->arg0; + } + + pr_debug("lock_pi restart: %p, %d (%d)\n", + (u32 __user *)restart->arg0, current->pid); + + ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, + 0, to); + + if (ret != -EINTR) + return ret; + + restart->fn = futex_lock_pi_restart; + + /* The other values are filled in */ + return -ERESTART_RESTARTBLOCK; +} + +/* + * Called from the syscall entry below. + */ +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + long nsec, int trylock) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + int ret; + + if (sec != MAX_SCHEDULE_TIMEOUT) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires = ktime_set(sec, nsec); + } + + ret = do_futex_lock_pi(uaddr, detect, trylock, to); + + if (ret != -EINTR) + return ret; + + pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); + + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->arg0 = (unsigned long) uaddr; + restart->arg1 = detect; + if (to) { + restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; + restart->arg3 = to->timer.expires.tv64 >> 32; + } else + restart->arg2 = restart->arg3 = 0; + + return -ERESTART_RESTARTBLOCK; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. 
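Before the slowpath itself, a deliberately simplified sketch of the userspace protocol it backs up. This assumes the FUTEX_LOCK_PI/FUTEX_UNLOCK_PI definitions this series adds to linux/futex.h, uses a GCC atomic builtin for brevity, and omits the retry and robustness handling a real implementation (such as glibc's PI mutexes) needs.

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static uint32_t lock_word;  /* 0 = unlocked, else owner TID + flag bits */

    /* my_tid must be the caller's gettid() value */
    static void pi_lock(uint32_t my_tid)
    {
        /* fastpath: 0 -> TID, no kernel involvement at all */
        if (__sync_bool_compare_and_swap(&lock_word, 0, my_tid))
            return;
        /* contended: the kernel queues us on the rt_mutex, boosts the
         * owner and returns once we own both the rt_mutex and the word;
         * the third argument is the deadlock-detection flag here */
        syscall(SYS_futex, &lock_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }

    static void pi_unlock(uint32_t my_tid)
    {
        /* fastpath: TID -> 0, only legal while no flag bits are set */
        if (__sync_bool_compare_and_swap(&lock_word, my_tid, 0))
            return;
        /* FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set: let the kernel
         * hand the lock to the highest-priority waiter */
        syscall(SYS_futex, &lock_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }

Here my_tid is the value gettid() returns for the calling thread; the kernel relies on that TID to identify the owner in the futex word.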
+ */ +static int futex_unlock_pi(u32 __user *uaddr) +{ + struct futex_hash_bucket *hb; + struct futex_q *this, *next; + u32 uval; + struct list_head *head; + union futex_key key; + int ret, attempt = 0; + +retry: + if (get_user(uval, uaddr)) + return -EFAULT; + /* + * We release only a lock we actually own: + */ + if ((uval & FUTEX_TID_MASK) != current->pid) + return -EPERM; + /* + * First take all the futex related locks: + */ + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr, &key); + if (unlikely(ret != 0)) + goto out; + + hb = hash_futex(&key); + spin_lock(&hb->lock); + +retry_locked: + /* + * To avoid races, try to do the TID -> 0 atomic transition + * again. If it succeeds then we can return without waking + * anyone else up: + */ + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + + if (unlikely(uval == -EFAULT)) + goto pi_faulted; + /* + * Rare case: we managed to release the lock atomically, + * no need to wake anyone else up: + */ + if (unlikely(uval == current->pid)) + goto out_unlock; + + /* + * Ok, other tasks may need to be woken up - check waiters + * and do the wakeup if necessary: + */ + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (!match_futex (&this->key, &key)) + continue; + ret = wake_futex_pi(uaddr, uval, this); + /* + * The atomic access to the futex value + * generated a pagefault, so retry the + * user-access and the wakeup: + */ + if (ret == -EFAULT) + goto pi_faulted; + goto out_unlock; + } + /* + * No waiters - kernel unlocks the futex: + */ + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + +out_unlock: + spin_unlock(&hb->lock); +out: up_read(¤t->mm->mmap_sem); + + return ret; + +pi_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. + */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) + goto out_unlock; + + goto retry_locked; + } + + spin_unlock(&hb->lock); + up_read(¤t->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + return ret; } @@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp) unqueue_me(q); kfree(q); + return 0; } @@ -766,7 +1519,7 @@ static struct file_operations futex_fops = { * Signal allows caller to avoid the race which would occur if they * set the sigio stuff up afterwards. */ -static int futex_fd(unsigned long uaddr, int signal) +static int futex_fd(u32 __user *uaddr, int signal) { struct futex_q *q; struct file *filp; @@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal) err = -ENOMEM; goto error; } + q->pi_state = NULL; down_read(¤t->mm->mmap_sem); err = get_futex_key(uaddr, &q->key); @@ -840,7 +1594,7 @@ error: * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. 
Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after @@ -915,7 +1669,7 @@ err_unlock: */ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) { - u32 uval; + u32 uval, nval; retry: if (get_user(uval, uaddr)) @@ -932,12 +1686,16 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - if (futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED) != uval) + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, + uval | FUTEX_OWNER_DIED); + if (nval == -EFAULT) + return -1; + + if (nval != uval) goto retry; if (uval & FUTEX_WAITERS) - futex_wake((unsigned long)uaddr, 1); + futex_wake(uaddr, 1); } return 0; } @@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr) while (entry != &head->list) { /* * A pending lock might already be on the list, so - * dont process it twice: + * don't process it twice: */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, @@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr) } } -long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, - unsigned long uaddr2, int val2, int val3) +long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, + u32 __user *uaddr2, u32 val2, u32 val3) { int ret; @@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); break; + case FUTEX_LOCK_PI: + ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + break; + case FUTEX_UNLOCK_PI: + ret = futex_unlock_pi(uaddr); + break; + case FUTEX_TRYLOCK_PI: + ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + break; default: ret = -ENOSYS; } @@ -1031,29 +1798,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, } -asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, +asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec __user *utime, u32 __user *uaddr2, - int val3) + u32 val3) { struct timespec t; unsigned long timeout = MAX_SCHEDULE_TIMEOUT; - int val2 = 0; + u32 val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (copy_from_user(&t, utime, sizeof(t)) != 0) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } /* * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 
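As the entry point above shows, the utime argument is interpreted per operation: FUTEX_WAIT treats it as a relative timeout (converted to jiffies), while FUTEX_LOCK_PI treats it as an absolute CLOCK_REALTIME deadline passed down as seconds/nanoseconds. A hedged userspace sketch of the plain wait/wake pair follows (error handling trimmed, names illustrative):

    #include <errno.h>
    #include <linux/futex.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    /* uaddr must point to a 32-bit futex word shared by the threads. */

    /* Sleep while *uaddr still contains 'expected', for at most 100ms. */
    static void wait_while_equal(int *uaddr, int expected)
    {
        struct timespec rel = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };

        if (syscall(SYS_futex, uaddr, FUTEX_WAIT, expected,
                    &rel, NULL, 0) == -1 &&
            errno != EWOULDBLOCK && errno != EINTR && errno != ETIMEDOUT)
            perror("FUTEX_WAIT");
    }

    /* Wake at most one thread blocked on *uaddr. */
    static void wake_one(int *uaddr)
    {
        syscall(SYS_futex, uaddr, FUTEX_WAKE, 1, NULL, NULL, 0);
    }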
*/ - if (op >= FUTEX_REQUEUE) - val2 = (int) (unsigned long) utime; + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) + val2 = (u32) (unsigned long) utime; - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); + return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); } static int futexfs_get_sb(struct file_system_type *fs_type, diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 1ab6a0ea3d1..d1d92b441fb 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout = MAX_SCHEDULE_TIMEOUT; int val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (get_compat_timespec(&t, utime)) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } - if (op >= FUTEX_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (int) (unsigned long) utime; - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); + return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 55601b3ce60..8d3dc29ef41 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int hrtimer_cpu_notify(struct notifier_block *self, +static int __devinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -857,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block hrtimers_nb = { +static struct notifier_block __devinitdata hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 036b6285b15..e38e4bac97c 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include <linux/sched.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/poison.h> #include <linux/spinlock.h> #include <linux/kallsyms.h> #include <linux/interrupt.h> @@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock, void debug_mutex_init_waiter(struct mutex_waiter *waiter) { - memset(waiter, 0x11, sizeof(*waiter)); + memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; INIT_LIST_HEAD(&waiter->list); } @@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_free_waiter(struct mutex_waiter *waiter) { DEBUG_WARN_ON(!list_empty(&waiter->list)); - memset(waiter, 0x22, sizeof(*waiter)); + memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index fc311a4673a..857b4fa0912 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -38,13 +38,22 @@ config PM_DEBUG config PM_TRACE bool "Suspend/resume event tracing" - depends on PM && PM_DEBUG && X86_32 - default y + depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL + default n ---help--- This enables some cheesy code to save the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs during suspend (or more commonly, during resume). 
+ To use this debugging feature you should attempt to suspend the machine, + then reboot it, then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + + config SOFTWARE_SUSPEND bool "Software Suspend" depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) diff --git a/kernel/profile.c b/kernel/profile.c index 68afe121e50..5a730fdb1a2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -299,7 +299,7 @@ out: } #ifdef CONFIG_HOTPLUG_CPU -static int profile_cpu_callback(struct notifier_block *info, +static int __devinit profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 20e9710fc21..f464f5ae3f1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -182,6 +182,15 @@ long rcu_batches_completed(void) return rcu_ctrlblk.completed; } +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_ctrlblk.completed; +} + static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int rcu_cpu_notify(struct notifier_block *self, +static int __devinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block rcu_nb = { +static struct notifier_block __devinitdata rcu_nb = { .notifier_call = rcu_cpu_notify, }; @@ -619,6 +628,7 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); EXPORT_SYMBOL_GPL(call_rcu); EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8154e7589d1..4d1c3d24712 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update /proc-based torture test facility + * Read-Copy Update module-based torture test facility * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ +static char *torture_type = "rcu"; /* What to torture. 
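   Selects an entry in torture_ops[] below; e.g.
   "modprobe rcutorture torture_type=rcu_bh" exercises the rcu_bh flavor.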
*/ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); @@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0); MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -#define TORTURE_FLAG "rcutorture: " +module_param(torture_type, charp, 0); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); + +#define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ - do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) + do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) + do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) + do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) static char printk_buf[4096]; @@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p) spin_unlock_bh(&rcu_torture_lock); } -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. */ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else - call_rcu(p, rcu_torture_cb); -} - struct rcu_random_state { unsigned long rrs_state; unsigned long rrs_count; @@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp) } /* + * Operations vector for selecting different types of tests. + */ + +struct rcu_torture_ops { + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + int (*completed)(void); + void (*deferredfree)(struct rcu_torture *p); + int (*stats)(char *page); + char *name; +}; +static struct rcu_torture_ops *cur_ops = NULL; + +/* + * Definitions for rcu torture testing. + */ + +static int rcu_torture_read_lock(void) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_torture_read_unlock(int idx) +{ + rcu_read_unlock(); +} + +static int rcu_torture_completed(void) +{ + return rcu_batches_completed(); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. 
*/ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else + cur_ops->deferredfree(rp); +} + +static void rcu_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_ops = { + .init = NULL, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferredfree = rcu_torture_deferred_free, + .stats = NULL, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh torture testing. + */ + +static int rcu_bh_torture_read_lock(void) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_torture_read_unlock(int idx) +{ + rcu_read_unlock_bh(); +} + +static int rcu_bh_torture_completed(void) +{ + return rcu_batches_completed_bh(); +} + +static void rcu_bh_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_bh_ops = { + .init = NULL, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferredfree = rcu_bh_torture_deferred_free, + .stats = NULL, + .name = "rcu_bh" +}; + +static struct rcu_torture_ops *torture_ops[] = + { &rcu_ops, &rcu_bh_ops, NULL }; + +/* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure * after a series of grace periods (the "pipeline"). @@ -209,8 +304,6 @@ rcu_torture_writer(void *arg) do { schedule_timeout_uninterruptible(1); - if (rcu_batches_completed() == oldbatch) - continue; if ((rp = rcu_torture_alloc()) == NULL) continue; rp->rtort_pipe_count = 0; @@ -225,10 +318,10 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); + cur_ops->deferredfree(old_rp); } rcu_torture_current_version++; - oldbatch = rcu_batches_completed(); + oldbatch = cur_ops->completed(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); while (!kthread_should_stop()) @@ -246,6 +339,7 @@ static int rcu_torture_reader(void *arg) { int completed; + int idx; DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; @@ -254,12 +348,12 @@ rcu_torture_reader(void *arg) set_user_nice(current, 19); do { - rcu_read_lock(); - completed = rcu_batches_completed(); + idx = cur_ops->readlock(); + completed = cur_ops->completed(); p = rcu_dereference(rcu_torture_current); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcu_read_unlock(); + cur_ops->readunlock(idx); schedule_timeout_interruptible(HZ); continue; } @@ -273,14 +367,14 @@ rcu_torture_reader(void *arg) pipe_count = RCU_TORTURE_PIPE_LEN; } ++__get_cpu_var(rcu_torture_count)[pipe_count]; - completed = rcu_batches_completed() - completed; + completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... 
*/ completed = RCU_TORTURE_PIPE_LEN; } ++__get_cpu_var(rcu_torture_batch)[completed]; preempt_enable(); - rcu_read_unlock(); + cur_ops->readunlock(idx); schedule(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); @@ -311,7 +405,7 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - cnt += sprintf(&page[cnt], "rcutorture: "); + cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d", @@ -324,7 +418,7 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_mberror)); if (atomic_read(&n_rcu_torture_mberror) != 0) cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (i > 1) { cnt += sprintf(&page[cnt], "!!! "); atomic_inc(&n_rcu_torture_error); @@ -332,17 +426,19 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { cnt += sprintf(&page[cnt], " %d", atomic_read(&rcu_torture_wcount[i])); } cnt += sprintf(&page[cnt], "\n"); + if (cur_ops->stats != NULL) + cnt += cur_ops->stats(&page[cnt]); return cnt; } @@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg) static inline void rcu_torture_print_module_parms(char *tag) { - printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " + printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval = %d\n", - tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, - shuffle_interval); + torture_type, tag, nrealreaders, stat_interval, verbose, + test_no_idle_hz, shuffle_interval); } static void @@ -493,6 +589,9 @@ rcu_torture_cleanup(void) rcu_barrier(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms("End of test: FAILURE"); else @@ -508,6 +607,20 @@ rcu_torture_init(void) /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) { + break; + } + } + if (cur_ops == NULL) { + printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", + torture_type); + return (-EINVAL); + } + if (cur_ops->init != NULL) + cur_ops->init(); /* no "goto unwind" prior to this point!!! 
*/ + if (nreaders >= 0) nrealreaders = nreaders; else diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a..2404f9b0bc4 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -232,6 +232,44 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Finds the lowest memory reosurce exists within [res->start.res->end) + * the caller must specify res->start, res->end, res->flags. + * If found, returns 0, res is overwritten, if not found, returns -1. + */ +int find_next_system_ram(struct resource *res) +{ + resource_size_t start, end; + struct resource *p; + + BUG_ON(!res); + + start = res->start; + end = res->end; + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + /* system ram is just marked as IORESOURCE_MEM */ + if (p->flags != res->flags) + continue; + if (p->start > end) { + p = NULL; + break; + } + if (p->start >= start) + break; + } + read_unlock(&resource_lock); + if (!p) + return -1; + /* copy data */ + res->start = p->start; + res->end = p->end; + return 0; +} +#endif + /* * Find empty slot in the resource tree given range and alignment. */ diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c new file mode 100644 index 00000000000..4aa8a2c9f45 --- /dev/null +++ b/kernel/rtmutex-debug.c @@ -0,0 +1,513 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This code is based on the rt.c implementation in the preempt-rt tree. + * Portions of said code are + * + * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Copyright (C) 2006 Esben Nielsen + * Copyright (C) 2006 Kihon Technologies Inc., + * Steven Rostedt <rostedt@goodmis.org> + * + * See rt.c in preempt-rt for proper credits and further information + */ +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/kallsyms.h> +#include <linux/syscalls.h> +#include <linux/interrupt.h> +#include <linux/plist.h> +#include <linux/fs.h> + +#include "rtmutex_common.h" + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +# define TRACE_WARN_ON(x) WARN_ON(x) +# define TRACE_BUG_ON(x) BUG_ON(x) + +# define TRACE_OFF() \ +do { \ + if (rt_trace_on) { \ + rt_trace_on = 0; \ + console_verbose(); \ + if (spin_is_locked(¤t->pi_lock)) \ + spin_unlock(¤t->pi_lock); \ + if (spin_is_locked(¤t->held_list_lock)) \ + spin_unlock(¤t->held_list_lock); \ + } \ +} while (0) + +# define TRACE_OFF_NOLOCK() \ +do { \ + if (rt_trace_on) { \ + rt_trace_on = 0; \ + console_verbose(); \ + } \ +} while (0) + +# define TRACE_BUG_LOCKED() \ +do { \ + TRACE_OFF(); \ + BUG(); \ +} while (0) + +# define TRACE_WARN_ON_LOCKED(c) \ +do { \ + if (unlikely(c)) { \ + TRACE_OFF(); \ + WARN_ON(1); \ + } \ +} while (0) + +# define TRACE_BUG_ON_LOCKED(c) \ +do { \ + if (unlikely(c)) \ + TRACE_BUG_LOCKED(); \ +} while (0) + +#ifdef CONFIG_SMP +# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) +#else +# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) +#endif + +/* + * deadlock detection flag. 
We turn it off when we detect + * the first problem because we dont want to recurse back + * into the tracing code when doing error printk or + * executing a BUG(): + */ +int rt_trace_on = 1; + +void deadlock_trace_off(void) +{ + rt_trace_on = 0; +} + +static void printk_task(task_t *p) +{ + if (p) + printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk("<none>"); +} + +static void printk_task_short(task_t *p) +{ + if (p) + printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk("<none>"); +} + +static void printk_lock(struct rt_mutex *lock, int print_owner) +{ + if (lock->name) + printk(" [%p] {%s}\n", + lock, lock->name); + else + printk(" [%p] {%s:%d}\n", + lock, lock->file, lock->line); + + if (print_owner && rt_mutex_owner(lock)) { + printk(".. ->owner: %p\n", lock->owner); + printk(".. held by: "); + printk_task(rt_mutex_owner(lock)); + printk("\n"); + } + if (rt_mutex_owner(lock)) { + printk("... acquired at: "); + print_symbol("%s\n", lock->acquire_ip); + } +} + +static void printk_waiter(struct rt_mutex_waiter *w) +{ + printk("-------------------------\n"); + printk("| waiter struct %p:\n", w); + printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", + w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next, + w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next, + w->list_entry.prio); + printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", + w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next, + w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next, + w->pi_list_entry.prio); + printk("\n| lock:\n"); + printk_lock(w->lock, 1); + printk("| w->ti->task:\n"); + printk_task(w->task); + printk("| blocked at: "); + print_symbol("%s\n", w->ip); + printk("-------------------------\n"); +} + +static void show_task_locks(task_t *p) +{ + switch (p->state) { + case TASK_RUNNING: printk("R"); break; + case TASK_INTERRUPTIBLE: printk("S"); break; + case TASK_UNINTERRUPTIBLE: printk("D"); break; + case TASK_STOPPED: printk("T"); break; + case EXIT_ZOMBIE: printk("Z"); break; + case EXIT_DEAD: printk("X"); break; + default: printk("?"); break; + } + printk_task(p); + if (p->pi_blocked_on) { + struct rt_mutex *lock = p->pi_blocked_on->lock; + + printk(" blocked on:"); + printk_lock(lock, 1); + } else + printk(" (not blocked)\n"); +} + +void rt_mutex_show_held_locks(task_t *task, int verbose) +{ + struct list_head *curr, *cursor = NULL; + struct rt_mutex *lock; + task_t *t; + unsigned long flags; + int count = 0; + + if (!rt_trace_on) + return; + + if (verbose) { + printk("------------------------------\n"); + printk("| showing all locks held by: | ("); + printk_task_short(task); + printk("):\n"); + printk("------------------------------\n"); + } + +next: + spin_lock_irqsave(&task->held_list_lock, flags); + list_for_each(curr, &task->held_list_head) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list_entry); + t = rt_mutex_owner(lock); + WARN_ON(t != task); + count++; + cursor = curr->next; + spin_unlock_irqrestore(&task->held_list_lock, flags); + + printk("\n#%03d: ", count); + printk_lock(lock, 0); + goto next; + } + spin_unlock_irqrestore(&task->held_list_lock, flags); + + printk("\n"); +} + +void rt_mutex_show_all_locks(void) +{ + task_t *g, *p; + int count = 10; + int unlock = 1; + + printk("\n"); + printk("----------------------\n"); + printk("| showing all tasks: |\n"); + printk("----------------------\n"); + + /* + * Here 
we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... "); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } + if (count != 10) + printk(" locked it.\n"); + + do_each_thread(g, p) { + show_task_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + printk("\n"); + + printk("-----------------------------------------\n"); + printk("| showing all locks held in the system: |\n"); + printk("-----------------------------------------\n"); + + do_each_thread(g, p) { + rt_mutex_show_held_locks(p, 0); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} + +void rt_mutex_debug_check_no_locks_held(task_t *task) +{ + struct rt_mutex_waiter *w; + struct list_head *curr; + struct rt_mutex *lock; + + if (!rt_trace_on) + return; + if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) { + printk("BUG: PI priority boost leaked!\n"); + printk_task(task); + printk("\n"); + } + if (list_empty(&task->held_list_head)) + return; + + spin_lock(&task->pi_lock); + plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { + TRACE_OFF(); + + printk("hm, PI interest held at exit time? Task:\n"); + printk_task(task); + printk_waiter(w); + return; + } + spin_unlock(&task->pi_lock); + + list_for_each(curr, &task->held_list_head) { + lock = list_entry(curr, struct rt_mutex, held_list_entry); + + printk("BUG: %s/%d, lock held at task exit time!\n", + task->comm, task->pid); + printk_lock(lock, 1); + if (rt_mutex_owner(lock) != task) + printk("exiting task is not even the owner??\n"); + } +} + +int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) +{ + const void *to = from + len; + struct list_head *curr; + struct rt_mutex *lock; + unsigned long flags; + void *lock_addr; + + if (!rt_trace_on) + return 0; + + spin_lock_irqsave(¤t->held_list_lock, flags); + list_for_each(curr, ¤t->held_list_head) { + lock = list_entry(curr, struct rt_mutex, held_list_entry); + lock_addr = lock; + if (lock_addr < from || lock_addr >= to) + continue; + TRACE_OFF(); + + printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", + current->comm, current->pid, lock, from, to); + dump_stack(); + printk_lock(lock, 1); + if (rt_mutex_owner(lock) != current) + printk("freeing task is not even the owner??\n"); + return 1; + } + spin_unlock_irqrestore(¤t->held_list_lock, flags); + + return 0; +} + +void rt_mutex_debug_task_free(struct task_struct *task) +{ + WARN_ON(!plist_head_empty(&task->pi_waiters)); + WARN_ON(task->pi_blocked_on); +} + +/* + * We fill out the fields in the waiter to store the information about + * the deadlock. We print when we return. act_waiter can be NULL in + * case of a remove waiter operation. 
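+ *
+ * When detect is set the deadlock is reported to the caller as
+ * -EDEADLK instead, so nothing is recorded here; the stored pid/lock
+ * pair is only consumed later by debug_rt_mutex_print_deadlock().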
+ */ +void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, + struct rt_mutex *lock) +{ + struct task_struct *task; + + if (!rt_trace_on || detect || !act_waiter) + return; + + task = rt_mutex_owner(act_waiter->lock); + if (task && task != current) { + act_waiter->deadlock_task_pid = task->pid; + act_waiter->deadlock_lock = lock; + } +} + +void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) +{ + struct task_struct *task; + + if (!waiter->deadlock_lock || !rt_trace_on) + return; + + task = find_task_by_pid(waiter->deadlock_task_pid); + if (!task) + return; + + TRACE_OFF_NOLOCK(); + + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! ]\n"); + printk( "--------------------------------------------\n"); + printk("%s/%d is deadlocking current task %s/%d\n\n", + task->comm, task->pid, current->comm, current->pid); + + printk("\n1) %s/%d is trying to acquire this lock:\n", + current->comm, current->pid); + printk_lock(waiter->lock, 1); + + printk("... trying at: "); + print_symbol("%s\n", waiter->ip); + + printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); + printk_lock(waiter->deadlock_lock, 1); + + rt_mutex_show_held_locks(current, 1); + rt_mutex_show_held_locks(task, 1); + + printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); + show_stack(task, NULL); + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, current->pid); + dump_stack(); + rt_mutex_show_all_locks(); + printk("[ turning off deadlock detection." + "Please report this trace. ]\n\n"); + local_irq_disable(); +} + +void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(¤t->held_list_lock, flags); + list_add_tail(&lock->held_list_entry, ¤t->held_list_head); + spin_unlock_irqrestore(¤t->held_list_lock, flags); + + lock->acquire_ip = ip; + } +} + +void debug_rt_mutex_unlock(struct rt_mutex *lock) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(¤t->held_list_lock, flags); + list_del_init(&lock->held_list_entry); + spin_unlock_irqrestore(¤t->held_list_lock, flags); + } +} + +void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner __IP_DECL__) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&powner->held_list_lock, flags); + list_add_tail(&lock->held_list_entry, &powner->held_list_head); + spin_unlock_irqrestore(&powner->held_list_lock, flags); + + lock->acquire_ip = ip; + } +} + +void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +{ + unsigned long flags; + + if (rt_trace_on) { + struct task_struct *owner = rt_mutex_owner(lock); + + TRACE_WARN_ON_LOCKED(!owner); + TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&owner->held_list_lock, flags); + list_del_init(&lock->held_list_entry); + spin_unlock_irqrestore(&owner->held_list_lock, flags); + } +} + +void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + memset(waiter, 0x11, sizeof(*waiter)); + plist_node_init(&waiter->list_entry, MAX_PRIO); + plist_node_init(&waiter->pi_list_entry, MAX_PRIO); +} + +void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ + 
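+	/* The waiter must already be off all lists before it is poisoned: */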
TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); + TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + TRACE_WARN_ON(waiter->task); + memset(waiter, 0x22, sizeof(*waiter)); +} + +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + void *addr = lock; + + if (rt_trace_on) { + rt_mutex_debug_check_no_locks_freed(addr, + sizeof(struct rt_mutex)); + INIT_LIST_HEAD(&lock->held_list_entry); + lock->name = name; + } +} + +void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) +{ +} + +void rt_mutex_deadlock_account_unlock(struct task_struct *task) +{ +} + diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 00000000000..7612fbc62d7 --- /dev/null +++ b/kernel/rtmutex-debug.h @@ -0,0 +1,37 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains macros used solely by rtmutex.c. Debug version. + */ + +#define __IP_DECL__ , unsigned long ip +#define __IP__ , ip +#define __RET_IP__ , (unsigned long)__builtin_return_address(0) + +extern void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); +extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); +extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); +extern void debug_rt_mutex_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner __IP_DECL__); +extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, + struct rt_mutex *lock); +extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); +# define debug_rt_mutex_reset_waiter(w) \ + do { (w)->deadlock_lock = NULL; } while (0) + +static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + int detect) +{ + return (waiter != NULL); +} diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 00000000000..e82c2f84824 --- /dev/null +++ b/kernel/rtmutex-tester.c @@ -0,0 +1,440 @@ +/* + * RT-Mutex-tester: scriptable tester for rt mutexes + * + * started by Thomas Gleixner: + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + */ +#include <linux/config.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/spinlock.h> +#include <linux/sysdev.h> +#include <linux/timer.h> + +#include "rtmutex.h" + +#define MAX_RT_TEST_THREADS 8 +#define MAX_RT_TEST_MUTEXES 8 + +static spinlock_t rttest_lock; +static atomic_t rttest_event; + +struct test_thread_data { + int opcode; + int opdata; + int mutexes[MAX_RT_TEST_MUTEXES]; + int bkl; + int event; + struct sys_device sysdev; +}; + +static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; +static task_t *threads[MAX_RT_TEST_THREADS]; +static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; + +enum test_opcodes { + RTTEST_NOP = 0, + RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ + RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ + RTTEST_LOCK, 
/* 3 Lock uninterruptible, data = lockindex */ + RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ + RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ + RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ + RTTEST_LOCKBKL, /* 9 Lock BKL */ + RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ + RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ + RTTEST_RESET = 99, /* 99 Reset all pending operations */ +}; + +static int handle_op(struct test_thread_data *td, int lockwakeup) +{ + int i, id, ret = -EINVAL; + + switch(td->opcode) { + + case RTTEST_NOP: + return 0; + + case RTTEST_LOCKCONT: + td->mutexes[td->opdata] = 1; + td->event = atomic_add_return(1, &rttest_event); + return 0; + + case RTTEST_RESET: + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { + if (td->mutexes[i] == 4) { + rt_mutex_unlock(&mutexes[i]); + td->mutexes[i] = 0; + } + } + + if (!lockwakeup && td->bkl == 4) { + unlock_kernel(); + td->bkl = 0; + } + return 0; + + case RTTEST_RESETEVENT: + atomic_set(&rttest_event, 0); + return 0; + + default: + if (lockwakeup) + return ret; + } + + switch(td->opcode) { + + case RTTEST_LOCK: + case RTTEST_LOCKNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_lock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 4; + return 0; + + case RTTEST_LOCKINT: + case RTTEST_LOCKINTNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + ret = rt_mutex_lock_interruptible(&mutexes[id], 0); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = ret ? 0 : 4; + return ret ? -EINTR : 0; + + case RTTEST_UNLOCK: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) + return ret; + + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_unlock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 0; + return 0; + + case RTTEST_LOCKBKL: + if (td->bkl) + return 0; + td->bkl = 1; + lock_kernel(); + td->bkl = 4; + return 0; + + case RTTEST_UNLOCKBKL: + if (td->bkl != 4) + break; + unlock_kernel(); + td->bkl = 0; + return 0; + + default: + break; + } + return ret; +} + +/* + * Schedule replacement for rtsem_down(). Only called for threads with + * PF_MUTEX_TESTER set. + * + * This allows us to have finegrained control over the event flow. 
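+ * The flow is driven from user space through the per-thread sysfs
+ * files registered in init_test_thread() below (paths assumed from
+ * the "rttest" sysdev class), e.g.:
+ *
+ *	echo "3:0" > /sys/devices/system/rttest/rttest0/command
+ *	cat /sys/devices/system/rttest/rttest0/status
+ *
+ * where "3:0" is opcode:data, here RTTEST_LOCK on mutex 0.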
+ * + */ +void schedule_rt_mutex_test(struct rt_mutex *mutex) +{ + int tid, op, dat; + struct test_thread_data *td; + + /* We have to lookup the task */ + for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { + if (threads[tid] == current) + break; + } + + BUG_ON(tid == MAX_RT_TEST_THREADS); + + td = &thread_data[tid]; + + op = td->opcode; + dat = td->opdata; + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + break; + + if (td->mutexes[dat] != 1) + break; + + td->mutexes[dat] = 2; + td->event = atomic_add_return(1, &rttest_event); + break; + + case RTTEST_LOCKBKL: + default: + break; + } + + schedule(); + + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 3; + td->event = atomic_add_return(1, &rttest_event); + break; + + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 1; + td->event = atomic_add_return(1, &rttest_event); + return; + + case RTTEST_LOCKBKL: + return; + default: + return; + } + + td->opcode = 0; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + int ret; + + set_current_state(TASK_RUNNING); + ret = handle_op(td, 1); + set_current_state(TASK_INTERRUPTIBLE); + if (td->opcode == RTTEST_LOCKCONT) + break; + td->opcode = ret; + } + + /* Wait for the next command to be executed */ + schedule(); + } + + /* Restore previous command and data */ + td->opcode = op; + td->opdata = dat; +} + +static int test_func(void *data) +{ + struct test_thread_data *td = data; + int ret; + + current->flags |= PF_MUTEX_TESTER; + allow_signal(SIGHUP); + + for(;;) { + + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + set_current_state(TASK_RUNNING); + ret = handle_op(td, 0); + set_current_state(TASK_INTERRUPTIBLE); + td->opcode = ret; + } + + /* Wait for the next command to be executed */ + schedule(); + + if (signal_pending(current)) + flush_signals(current); + + if(kthread_should_stop()) + break; + } + return 0; +} + +/** + * sysfs_test_command - interface for test commands + * @dev: thread reference + * @buf: command for actual step + * @count: length of buffer + * + * command syntax: + * + * opcode:data + */ +static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, + size_t count) +{ + struct sched_param schedpar; + struct test_thread_data *td; + char cmdbuf[32]; + int op, dat, tid, ret; + + td = container_of(dev, struct test_thread_data, sysdev); + tid = td->sysdev.id; + + /* strings from sysfs write are not 0 terminated! 
*/ + if (count >= sizeof(cmdbuf)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + memcpy(cmdbuf, buf, count); + cmdbuf[count] = 0; + + if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) + return -EINVAL; + + switch (op) { + case RTTEST_SCHEDOT: + schedpar.sched_priority = 0; + ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); + if (ret) + return ret; + set_user_nice(current, 0); + break; + + case RTTEST_SCHEDRT: + schedpar.sched_priority = dat; + ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); + if (ret) + return ret; + break; + + case RTTEST_SIGNAL: + send_sig(SIGHUP, threads[tid], 0); + break; + + default: + if (td->opcode > 0) + return -EBUSY; + td->opdata = dat; + td->opcode = op; + wake_up_process(threads[tid]); + } + + return count; +} + +/** + * sysfs_test_status - sysfs interface for rt tester + * @dev: thread to query + * @buf: char buffer to be filled with thread status info + */ +static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) +{ + struct test_thread_data *td; + char *curr = buf; + task_t *tsk; + int i; + + td = container_of(dev, struct test_thread_data, sysdev); + tsk = threads[td->sysdev.id]; + + spin_lock(&rttest_lock); + + curr += sprintf(curr, + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", + td->opcode, td->event, tsk->state, + (MAX_RT_PRIO - 1) - tsk->prio, + (MAX_RT_PRIO - 1) - tsk->normal_prio, + tsk->pi_blocked_on, td->bkl); + + for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) + curr += sprintf(curr, "%d", td->mutexes[i]); + + spin_unlock(&rttest_lock); + + curr += sprintf(curr, ", T: %p, R: %p\n", tsk, + mutexes[td->sysdev.id].owner); + + return curr - buf; +} + +static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); +static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); + +static struct sysdev_class rttest_sysclass = { + set_kset_name("rttest"), +}; + +static int init_test_thread(int id) +{ + thread_data[id].sysdev.cls = &rttest_sysclass; + thread_data[id].sysdev.id = id; + + threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); + if (IS_ERR(threads[id])) + return PTR_ERR(threads[id]); + + return sysdev_register(&thread_data[id].sysdev); +} + +static int init_rttest(void) +{ + int ret, i; + + spin_lock_init(&rttest_lock); + + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) + rt_mutex_init(&mutexes[i]); + + ret = sysdev_class_register(&rttest_sysclass); + if (ret) + return ret; + + for (i = 0; i < MAX_RT_TEST_THREADS; i++) { + ret = init_test_thread(i); + if (ret) + break; + ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); + if (ret) + break; + ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); + if (ret) + break; + } + + printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); + + return ret; +} + +device_initcall(init_rttest); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 00000000000..45d61016da5 --- /dev/null +++ b/kernel/rtmutex.c @@ -0,0 +1,990 @@ +/* + * RT-Mutexes: simple blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner. 
+ * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen + */ +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/timer.h> + +#include "rtmutex_common.h" + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +/* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 + * are used to keep track of the "owner is pending" and "lock has + * waiters" state. + * + * owner bit1 bit0 + * NULL 0 0 lock is free (fast acquire possible) + * NULL 0 1 invalid state + * NULL 1 0 Transitional State* + * NULL 1 1 invalid state + * taskpointer 0 0 lock is held (fast release possible) + * taskpointer 0 1 task is pending owner + * taskpointer 1 0 lock is held and has waiters + * taskpointer 1 1 task is pending owner and lock has more waiters + * + * Pending ownership is assigned to the top (highest priority) + * waiter of the lock, when the lock is released. The thread is woken + * up and can now take the lock. Until the lock is taken (bit 0 + * cleared) a competing higher priority thread can steal the lock + * which puts the woken up thread back on the waiters list. + * + * The fast atomic compare exchange based acquire and release is only + * possible when bit 0 and 1 of lock->owner are 0. + * + * (*) There's a small time where the owner can be NULL and the + * "lock has waiters" bit is set. This can happen when grabbing the lock. + * To prevent a cmpxchg of the owner releasing the lock, we need to set this + * bit before looking at the lock, hence the reason this is a transitional + * state. 
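+ *
+ * For illustration, the encoding above means:
+ *
+ *	owner task:   (struct task_struct *)((unsigned long)lock->owner & ~3UL)
+ *	has waiters:  (unsigned long)lock->owner & RT_MUTEX_HAS_WAITERS
+ *	pending:      (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING
+ *
+ * which is what rt_mutex_owner() and the helpers below rely on.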
+ */ + +static void +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask) +{ + unsigned long val = (unsigned long)owner | mask; + + if (rt_mutex_has_waiters(lock)) + val |= RT_MUTEX_HAS_WAITERS; + + lock->owner = (struct task_struct *)val; +} + +static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + clear_rt_mutex_waiters(lock); +} + +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* + * Calculate task priority from the waiter list priority + * + * Return task->normal_prio when the waiter list is empty or when + * the waiter is not allowed to do priority boosting + */ +int rt_mutex_getprio(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return task->normal_prio; + + return min(task_top_pi_waiter(task)->pi_list_entry.prio, + task->normal_prio); +} + +/* + * Adjust the priority of a task, after its pi_waiters got modified. + * + * This can be both boosting and unboosting. task->pi_lock must be held. + */ +static void __rt_mutex_adjust_prio(struct task_struct *task) +{ + int prio = rt_mutex_getprio(task); + + if (task->prio != prio) + rt_mutex_setprio(task, prio); +} + +/* + * Adjust task priority (undo boosting). Called from the exit path of + * rt_mutex_slowunlock() and rt_mutex_slowlock(). + * + * (Note: We do this outside of the protection of lock->wait_lock to + * allow the lock to be taken while or before we readjust the priority + * of task. We do not use the spin_xx_mutex() variants here as we are + * outside of the debug path.) + */ +static void rt_mutex_adjust_prio(struct task_struct *task) +{ + unsigned long flags; + + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); +} + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Adjust the priority chain. Also used for deadlock detection. + * Decreases task's usage by one - may thus free the task. + * Returns 0 or -EDEADLK. + */ +static int rt_mutex_adjust_prio_chain(task_t *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task + __IP_DECL__) +{ + struct rt_mutex *lock; + struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; + int detect_deadlock, ret = 0, depth = 0; + unsigned long flags; + + detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, + deadlock_detect); + + /* + * The (de)boosting is a step by step approach with a lot of + * pitfalls. We want this to be preemptible and we want hold a + * maximum of two locks per step. So we have to check + * carefully whether things change under us. 
+ */ + again: + if (++depth > max_lock_depth) { + static int prev_max; + + /* + * Print this only once. If the admin changes the limit, + * print a new message when reaching the limit again. + */ + if (prev_max != max_lock_depth) { + prev_max = max_lock_depth; + printk(KERN_WARNING "Maximum lock depth %d reached " + "task: %s (%d)\n", max_lock_depth, + top_task->comm, top_task->pid); + } + put_task_struct(task); + + return deadlock_detect ? -EDEADLK : 0; + } + retry: + /* + * Task can not go away as we did a get_task() before ! + */ + spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + /* + * Check whether the end of the boosting chain has been + * reached or the state of the chain has changed while we + * dropped the locks. + */ + if (!waiter || !waiter->task) + goto out_unlock_pi; + + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; + + /* + * When deadlock detection is off then we check, if further + * priority adjustment is necessary. + */ + if (!detect_deadlock && waiter->list_entry.prio == task->prio) + goto out_unlock_pi; + + lock = waiter->lock; + if (!spin_trylock(&lock->wait_lock)) { + spin_unlock_irqrestore(&task->pi_lock, flags); + cpu_relax(); + goto retry; + } + + /* Deadlock detection */ + if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { + debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + spin_unlock(&lock->wait_lock); + ret = deadlock_detect ? -EDEADLK : 0; + goto out_unlock_pi; + } + + top_waiter = rt_mutex_top_waiter(lock); + + /* Requeue the waiter */ + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->list_entry.prio = task->prio; + plist_add(&waiter->list_entry, &lock->wait_list); + + /* Release the task */ + spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + spin_lock_irqsave(&task->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ + plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + + } else if (top_waiter == waiter) { + /* Deboost the owner */ + plist_del(&waiter->pi_list_entry, &task->pi_waiters); + waiter = rt_mutex_top_waiter(lock); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + } + + get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + top_waiter = rt_mutex_top_waiter(lock); + spin_unlock(&lock->wait_lock); + + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; + + goto again; + + out_unlock_pi: + spin_unlock_irqrestore(&task->pi_lock, flags); + out_put_task: + put_task_struct(task); + return ret; +} + +/* + * Optimization: check if we can steal the lock from the + * assigned pending owner [which might not have taken the + * lock yet]: + */ +static inline int try_to_steal_lock(struct rt_mutex *lock) +{ + struct task_struct *pendowner = rt_mutex_owner(lock); + struct rt_mutex_waiter *next; + unsigned long flags; + + if (!rt_mutex_owner_pending(lock)) + return 0; + + if (pendowner == current) + return 1; + + spin_lock_irqsave(&pendowner->pi_lock, flags); + if (current->prio >= pendowner->prio) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 0; + } + + /* + * Check if a waiter is enqueued on the pending owners + * 
pi_waiters list. Remove it and readjust pending owners + * priority. + */ + if (likely(!rt_mutex_has_waiters(lock))) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 1; + } + + /* No chain handling, pending owner is not blocked on anything: */ + next = rt_mutex_top_waiter(lock); + plist_del(&next->pi_list_entry, &pendowner->pi_waiters); + __rt_mutex_adjust_prio(pendowner); + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + /* + * We are going to steal the lock and a waiter was + * enqueued on the pending owners pi_waiters queue. So + * we have to enqueue this waiter into + * current->pi_waiters list. This covers the case, + * where current is boosted because it holds another + * lock and gets unboosted because the booster is + * interrupted, so we would delay a waiter with higher + * priority as current->normal_prio. + * + * Note: in the rare case of a SCHED_OTHER task changing + * its priority and thus stealing the lock, next->task + * might be current: + */ + if (likely(next->task != current)) { + spin_lock_irqsave(¤t->pi_lock, flags); + plist_add(&next->pi_list_entry, ¤t->pi_waiters); + __rt_mutex_adjust_prio(current); + spin_unlock_irqrestore(¤t->pi_lock, flags); + } + return 1; +} + +/* + * Try to take an rt-mutex + * + * This fails + * - when the lock has a real owner + * - when a different pending owner exists and has higher priority than current + * + * Must be called with lock->wait_lock held. + */ +static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) +{ + /* + * We have to be careful here if the atomic speedups are + * enabled, such that, when + * - no other waiter is on the lock + * - the lock has been released since we did the cmpxchg + * the lock can be released or taken while we are doing the + * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * + * The atomic acquire/release aware variant of + * mark_rt_mutex_waiters uses a cmpxchg loop. After setting + * the WAITERS bit, the atomic release / acquire can not + * happen anymore and lock->wait_lock protects us from the + * non-atomic case. + * + * Note, that this might set lock->owner = + * RT_MUTEX_HAS_WAITERS in the case the lock is not contended + * any more. This is fixed up when we take the ownership. + * This is the transitional state explained at the top of this file. + */ + mark_rt_mutex_waiters(lock); + + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + return 0; + + /* We got the lock. */ + debug_rt_mutex_lock(lock __IP__); + + rt_mutex_set_owner(lock, current, 0); + + rt_mutex_deadlock_account_lock(lock, current); + + return 1; +} + +/* + * Task blocks on lock. + * + * Prepare waiter and propagate pi chain + * + * This must be called with lock->wait_lock held. 
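+ *
+ * Returns 0 on success, or -EDEADLK from the priority chain walk when
+ * deadlock detection is enabled and a cycle (or an overly deep chain)
+ * is found.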
+ */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + int detect_deadlock + __IP_DECL__) +{ + struct rt_mutex_waiter *top_waiter = waiter; + task_t *owner = rt_mutex_owner(lock); + int boost = 0, res; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + __rt_mutex_adjust_prio(current); + waiter->task = current; + waiter->lock = lock; + plist_node_init(&waiter->list_entry, current->prio); + plist_node_init(&waiter->pi_list_entry, current->prio); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + current->pi_blocked_on = waiter; + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + if (owner->pi_blocked_on) { + boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + if (!boost) + return 0; + + spin_unlock(&lock->wait_lock); + + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, + current __IP__); + + spin_lock(&lock->wait_lock); + + return res; +} + +/* + * Wake up the next waiter on the lock. + * + * Remove the top waiter from the current tasks waiter list and from + * the lock waiter list. Set it as pending owner. Then wake it up. + * + * Called with lock->wait_lock held. + */ +static void wakeup_next_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *waiter; + struct task_struct *pendowner; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + + waiter = rt_mutex_top_waiter(lock); + plist_del(&waiter->list_entry, &lock->wait_list); + + /* + * Remove it from current->pi_waiters. We do not adjust a + * possible priority boost right now. We execute wakeup in the + * boosted mode and go back to normal after releasing + * lock->wait_lock. + */ + plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + pendowner = waiter->task; + waiter->task = NULL; + + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + /* + * Clear the pi_blocked_on variable and enqueue a possible + * waiter into the pi_waiters list of the pending owner. This + * prevents that in case the pending owner gets unboosted a + * waiter with higher priority than pending-owner->normal_prio + * is blocked on the unboosted (pending) owner. 
+ */ + spin_lock_irqsave(&pendowner->pi_lock, flags); + + WARN_ON(!pendowner->pi_blocked_on); + WARN_ON(pendowner->pi_blocked_on != waiter); + WARN_ON(pendowner->pi_blocked_on->lock != lock); + + pendowner->pi_blocked_on = NULL; + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &pendowner->pi_waiters); + } + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + wake_up_process(pendowner); +} + +/* + * Remove a waiter from a lock + * + * Must be called with lock->wait_lock held + */ +static void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter __IP_DECL__) +{ + int first = (waiter == rt_mutex_top_waiter(lock)); + int boost = 0; + task_t *owner = rt_mutex_owner(lock); + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->task = NULL; + current->pi_blocked_on = NULL; + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (first && owner != current) { + + spin_lock_irqsave(&owner->pi_lock, flags); + + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &owner->pi_waiters); + } + __rt_mutex_adjust_prio(owner); + + if (owner->pi_blocked_on) { + boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + + if (!boost) + return; + + spin_unlock(&lock->wait_lock); + + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__); + + spin_lock(&lock->wait_lock); +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + unsigned long flags; + + spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || waiter->list_entry.prio == task->prio) { + spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__); +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock __IP__)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) + hrtimer_start(&timeout->timer, timeout->timer.expires, + HRTIMER_ABS); + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock __IP__)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? 
*/ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + ret = task_blocks_on_rt_mutex(lock, &waiter, + detect_deadlock __IP__); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter.task)) + continue; + + if (unlikely(ret)) + break; + } + + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + if (waiter.task) + schedule_rt_mutex(lock); + + spin_lock(&lock->wait_lock); + set_current_state(state); + } + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter __IP__); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* Remove pending timer: */ + if (unlikely(timeout)) + hrtimer_cancel(&timeout->timer); + + /* + * Readjust priority, when we did not get the lock. We might + * have been the pending owner and boosted. Since we did not + * take the lock, the PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + debug_rt_mutex_free_waiter(&waiter); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) +{ + int ret = 0; + + spin_lock(&lock->wait_lock); + + if (likely(rt_mutex_owner(lock) != current)) { + + ret = try_to_take_rt_mutex(lock __IP__); + /* + * try_to_take_rt_mutex() sets the lock waiters + * bit unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + } + + spin_unlock(&lock->wait_lock); + + return ret; +} + +/* + * Slow path to release a rt-mutex: + */ +static void __sched +rt_mutex_slowunlock(struct rt_mutex *lock) +{ + spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + spin_unlock(&lock->wait_lock); + + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); +} + +/* + * debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. 
+ */ +static inline int +rt_mutex_fastlock(struct rt_mutex *lock, int state, + int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_fasttrylock(struct rt_mutex *lock, + int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) +{ + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 1; + } + return slowfn(lock __RET_IP__); +} + +static inline void +rt_mutex_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + might_sleep(); + + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller + * + * @lock: the rt_mutex to be locked + * @timeout: timeout structure or NULL (no timeout) + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -ETIMEOUT when the timeout expired + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * Returns 1 on success and 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + rt_mutex_fastunlock(lock, rt_mutex_slowunlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + 
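(Illustrative aside, not part of the patch: the exports above, rt_mutex_lock(), rt_mutex_lock_interruptible(), rt_mutex_timed_lock(), rt_mutex_trylock() and rt_mutex_unlock(), form the whole blocking API this file adds. Below is a minimal usage sketch assuming a kernel built with CONFIG_RT_MUTEXES=y; the module, function and lock names are made up for the example, and it calls __rt_mutex_init()/rt_mutex_destroy() from further down in this file directly rather than any wrapper a header may provide.)

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/rtmutex.h>

	/* hypothetical example lock; the name string is only used by the debug code */
	static struct rt_mutex demo_lock;

	static int __init demo_init(void)
	{
		__rt_mutex_init(&demo_lock, "demo_lock");

		rt_mutex_lock(&demo_lock);		/* may sleep, PI-boosts a lower-prio owner */
		/* ... critical section ... */
		rt_mutex_unlock(&demo_lock);		/* wakes the top waiter, drops any PI boost */

		if (rt_mutex_trylock(&demo_lock)) {	/* returns 1 on success, 0 on contention */
			/* ... */
			rt_mutex_unlock(&demo_lock);
		}
		return 0;
	}

	static void __exit demo_exit(void)
	{
		rt_mutex_destroy(&demo_lock);		/* lock must not be held here */
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");

With debugging disabled on a cmpxchg-capable architecture, the lock and unlock calls in the sketch reduce to a single atomic cmpxchg of lock->owner; only on contention, or when deadlock detection is requested, do they enter the slowpaths defined earlier in this file.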
+/*** + * rt_mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void rt_mutex_destroy(struct rt_mutex *lock) +{ + WARN_ON(rt_mutex_is_locked(lock)); +#ifdef CONFIG_DEBUG_RT_MUTEXES + lock->magic = NULL; +#endif +} + +EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +/** + * __rt_mutex_init - initialize the rt lock + * + * @lock: the rt lock to be initialized + * + * Initialize the rt lock to unlocked state. + * + * Initializing of a locked rt lock is not allowed + */ +void __rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + lock->owner = NULL; + spin_lock_init(&lock->wait_lock); + plist_head_init(&lock->wait_list, &lock->wait_lock); + + debug_rt_mutex_init(lock, name); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + * proxy owner + * + * @lock: the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + __rt_mutex_init(lock, NULL); + debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); + rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_deadlock_account_lock(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: the rt_mutex to be locked + * + * No locking. Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + debug_rt_mutex_proxy_unlock(lock); + rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_deadlock_account_unlock(proxy_owner); +} + +/** + * rt_mutex_next_owner - return the next owner of the lock + * + * @lock: the rt lock query + * + * Returns the next owner of the lock or NULL + * + * Caller has to serialize against other accessors to the lock + * itself. + * + * Special API call for PI-futex support + */ +struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + return NULL; + + return rt_mutex_top_waiter(lock)->task; +} diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 00000000000..1e0fca13ff7 --- /dev/null +++ b/kernel/rtmutex.h @@ -0,0 +1,29 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains macros used solely by rtmutex.c. + * Non-debug version. 
+ */ + +#define __IP_DECL__ +#define __IP__ +#define __RET_IP__ +#define rt_mutex_deadlock_check(l) (0) +#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) +#define rt_mutex_deadlock_account_unlock(l) do { } while (0) +#define debug_rt_mutex_init_waiter(w) do { } while (0) +#define debug_rt_mutex_free_waiter(w) do { } while (0) +#define debug_rt_mutex_lock(l) do { } while (0) +#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) +#define debug_rt_mutex_proxy_unlock(l) do { } while (0) +#define debug_rt_mutex_unlock(l) do { } while (0) +#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) +#define debug_rt_mutex_print_deadlock(w) do { } while (0) +#define debug_rt_mutex_detect_deadlock(w,d) (d) +#define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 00000000000..9c75856e791 --- /dev/null +++ b/kernel/rtmutex_common.h @@ -0,0 +1,123 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains the private data structure and API definitions. + */ + +#ifndef __KERNEL_RTMUTEX_COMMON_H +#define __KERNEL_RTMUTEX_COMMON_H + +#include <linux/rtmutex.h> + +/* + * The rtmutex in-kernel tester is independent of rtmutex debugging. We + * call schedule_rt_mutex_test() instead of schedule() for the tasks which + * belong to the tester. That way we can delay the wakeup path of those + * threads to provoke lock stealing and testing of complex boosting scenarios. + */ +#ifdef CONFIG_RT_MUTEX_TESTER + +extern void schedule_rt_mutex_test(struct rt_mutex *lock); + +#define schedule_rt_mutex(_lock) \ + do { \ + if (!(current->flags & PF_MUTEX_TESTER)) \ + schedule(); \ + else \ + schedule_rt_mutex_test(_lock); \ + } while (0) + +#else +# define schedule_rt_mutex(_lock) schedule() +#endif + +/* + * This is the control structure for tasks blocked on a rt_mutex, + * which is allocated on the kernel stack of the blocked task. 
+ * + * @list_entry: pi node to enqueue into the mutex waiters list + * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @task: task reference to the blocked task + */ +struct rt_mutex_waiter { + struct plist_node list_entry; + struct plist_node pi_list_entry; + struct task_struct *task; + struct rt_mutex *lock; +#ifdef CONFIG_DEBUG_RT_MUTEXES + unsigned long ip; + pid_t deadlock_task_pid; + struct rt_mutex *deadlock_lock; +#endif +}; + +/* + * Various helpers to access the waiters-plist: + */ +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return !plist_head_empty(&lock->wait_list); +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *w; + + w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, + list_entry); + BUG_ON(w->lock != lock); + + return w; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return !plist_head_empty(&p->pi_waiters); +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, + pi_list_entry); +} + +/* + * lock->owner state tracking: + */ +#define RT_MUTEX_OWNER_PENDING 1UL +#define RT_MUTEX_HAS_WAITERS 2UL +#define RT_MUTEX_OWNER_MASKALL 3UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); +} + +static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) +{ + return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; +} + +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); +#endif diff --git a/kernel/sched.c b/kernel/sched.c index a856040c200..2629c1711fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -168,15 +168,21 @@ */ #define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) -static unsigned int task_timeslice(task_t *p) +static unsigned int static_prio_timeslice(int static_prio) { - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); + return SCALE_PRIO(DEF_TIMESLICE, static_prio); } + +static inline unsigned int task_timeslice(task_t *p) +{ + return static_prio_timeslice(p->static_prio); +} + #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) @@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p) * These are the runqueue data structures: */ -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - typedef struct runqueue runqueue_t; struct prio_array { unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; + DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ struct list_head queue[MAX_PRIO]; }; @@ -209,6 +213,7 @@ struct runqueue { * remote CPUs use both 
these fields when doing load calculation. */ unsigned long nr_running; + unsigned long raw_weighted_load; #ifdef CONFIG_SMP unsigned long cpu_load[3]; #endif @@ -239,7 +244,6 @@ struct runqueue { task_t *migration_thread; struct list_head migration_queue; - int cpu; #endif #ifdef CONFIG_SCHEDSTATS @@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. + */ +static inline runqueue_t *__task_rq_lock(task_t *p) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock(&rq->lock); + goto repeat_lock_task; + } + return rq; +} + +/* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) __acquires(rq->lock) { struct runqueue *rq; @@ -371,6 +394,12 @@ repeat_lock_task: return rq; } +static inline void __task_rq_unlock(runqueue_t *rq) + __releases(rq->lock) +{ + spin_unlock(&rq->lock); +} + static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) __releases(rq->lock) { @@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) } /* - * effective_prio - return the priority that is based on the static + * __normal_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. * * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] @@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) * * Both properties are important to certain workloads. */ -static int effective_prio(task_t *p) + +static inline int __normal_prio(task_t *p) { int bonus, prio; - if (rt_task(p)) - return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; @@ -665,6 +692,106 @@ static int effective_prio(task_t *p) } /* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +/* + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE + * If static_prio_timeslice() is ever changed to break this assumption then + * this code will need modification + */ +#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +#define LOAD_WEIGHT(lp) \ + (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +#define PRIO_TO_LOAD_WEIGHT(prio) \ + LOAD_WEIGHT(static_prio_timeslice(prio)) +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) + +static void set_load_weight(task_t *p) +{ + if (has_rt_policy(p)) { +#ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. + * Giving its load any weight will skew balancing + * adversely. 
+ */ + p->load_weight = 0; + else +#endif + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); +} + +static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) +{ + rq->raw_weighted_load += p->load_weight; +} + +static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) +{ + rq->raw_weighted_load -= p->load_weight; +} + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; + inc_raw_weighted_load(rq, p); +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; + dec_raw_weighted_load(rq, p); +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(task_t *p) +{ + int prio; + + if (has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(task_t *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/* * __activate_task - move a task to the runqueue. */ static void __activate_task(task_t *p, runqueue_t *rq) @@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) if (batch_task(p)) target = rq->expired; enqueue_task(p, target); - rq->nr_running++; + inc_nr_running(p, rq); } /* @@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq) static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); - rq->nr_running++; + inc_nr_running(p, rq); } +/* + * Recalculate p->normal_prio and p->prio after having slept, + * updating the sleep-average too: + */ static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; + unsigned long sleep_time = now - p->timestamp; if (batch_task(p)) sleep_time = 0; - else { - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - } if (likely(sleep_time > 0)) { /* - * User tasks that sleep a long time are categorised as - * idle. They will only have their sleep_avg increased to a - * level that makes them just interactive priority to stay - * active yet prevent them suddenly becoming cpu hogs and - * starving other processes. + * This ceiling is set to the lowest priority that would allow + * a task to be reinserted into the active array on timeslice + * completion. 
*/ - if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { - unsigned long ceiling; + unsigned long ceiling = INTERACTIVE_SLEEP(p); - ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - if (p->sleep_avg < ceiling) - p->sleep_avg = ceiling; + if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { + /* + * Prevents user tasks from achieving best priority + * with one single large enough sleep. + */ + p->sleep_avg = ceiling; + /* + * Using INTERACTIVE_SLEEP() as a ceiling places a + * nice(0) task 1ms sleep away from promotion, and + * gives it 700ms to round-robin with no chance of + * being demoted. This is more than generous, so + * mark this sleep as non-interactive to prevent the + * on-runqueue bonus logic from intervening should + * this task not receive cpu immediately. + */ + p->sleep_type = SLEEP_NONINTERACTIVE; } else { /* * Tasks waking from uninterruptible sleep are @@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) * are likely to be waiting on I/O */ if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + if (p->sleep_avg >= ceiling) sleep_time = 0; else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; + ceiling) { + p->sleep_avg = ceiling; + sleep_time = 0; } } @@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) */ p->sleep_avg += sleep_time; - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; } + if (p->sleep_avg > NS_MAX_SLEEP_AVG) + p->sleep_avg = NS_MAX_SLEEP_AVG; } return effective_prio(p); @@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - rq->nr_running--; + dec_nr_running(p, rq); dequeue_task(p, p->array); p->array = NULL; } @@ -860,6 +993,12 @@ inline int task_curr(const task_t *p) return cpu_curr(task_cpu(p)) == p; } +/* Used instead of source_load when we know the type == 0 */ +unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->raw_weighted_load; +} + #ifdef CONFIG_SMP typedef struct { struct list_head list; @@ -949,7 +1088,8 @@ void kick_process(task_t *p) } /* - * Return a low guess at the load of a migration-source cpu. + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. * * We want to under-estimate the load of migration sources, to * balance conservatively. @@ -957,24 +1097,36 @@ void kick_process(task_t *p) static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + return rq->raw_weighted_load; - return min(rq->cpu_load[type-1], load_now); + return min(rq->cpu_load[type-1], rq->raw_weighted_load); } /* - * Return a high guess at the load of a migration-target cpu + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. 
*/ static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + return rq->raw_weighted_load; + + return max(rq->cpu_load[type-1], rq->raw_weighted_load); +} + +/* + * Return the average load per task on the cpu's run queue + */ +static inline unsigned long cpu_avg_load_per_task(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long n = rq->nr_running; - return max(rq->cpu_load[type-1], load_now); + return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; } /* @@ -1047,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) cpus_and(tmp, group->cpumask, p->cpus_allowed); for_each_cpu_mask(i, tmp) { - load = source_load(i, 0); + load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; @@ -1074,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag) struct task_struct *t = current; struct sched_domain *tmp, *sd = NULL; - for_each_domain(cpu, tmp) + for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, stop there. + */ + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + break; if (tmp->flags & flag) sd = tmp; + } while (sd) { cpumask_t span; @@ -1226,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; + unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: */ if (sync) - tl -= SCHED_LOAD_SCALE; + tl -= current->load_weight; if ((tl <= load && - tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || - 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + tl + target_load(cpu, idx) <= tl_per_task) || + 100*(tl + p->load_weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1353,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; + + /* + * Make sure we do not leak PI boosting priority to the child: + */ + p->prio = current->normal_prio; + INIT_LIST_HEAD(&p->run_list); p->array = NULL; #ifdef CONFIG_SCHEDSTATS @@ -1432,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) __activate_task(p, rq); else { p->prio = current->prio; + p->normal_prio = current->normal_prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; - rq->nr_running++; + inc_nr_running(p, rq); } set_need_resched(); } else @@ -1653,7 +1820,8 @@ unsigned long nr_uninterruptible(void) unsigned long long nr_context_switches(void) { - unsigned long long i, sum = 0; + int i; + unsigned long long sum = 0; for_each_possible_cpu(i) sum += cpu_rq(i)->nr_switches; @@ -1691,9 +1859,6 @@ unsigned long nr_active(void) /* * double_rq_lock - safely lock two runqueues * - * We must take them in cpu order to match code in - * dependent_sleeper and wake_dependent_sleeper. - * * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. 
*/ @@ -1705,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { - if (rq1->cpu < rq2->cpu) { + if (rq1 < rq2) { spin_lock(&rq1->lock); spin_lock(&rq2->lock); } else { @@ -1741,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) __acquires(this_rq->lock) { if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest->cpu < this_rq->cpu) { + if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); @@ -1804,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - src_rq->nr_running--; + dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); - this_rq->nr_running++; + inc_nr_running(p, this_rq); enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; @@ -1853,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) /* - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, - * as part of a balancing operation within "domain". Returns the number of - * tasks moved. + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted + * load from busiest to this_rq, as part of a balancing operation within + * "domain". Returns the number of tasks moved. * * Called with both runqueues locked. */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle, int *all_pinned) + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0, pinned = 0; + int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; + int busiest_best_prio_seen; + int skip_for_load; /* skip the task based on weighted load issues */ + long rem_load_move; task_t *tmp; - if (max_nr_move == 0) + if (max_nr_move == 0 || max_load_move == 0) goto out; + rem_load_move = max_load_move; pinned = 1; + this_best_prio = rq_best_prio(this_rq); + busiest_best_prio = rq_best_prio(busiest); + /* + * Enable handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==busiest_best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) of + * any task we find with that prio. + */ + busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; /* * We first consider expired tasks. Those will likely not be @@ -1912,7 +2093,17 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + /* + * To help distribute high priority tasks accross CPUs we don't + * skip a task if it will be the highest priority task (i.e. 
smallest + * prio value) on its new queue regardless of its load weight + */ + skip_for_load = tmp->load_weight > rem_load_move; + if (skip_for_load && idx < this_best_prio) + skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio; + if (skip_for_load || + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + busiest_best_prio_seen |= idx == busiest_best_prio; if (curr != head) goto skip_queue; idx++; @@ -1926,9 +2117,15 @@ skip_queue: pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; + rem_load_move -= tmp->load_weight; - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { + /* + * We only want to steal up to the prescribed number of tasks + * and the prescribed amount of weighted load. + */ + if (pulled < max_nr_move && rem_load_move > 0) { + if (idx < this_best_prio) + this_best_prio = idx; if (curr != head) goto skip_queue; idx++; @@ -1949,7 +2146,7 @@ out: /* * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be + * domain. It calculates and returns the amount of weighted load which should be * moved to restore balance via the imbalance parameter. */ static struct sched_group * @@ -1959,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_pull; + unsigned long busiest_load_per_task, busiest_nr_running; + unsigned long this_load_per_task, this_nr_running; int load_idx; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance = 1; + unsigned long leader_nr_running = 0, min_load_per_task = 0; + unsigned long min_nr_running = ULONG_MAX; + struct sched_group *group_min = NULL, *group_leader = NULL; +#endif max_load = this_load = total_load = total_pwr = 0; + busiest_load_per_task = busiest_nr_running = 0; + this_load_per_task = this_nr_running = 0; if (idle == NOT_IDLE) load_idx = sd->busy_idx; else if (idle == NEWLY_IDLE) @@ -1970,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load_idx = sd->idle_idx; do { - unsigned long load; + unsigned long load, group_capacity; int local_group; int i; + unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ - avg_load = 0; + sum_weighted_load = sum_nr_running = avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + runqueue_t *rq = cpu_rq(i); + if (*sd_idle && !idle_cpu(i)) *sd_idle = 0; @@ -1990,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load = source_load(i, load_idx); avg_load += load; + sum_nr_running += rq->nr_running; + sum_weighted_load += rq->raw_weighted_load; } total_load += avg_load; @@ -1998,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Adjust by relative CPU power of the group */ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + if (local_group) { this_load = avg_load; this = group; - } else if (avg_load > max_load) { + this_nr_running = sum_nr_running; + this_load_per_task = sum_weighted_load; + } else if (avg_load > max_load && + sum_nr_running > group_capacity) { max_load = avg_load; busiest = group; + busiest_nr_running = sum_nr_running; + busiest_load_per_task = sum_weighted_load; } + +#if 
defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (this_nr_running >= group_capacity || + !this_nr_running)) + power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!power_savings_balance || sum_nr_running >= group_capacity + || !sum_nr_running) + goto group_next; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && + first_cpu(group->cpumask) < + first_cpu(group_min->cpumask))) { + group_min = group; + min_nr_running = sum_nr_running; + min_load_per_task = sum_weighted_load / + sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } + +group_next: +#endif group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) + if (!busiest || this_load >= max_load || busiest_nr_running == 0) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; @@ -2017,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; + busiest_load_per_task /= busiest_nr_running; /* * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to @@ -2028,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ + if (max_load <= busiest_load_per_task) + goto out_balanced; + + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) 
+ */ + if (max_load < avg_load) { + *imbalance = 0; + goto small_imbalance; + } /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * busiest->cpu_power, (avg_load - this_load) * this->cpu_power) / SCHED_LOAD_SCALE; - if (*imbalance < SCHED_LOAD_SCALE) { - unsigned long pwr_now = 0, pwr_move = 0; + /* + * if *imbalance is less than the average load per runnable task + * there is no gaurantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < busiest_load_per_task) { + unsigned long pwr_now, pwr_move; unsigned long tmp; + unsigned int imbn; + +small_imbalance: + pwr_move = pwr_now = 0; + imbn = 2; + if (this_nr_running) { + this_load_per_task /= this_nr_running; + if (busiest_load_per_task > this_load_per_task) + imbn = 1; + } else + this_load_per_task = SCHED_LOAD_SCALE; - if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; + if (max_load - this_load >= busiest_load_per_task * imbn) { + *imbalance = busiest_load_per_task; return busiest; } @@ -2052,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * moving them. */ - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now += busiest->cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->cpu_power * + min(this_load_per_task, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); + pwr_move += busiest->cpu_power * + min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ if (max_load*busiest->cpu_power < - SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) + busiest_load_per_task*SCHED_LOAD_SCALE) tmp = max_load*busiest->cpu_power/this->cpu_power; else - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; + pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ if (pwr_move <= pwr_now) goto out_balanced; - *imbalance = 1; - return busiest; + *imbalance = busiest_load_per_task; } - /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = *imbalance / SCHED_LOAD_SCALE; return busiest; out_balanced: +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto ret; + if (this == group_leader && group_leader != group_min) { + *imbalance = min_load_per_task; + return group_min; + } +ret: +#endif *imbalance = 0; return NULL; } @@ -2093,18 +2406,21 @@ out_balanced: * find_busiest_queue - find the busiest runqueue among the cpus in group. 
*/ static runqueue_t *find_busiest_queue(struct sched_group *group, - enum idle_type idle) + enum idle_type idle, unsigned long imbalance) { - unsigned long load, max_load = 0; - runqueue_t *busiest = NULL; + unsigned long max_load = 0; + runqueue_t *busiest = NULL, *rqi; int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i, 0); + rqi = cpu_rq(i); + + if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance) + continue; - if (load > max_load) { - max_load = load; - busiest = cpu_rq(i); + if (rqi->raw_weighted_load > max_load) { + max_load = rqi->raw_weighted_load; + busiest = rqi; } } @@ -2117,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, */ #define MAX_PINNED_INTERVAL 512 +#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2133,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, int active_balance = 0; int sd_idle = 0; - if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); @@ -2144,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - busiest = find_busiest_queue(group, idle); + busiest = find_busiest_queue(group, idle, imbalance); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2164,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, + minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); @@ -2221,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) return -1; return nr_moved; @@ -2236,7 +2556,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) return -1; return 0; } @@ -2257,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, int nr_moved = 0; int sd_idle = 0; - if (sd->flags & SD_SHARE_CPUPOWER) + if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); @@ -2267,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE); + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; @@ -2282,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, + minus_1_or_zero(busiest->nr_running), imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); } @@ -2297,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) return -1; sd->nr_balance_failed = 0; return 0; @@ -2352,17 +2673,19 @@ static 
void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. */ - for_each_domain(target_cpu, sd) + for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpu_isset(busiest_cpu, sd->span)) break; + } if (unlikely(sd == NULL)) goto out; schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -2390,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd; int i; - this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + this_load = this_rq->raw_weighted_load; /* Update our load */ for (i = 0; i < 3; i++) { unsigned long new_load = this_load; @@ -2691,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) resched_task(rq->idle); } -static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +/* + * Called with interrupt disabled and this_rq's runqueue locked. + */ +static void wake_sleeping_dependent(int this_cpu) { struct sched_domain *tmp, *sd = NULL; - cpumask_t sibling_map; int i; - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_SHARE_CPUPOWER) + for_each_domain(this_cpu, tmp) { + if (tmp->flags & SD_SHARE_CPUPOWER) { sd = tmp; + break; + } + } if (!sd) return; - /* - * Unlock the current runqueue because we have to lock in - * CPU order to avoid deadlocks. Caller knows that we might - * unlock. We keep IRQs disabled. - */ - spin_unlock(&this_rq->lock); - - sibling_map = sd->span; - - for_each_cpu_mask(i, sibling_map) - spin_lock(&cpu_rq(i)->lock); - /* - * We clear this CPU from the mask. This both simplifies the - * inner loop and keps this_rq locked when we exit: - */ - cpu_clear(this_cpu, sibling_map); - - for_each_cpu_mask(i, sibling_map) { + for_each_cpu_mask(i, sd->span) { runqueue_t *smt_rq = cpu_rq(i); + if (i == this_cpu) + continue; + if (unlikely(!spin_trylock(&smt_rq->lock))) + continue; + wakeup_busy_runqueue(smt_rq); + spin_unlock(&smt_rq->lock); } - - for_each_cpu_mask(i, sibling_map) - spin_unlock(&cpu_rq(i)->lock); - /* - * We exit with this_cpu's rq still held and IRQs - * still disabled: - */ } /* @@ -2745,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) return p->time_slice * (100 - sd->per_cpu_gain) / 100; } -static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +/* + * To minimise lock contention and not have to drop this_rq's runlock we only + * trylock the sibling runqueues and bypass those runqueues if we fail to + * acquire their lock. As we only trylock the normal locking order does not + * need to be obeyed. 
+ */ +static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) { struct sched_domain *tmp, *sd = NULL; - cpumask_t sibling_map; - prio_array_t *array; int ret = 0, i; - task_t *p; - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_SHARE_CPUPOWER) + /* kernel/rt threads do not participate in dependent sleeping */ + if (!p->mm || rt_task(p)) + return 0; + + for_each_domain(this_cpu, tmp) { + if (tmp->flags & SD_SHARE_CPUPOWER) { sd = tmp; + break; + } + } if (!sd) return 0; - /* - * The same locking rules and details apply as for - * wake_sleeping_dependent(): - */ - spin_unlock(&this_rq->lock); - sibling_map = sd->span; - for_each_cpu_mask(i, sibling_map) - spin_lock(&cpu_rq(i)->lock); - cpu_clear(this_cpu, sibling_map); + for_each_cpu_mask(i, sd->span) { + runqueue_t *smt_rq; + task_t *smt_curr; - /* - * Establish next task to be run - it might have gone away because - * we released the runqueue lock above: - */ - if (!this_rq->nr_running) - goto out_unlock; - array = this_rq->active; - if (!array->nr_active) - array = this_rq->expired; - BUG_ON(!array->nr_active); + if (i == this_cpu) + continue; - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, - task_t, run_list); + smt_rq = cpu_rq(i); + if (unlikely(!spin_trylock(&smt_rq->lock))) + continue; - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq = cpu_rq(i); - task_t *smt_curr = smt_rq->curr; + smt_curr = smt_rq->curr; - /* Kernel threads do not participate in dependent sleeping */ - if (!p->mm || !smt_curr->mm || rt_task(p)) - goto check_smt_task; + if (!smt_curr->mm) + goto unlock; /* * If a user task with lower static priority than the @@ -2808,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) if ((jiffies % DEF_TIMESLICE) > (sd->per_cpu_gain * DEF_TIMESLICE / 100)) ret = 1; - } else + } else { if (smt_curr->static_prio < p->static_prio && !TASK_PREEMPTS_CURR(p, smt_rq) && smt_slice(smt_curr, sd) > task_timeslice(p)) ret = 1; - -check_smt_task: - if ((!smt_curr->mm && smt_curr != smt_rq->idle) || - rt_task(smt_curr)) - continue; - if (!p->mm) { - wakeup_busy_runqueue(smt_rq); - continue; - } - - /* - * Reschedule a lower priority task on the SMT sibling for - * it to be put to sleep, or wake it up if it has been put to - * sleep for priority reasons to see if it should run now. 
- */ - if (rt_task(p)) { - if ((jiffies % DEF_TIMESLICE) > - (sd->per_cpu_gain * DEF_TIMESLICE / 100)) - resched_task(smt_curr); - } else { - if (TASK_PREEMPTS_CURR(p, smt_rq) && - smt_slice(p, sd) > task_timeslice(smt_curr)) - resched_task(smt_curr); - else - wakeup_busy_runqueue(smt_rq); } +unlock: + spin_unlock(&smt_rq->lock); } -out_unlock: - for_each_cpu_mask(i, sibling_map) - spin_unlock(&cpu_rq(i)->lock); return ret; } #else -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +static inline void wake_sleeping_dependent(int this_cpu) { } -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, + task_t *p) { return 0; } @@ -2972,32 +3251,13 @@ need_resched_nonpreemptible: cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { -go_idle: idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - /* - * wake_sleeping_dependent() might have released - * the runqueue, so break out if we got new - * tasks meanwhile: - */ - if (!rq->nr_running) - goto switch_tasks; - } - } else { - if (dependent_sleeper(cpu, rq)) { - next = rq->idle; + wake_sleeping_dependent(cpu); goto switch_tasks; } - /* - * dependent_sleeper() releases and reacquires the runqueue - * lock, hence go into the idle loop if the rq went - * empty meanwhile: - */ - if (unlikely(!rq->nr_running)) - goto go_idle; } array = rq->active; @@ -3035,6 +3295,8 @@ go_idle: } } next->sleep_type = SLEEP_NORMAL; + if (dependent_sleeper(cpu, rq, next)) + next = rq->idle; switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); @@ -3478,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(sleep_on_timeout); +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(task_t *p, int prio) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! 
+ */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); +} + +#endif + void set_user_nice(task_t *p, long nice) { unsigned long flags; prio_array_t *array; runqueue_t *rq; - int old_prio, new_prio, delta; + int old_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3498,22 +3813,25 @@ void set_user_nice(task_t *p, long nice) * it wont have any effect on scheduling until the task is * not SCHED_NORMAL/SCHED_BATCH: */ - if (rt_task(p)) { + if (has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } array = p->array; - if (array) + if (array) { dequeue_task(p, array); + dec_raw_weighted_load(rq, p); + } - old_prio = p->prio; - new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio += delta; + set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; if (array) { enqueue_task(p, array); + inc_raw_weighted_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3524,7 +3842,6 @@ void set_user_nice(task_t *p, long nice) out_unlock: task_rq_unlock(rq, &flags); } - EXPORT_SYMBOL(set_user_nice); /* @@ -3639,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; - if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { - p->prio = MAX_RT_PRIO-1 - p->rt_priority; - } else { - p->prio = p->static_prio; - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; - } + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; + set_load_weight(p); } /** @@ -3667,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; runqueue_t *rq; + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ if (policy < 0) @@ -3715,14 +4033,20 @@ recheck: if (retval) return retval; /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + */ + spin_lock_irqsave(&p->pi_lock, flags); + /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. 
*/ - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } array = p->array; @@ -3743,7 +4067,11 @@ recheck: } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + return 0; } EXPORT_SYMBOL_GPL(sched_setscheduler); @@ -3765,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) read_unlock_irq(&tasklist_lock); return -ESRCH; } - retval = sched_setscheduler(p, policy, &lparam); + get_task_struct(p); read_unlock_irq(&tasklist_lock); + retval = sched_setscheduler(p, policy, &lparam); + put_task_struct(p); return retval; } @@ -4378,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu) idle->timestamp = sched_clock(); idle->sleep_avg = 0; idle->array = NULL; - idle->prio = MAX_PRIO; + idle->prio = idle->normal_prio = MAX_PRIO; idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -4474,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); * * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. + * + * Returns non-zero if task was successfully migrated. */ -static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { runqueue_t *rq_dest, *rq_src; + int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) - return; + return ret; rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -4508,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } - + ret = 1; out: double_rq_unlock(rq_src, rq_dest); + return ret; } /* @@ -4580,9 +4914,12 @@ wait_to_die: /* Figure out where task on dead CPU should go, use force if neccessary. */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) { + runqueue_t *rq; + unsigned long flags; int dest_cpu; cpumask_t mask; +restart: /* On same node? */ mask = node_to_cpumask(cpu_to_node(dead_cpu)); cpus_and(mask, mask, tsk->cpus_allowed); @@ -4594,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(tsk, &flags); cpus_setall(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); + task_rq_unlock(rq, &flags); /* * Don't tell them about moving exiting tasks or @@ -4607,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) "longer affine to cpu%d\n", tsk->pid, tsk->comm, dead_cpu); } - __migrate_task(tsk, dead_cpu, dest_cpu); + if (!__migrate_task(tsk, dead_cpu, dest_cpu)) + goto restart; } /* @@ -4734,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu) * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. 
*/ -static int migration_call(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int __cpuinit migration_call(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { int cpu = (long)hcpu; struct task_struct *p; @@ -4805,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, /* Register at highest priority so that task migration (migrate_all_tasks) * happens before everything else. */ -static struct notifier_block migration_notifier = { +static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, .priority = 10 }; @@ -5606,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node) } #endif +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we * can switch it on easily if needed. @@ -5621,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu) #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group sched_group_core[NR_CPUS]; +static struct sched_group *sched_group_core_bycpu[NR_CPUS]; #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) @@ -5637,7 +5979,7 @@ static int cpu_to_core_group(int cpu) #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; static int cpu_to_phys_group(int cpu) { #if defined(CONFIG_SCHED_MC) @@ -5694,13 +6036,74 @@ next_sg: } #endif +/* Free memory allocated for various sched_group structures */ +static void free_sched_groups(const cpumask_t *cpu_map) +{ + int cpu; +#ifdef CONFIG_NUMA + int i; + + for_each_cpu_mask(cpu, *cpu_map) { + struct sched_group *sched_group_allnodes + = sched_group_allnodes_bycpu[cpu]; + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (sched_group_allnodes) { + kfree(sched_group_allnodes); + sched_group_allnodes_bycpu[cpu] = NULL; + } + + if (!sched_group_nodes) + continue; + + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +#endif + for_each_cpu_mask(cpu, *cpu_map) { + if (sched_group_phys_bycpu[cpu]) { + kfree(sched_group_phys_bycpu[cpu]); + sched_group_phys_bycpu[cpu] = NULL; + } +#ifdef CONFIG_SCHED_MC + if (sched_group_core_bycpu[cpu]) { + kfree(sched_group_core_bycpu[cpu]); + sched_group_core_bycpu[cpu] = NULL; + } +#endif + } +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -void build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct sched_group *sched_group_phys = NULL; +#ifdef CONFIG_SCHED_MC + struct sched_group *sched_group_core = NULL; +#endif #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; struct sched_group *sched_group_allnodes = NULL; @@ -5708,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, - GFP_ATOMIC); + sched_group_nodes = 
kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return; + return -ENOMEM; } sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif @@ -5738,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map) if (!sched_group_allnodes) { printk(KERN_WARNING "Can not alloc allnodes sched group\n"); - break; + goto error; } sched_group_allnodes_bycpu[i] = sched_group_allnodes; @@ -5759,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map) cpus_and(sd->span, sd->span, *cpu_map); #endif + if (!sched_group_phys) { + sched_group_phys + = kmalloc(sizeof(struct sched_group) * NR_CPUS, + GFP_KERNEL); + if (!sched_group_phys) { + printk (KERN_WARNING "Can not alloc phys sched" + "group\n"); + goto error; + } + sched_group_phys_bycpu[i] = sched_group_phys; + } + p = sd; sd = &per_cpu(phys_domains, i); group = cpu_to_phys_group(i); @@ -5768,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map) sd->groups = &sched_group_phys[group]; #ifdef CONFIG_SCHED_MC + if (!sched_group_core) { + sched_group_core + = kmalloc(sizeof(struct sched_group) * NR_CPUS, + GFP_KERNEL); + if (!sched_group_core) { + printk (KERN_WARNING "Can not alloc core sched" + "group\n"); + goto error; + } + sched_group_core_bycpu[i] = sched_group_core; + } + p = sd; sd = &per_cpu(core_domains, i); group = cpu_to_core_group(i); @@ -5851,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map) domainspan = sched_domain_node_span(i); cpus_and(domainspan, domainspan, *cpu_map); - sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for " + "node %d\n", i); + goto error; + } sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; - if (sd->groups == NULL) { - /* Turn off balancing if we have no groups */ - sd->flags = 0; - } - } - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", i); - continue; } sg->cpu_power = 0; sg->cpumask = nodemask; + sg->next = sg; cpus_or(covered, covered, nodemask); prev = sg; @@ -5887,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(tmp)) continue; - sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sg = kmalloc_node(sizeof(struct sched_group), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for node %d\n", j); - break; + goto error; } sg->cpu_power = 0; sg->cpumask = tmp; + sg->next = prev->next; cpus_or(covered, covered, tmp); prev->next = sg; prev = sg; } - prev->next = sched_group_nodes[i]; } #endif /* Calculate CPU power for physical packages and nodes */ +#ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - int power; struct sched_domain *sd; -#ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; + sd->groups->cpu_power = SCHED_LOAD_SCALE; + } #endif #ifdef CONFIG_SCHED_MC + for_each_cpu_mask(i, *cpu_map) { + int power; + struct sched_domain *sd; sd = &per_cpu(core_domains, i); - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + if (sched_smt_power_savings) + power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); + else + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) * SCHED_LOAD_SCALE / 10; sd->groups->cpu_power = power; + } +#endif + for_each_cpu_mask(i, 
*cpu_map) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_MC sd = &per_cpu(phys_domains, i); + if (i != first_cpu(sd->groups->cpumask)) + continue; - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = 0; + if (sched_mc_power_savings || sched_smt_power_savings) { + int j; + + for_each_cpu_mask(j, sd->groups->cpumask) { + struct sched_domain *sd1; + sd1 = &per_cpu(core_domains, j); + /* + * for each core we will add once + * to the group in physical domain + */ + if (j != first_cpu(sd1->groups->cpumask)) + continue; + + if (sched_smt_power_savings) + sd->groups->cpu_power += sd1->groups->cpu_power; + else + sd->groups->cpu_power += SCHED_LOAD_SCALE; + } + } else + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; #else + int power; sd = &per_cpu(phys_domains, i); - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; + if (sched_smt_power_savings) + power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); + else + power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif } @@ -5962,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map) * Tune cache-hot values: */ calibrate_migration_costs(cpu_map); + + return 0; + +error: + free_sched_groups(cpu_map); + return -ENOMEM; } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; + int err; /* * Setup mask for cpus without special case scheduling requirements. 
@@ -5977,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) */ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); - build_sched_domains(&cpu_default_map); + err = build_sched_domains(&cpu_default_map); + + return err; } static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { -#ifdef CONFIG_NUMA - int i; - int cpu; - - for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; - - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - - if (!sched_group_nodes) - continue; - - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); - struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) - continue; - - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; - } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; - } -#endif + free_sched_groups(cpu_map); } /* @@ -6046,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) * correct sched domains * Call with hotplug lock held */ -void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) { cpumask_t change_map; + int err = 0; cpus_and(*partition1, *partition1, cpu_online_map); cpus_and(*partition2, *partition2, cpu_online_map); @@ -6057,10 +6488,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) /* Detach sched domains from all of the affected cpus */ detach_destroy_domains(&change_map); if (!cpus_empty(*partition1)) - build_sched_domains(partition1); - if (!cpus_empty(*partition2)) - build_sched_domains(partition2); + err = build_sched_domains(partition1); + if (!err && !cpus_empty(*partition2)) + err = build_sched_domains(partition2); + + return err; +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +int arch_reinit_sched_domains(void) +{ + int err; + + lock_cpu_hotplug(); + detach_destroy_domains(&cpu_online_map); + err = arch_init_sched_domains(&cpu_online_map); + unlock_cpu_hotplug(); + + return err; +} + +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) +{ + int ret; + + if (buf[0] != '0' && buf[0] != '1') + return -EINVAL; + + if (smt) + sched_smt_power_savings = (buf[0] == '1'); + else + sched_mc_power_savings = (buf[0] == '1'); + + ret = arch_reinit_sched_domains(); + + return ret ? 
ret : count; +} + +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} +#endif + +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +{ + return sprintf(page, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 0); +} +SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); +#endif + +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +{ + return sprintf(page, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 1); } +SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + #ifdef CONFIG_HOTPLUG_CPU /* @@ -6143,7 +6650,6 @@ void __init sched_init(void) rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); - rq->cpu = i; #endif atomic_set(&rq->nr_iowait, 0); @@ -6158,6 +6664,7 @@ void __init sched_init(void) } } + set_load_weight(&init_task); /* * The boot idle thread does lazy MMU switching as well: */ @@ -6204,11 +6711,12 @@ void normalize_rt_tasks(void) runqueue_t *rq; read_lock_irq(&tasklist_lock); - for_each_process (p) { + for_each_process(p) { if (!rt_task(p)) continue; - rq = task_rq_lock(p, &flags); + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); array = p->array; if (array) @@ -6219,7 +6727,8 @@ void normalize_rt_tasks(void) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); } read_unlock_irq(&tasklist_lock); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 9e2f1c6e73d..8f03e3b89b5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -486,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __devinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b5c3b94e01c..6b76caa2298 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __devinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f1a4eb1a655..93a2c539864 100644 --- a/kernel/sysctl.c 
+++ b/kernel/sysctl.c @@ -133,6 +133,10 @@ extern int acct_parm[]; extern int no_unaligned_warning; #endif +#ifdef CONFIG_RT_MUTEXES +extern int max_lock_depth; +#endif + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -688,6 +692,17 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_RT_MUTEXES + { + .ctl_name = KERN_MAX_LOCK_DEPTH, + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } }; @@ -928,6 +943,18 @@ static ctl_table vm_table[] = { .strategy = &sysctl_jiffies, }, #endif +#ifdef CONFIG_X86_32 + { + .ctl_name = VM_VDSO_ENABLED, + .procname = "vdso_enabled", + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, +#endif { .ctl_name = 0 } }; diff --git a/kernel/timer.c b/kernel/timer.c index 5bb6b7976ee..5a896025306 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1652,7 +1652,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int timer_cpu_notify(struct notifier_block *self, +static int __devinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1672,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block timers_nb = { +static struct notifier_block __devinitdata timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 565cf7a1feb..59f0b42bd89 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) } /* We're holding the cpucontrol mutex here */ -static int workqueue_cpu_callback(struct notifier_block *nfb, +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { diff --git a/lib/Kconfig b/lib/Kconfig index 3de93357f5a..f6299342b88 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -86,4 +86,10 @@ config TEXTSEARCH_BM config TEXTSEARCH_FSM tristate +# +# plist support is select#ed if needed +# +config PLIST + boolean + endmenu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8bab0102ac7..5330911ebd3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -107,6 +107,24 @@ config DEBUG_MUTEXES This allows mutex semantics violations and mutex related deadlocks (lockups) to be detected and reported automatically. +config DEBUG_RT_MUTEXES + bool "RT Mutex debugging, deadlock detection" + depends on DEBUG_KERNEL && RT_MUTEXES + help + This allows rt mutex semantics violations and rt mutex related + deadlocks (lockups) to be detected and reported automatically. + +config DEBUG_PI_LIST + bool + default y + depends on DEBUG_RT_MUTEXES + +config RT_MUTEX_TESTER + bool "Built-in scriptable tester for rt-mutexes" + depends on DEBUG_KERNEL && RT_MUTEXES + help + This option enables a rt-mutex tester. 
+ config DEBUG_SPINLOCK bool "Spinlock debugging" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 79358ad1f11..10c13c9d782 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,6 +25,7 @@ lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o +obj-$(CONFIG_PLIST) += plist.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o ifneq ($(CONFIG_HAVE_DEC_LOCK),y) diff --git a/lib/plist.c b/lib/plist.c new file mode 100644 index 00000000000..3074a02272f --- /dev/null +++ b/lib/plist.c @@ -0,0 +1,118 @@ +/* + * lib/plist.c + * + * Descending-priority-sorted double-linked list + * + * (C) 2002-2003 Intel Corp + * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. + * + * 2001-2005 (c) MontaVista Software, Inc. + * Daniel Walker <dwalker@mvista.com> + * + * (C) 2005 Thomas Gleixner <tglx@linutronix.de> + * + * Simplifications of the original code by + * Oleg Nesterov <oleg@tv-sign.ru> + * + * Licensed under the FSF's GNU Public License v2 or later. + * + * Based on simple lists (include/linux/list.h). + * + * This file contains the add / del functions which are considered to + * be too large to inline. See include/linux/plist.h for further + * information. + */ + +#include <linux/plist.h> +#include <linux/spinlock.h> + +#ifdef CONFIG_DEBUG_PI_LIST + +static void plist_check_prev_next(struct list_head *t, struct list_head *p, + struct list_head *n) +{ + if (n->prev != p || p->next != n) { + printk("top: %p, n: %p, p: %p\n", t, t->next, t->prev); + printk("prev: %p, n: %p, p: %p\n", p, p->next, p->prev); + printk("next: %p, n: %p, p: %p\n", n, n->next, n->prev); + WARN_ON(1); + } +} + +static void plist_check_list(struct list_head *top) +{ + struct list_head *prev = top, *next = top->next; + + plist_check_prev_next(top, prev, next); + while (next != top) { + prev = next; + next = prev->next; + plist_check_prev_next(top, prev, next); + } +} + +static void plist_check_head(struct plist_head *head) +{ + WARN_ON(!head->lock); + if (head->lock) + WARN_ON_SMP(!spin_is_locked(head->lock)); + plist_check_list(&head->prio_list); + plist_check_list(&head->node_list); +} + +#else +# define plist_check_head(h) do { } while (0) +#endif + +/** + * plist_add - add @node to @head + * + * @node: &struct plist_node pointer + * @head: &struct plist_head pointer + */ +void plist_add(struct plist_node *node, struct plist_head *head) +{ + struct plist_node *iter; + + plist_check_head(head); + WARN_ON(!plist_node_empty(node)); + + list_for_each_entry(iter, &head->prio_list, plist.prio_list) { + if (node->prio < iter->prio) + goto lt_prio; + else if (node->prio == iter->prio) { + iter = list_entry(iter->plist.prio_list.next, + struct plist_node, plist.prio_list); + goto eq_prio; + } + } + +lt_prio: + list_add_tail(&node->plist.prio_list, &iter->plist.prio_list); +eq_prio: + list_add_tail(&node->plist.node_list, &iter->plist.node_list); + + plist_check_head(head); +} + +/** + * plist_del - Remove a @node from plist. 
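plist_add() above keeps the list ordered so that the entry with the lowest numerical ->prio, i.e. the highest priority in kernel convention, sits at the front and can be fetched with plist_first(); plist_del(), documented and defined just below, completes the pair. A usage sketch follows, assuming the plist_head_init(), plist_node_init() and plist_head_empty() inline helpers from the include/linux/plist.h that accompanies this file; the waiter structure and function names are invented for illustration.

	#include <linux/kernel.h>
	#include <linux/plist.h>
	#include <linux/spinlock.h>

	struct waiter {
		struct plist_node node;		/* ordered by the node's prio */
		int payload;
	};

	static DEFINE_SPINLOCK(wait_lock);
	static struct plist_head wait_list;

	static void wait_list_setup(void)
	{
		/* the head records the lock so CONFIG_DEBUG_PI_LIST checks work */
		plist_head_init(&wait_list, &wait_lock);
	}

	static void wait_list_add(struct waiter *w, int prio)
	{
		plist_node_init(&w->node, prio);
		spin_lock(&wait_lock);
		plist_add(&w->node, &wait_list);
		spin_unlock(&wait_lock);
	}

	static struct waiter *wait_list_take_top(void)
	{
		struct waiter *w = NULL;

		spin_lock(&wait_lock);
		if (!plist_head_empty(&wait_list)) {
			struct plist_node *top = plist_first(&wait_list);

			plist_del(top, &wait_list);
			w = container_of(top, struct waiter, node);
		}
		spin_unlock(&wait_lock);
		return w;
	}

This is the pattern the rt-mutex waiter lists in this series build on.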
+ * + * @node: &struct plist_node pointer - entry to be removed + * @head: &struct plist_head pointer - list head + */ +void plist_del(struct plist_node *node, struct plist_head *head) +{ + plist_check_head(head); + + if (!list_empty(&node->plist.prio_list)) { + struct plist_node *next = plist_first(&node->plist); + + list_move_tail(&next->plist.prio_list, &node->plist.prio_list); + list_del_init(&node->plist.prio_list); + } + + list_del_init(&node->plist.node_list); + + plist_check_head(head); +} diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index 02a16eacb72..d84560c076d 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -63,10 +63,10 @@ bytes, which is the maximum length that can be coded. inflate_fast() requires strm->avail_out >= 258 for each loop to avoid checking for output space. + + - @start: inflate()'s starting value for strm->avail_out */ -void inflate_fast(strm, start) -z_streamp strm; -unsigned start; /* inflate()'s starting value for strm->avail_out */ +void inflate_fast(z_streamp strm, unsigned start) { struct inflate_state *state; unsigned char *in; /* local strm->next_in */ diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c index da665fbb16a..3fe6ce5b53e 100644 --- a/lib/zlib_inflate/inftrees.c +++ b/lib/zlib_inflate/inftrees.c @@ -20,13 +20,8 @@ table index bits. It will differ if the request is greater than the longest code or if it is less than the shortest code. */ -int zlib_inflate_table(type, lens, codes, table, bits, work) -codetype type; -unsigned short *lens; -unsigned codes; -code **table; -unsigned *bits; -unsigned short *work; +int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes, + code **table, unsigned *bits, unsigned short *work) { unsigned len; /* a code's length in bits */ unsigned sym; /* index of code symbols */ diff --git a/mm/Kconfig b/mm/Kconfig index 66e65ab3942..e76c023eb0b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -116,6 +116,7 @@ config SPARSEMEM_EXTREME config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + depends on (IA64 || X86 || PPC64) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND diff --git a/mm/filemap.c b/mm/filemap.c index 9c7334bafda..d504d6e9888 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2095,14 +2095,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, do { unsigned long index; unsigned long offset; - unsigned long maxlen; size_t copied; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; + + /* Limit the size of the copy to the caller's write size */ + bytes = min(bytes, count); + + /* + * Limit the size of the copy to that of the current segment, + * because fault_in_pages_readable() doesn't know how to walk + * segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); /* * Bring in the user page that we will copy from _first_. @@ -2110,10 +2117,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * same page as we're writing to, without it being marked * up-to-date. 
*/ - maxlen = cur_iov->iov_len - iov_base; - if (maxlen > bytes) - maxlen = bytes; - fault_in_pages_readable(buf, maxlen); + fault_in_pages_readable(buf, bytes); page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 841a077d5ae..ea4038838b0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -21,6 +21,7 @@ #include <linux/memory_hotplug.h> #include <linux/highmem.h> #include <linux/vmalloc.h> +#include <linux/ioport.h> #include <asm/tlbflush.h> @@ -126,6 +127,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) unsigned long i; unsigned long flags; unsigned long onlined_pages = 0; + struct resource res; + u64 section_end; + unsigned long start_pfn; struct zone *zone; int need_zonelists_rebuild = 0; @@ -148,10 +152,27 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (!populated_zone(zone)) need_zonelists_rebuild = 1; - for (i = 0; i < nr_pages; i++) { - struct page *page = pfn_to_page(pfn + i); - online_page(page); - onlined_pages++; + res.start = (u64)pfn << PAGE_SHIFT; + res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; + res.flags = IORESOURCE_MEM; /* we just need system ram */ + section_end = res.end; + + while (find_next_system_ram(&res) >= 0) { + start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); + nr_pages = (unsigned long) + ((res.end + 1 - res.start) >> PAGE_SHIFT); + + if (PageReserved(pfn_to_page(start_pfn))) { + /* this region's page is not onlined now */ + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(start_pfn + i); + online_page(page); + onlined_pages++; + } + } + + res.start = res.end + 1; + res.end = section_end; } zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; @@ -163,3 +184,100 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) vm_total_pages = nr_free_pagecache_pages(); return 0; } + +static pg_data_t *hotadd_new_pgdat(int nid, u64 start) +{ + struct pglist_data *pgdat; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; + unsigned long start_pfn = start >> PAGE_SHIFT; + + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) + return NULL; + + arch_refresh_nodedata(nid, pgdat); + + /* we can use NODE_DATA(nid) from here */ + + /* init node's zones as empty zones, we don't have any present pages.*/ + free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); + + return pgdat; +} + +static void rollback_node_hotadd(int nid, pg_data_t *pgdat) +{ + arch_refresh_nodedata(nid, NULL); + arch_free_nodedata(pgdat); + return; +} + +/* add this memory to iomem resource */ +static void register_memory_resource(u64 start, u64 size) +{ + struct resource *res; + + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(!res); + + res->name = "System RAM"; + res->start = start; + res->end = start + size - 1; + res->flags = IORESOURCE_MEM; + if (request_resource(&iomem_resource, res) < 0) { + printk("System RAM resource %llx - %llx cannot be added\n", + (unsigned long long)res->start, (unsigned long long)res->end); + kfree(res); + } +} + + + +int add_memory(int nid, u64 start, u64 size) +{ + pg_data_t *pgdat = NULL; + int new_pgdat = 0; + int ret; + + if (!node_online(nid)) { + pgdat = hotadd_new_pgdat(nid, start); + if (!pgdat) + return -ENOMEM; + new_pgdat = 1; + ret = kswapd_run(nid); + if (ret) + goto error; + } + + /* call arch's memory hotadd */ + ret = arch_add_memory(nid, start, size); + + if (ret < 0) + goto error; + 
+ /* we online node here. we can't roll back from here. */ + node_set_online(nid); + + if (new_pgdat) { + ret = register_one_node(nid); + /* + * If sysfs file of new node can't create, cpu on the node + * can't be hot-added. There is no rollback way now. + * So, check by BUG_ON() to catch it reluctantly.. + */ + BUG_ON(ret); + } + + /* register this memory as resource */ + register_memory_resource(start, size); + + return ret; +error: + /* rollback pgdat allocation and others */ + if (new_pgdat) + rollback_node_hotadd(nid, pgdat); + + return ret; +} +EXPORT_SYMBOL_GPL(add_memory); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8ccf6f1b147..4ec7026c7ba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -516,14 +516,14 @@ static void set_ratelimit(void) ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; } -static int +static int __cpuinit ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) { set_ratelimit(); return 0; } -static struct notifier_block ratelimit_nb = { +static struct notifier_block __cpuinitdata ratelimit_nb = { .notifier_call = ratelimit_handler, .next = NULL, }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f86191bb63..084a2de7e52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -446,8 +446,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); if (!PageHighMem(page)) - mutex_debug_check_no_locks_freed(page_address(page), - PAGE_SIZE<<order); + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE<<order); for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); @@ -2009,7 +2009,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int pageset_cpuup_callback(struct notifier_block *nfb, +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -2031,7 +2031,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb, return ret; } -static struct notifier_block pageset_notifier = +static struct notifier_block __cpuinitdata pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; void __init setup_per_cpu_pageset(void) diff --git a/mm/slab.c b/mm/slab.c index 98ac20bc0de..233e39d14ca 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -89,6 +89,7 @@ #include <linux/config.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/poison.h> #include <linux/swap.h> #include <linux/cache.h> #include <linux/interrupt.h> @@ -106,6 +107,7 @@ #include <linux/nodemask.h> #include <linux/mempolicy.h> #include <linux/mutex.h> +#include <linux/rtmutex.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -492,17 +494,6 @@ struct kmem_cache { #endif #if DEBUG -/* - * Magic nums for obj red zoning. - * Placed in the first word before and the first word after an obj. 
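add_memory() above is the single entry point for memory hot-add introduced here: for a previously offline node it allocates a pg_data_t and starts kswapd, then calls the architecture's arch_add_memory(), marks the node online, registers it with sysfs and records the range as a "System RAM" resource. A caller, typically platform or ACPI hot-add code, would use it roughly as below; the nid, start and size values are placeholders, not values from the patch, and the declaration is assumed to live in <linux/memory_hotplug.h>.

	#include <linux/kernel.h>
	#include <linux/memory_hotplug.h>

	/* Illustrative only: react to a hot-added range [start, start + size). */
	static int example_probe_memory(int nid, u64 start, u64 size)
	{
		int ret;

		ret = add_memory(nid, start, size);
		if (ret)
			printk(KERN_ERR "hot-add of %llu bytes at %llx failed: %d\n",
			       (unsigned long long)size,
			       (unsigned long long)start, ret);
		return ret;
	}

Note that this only makes the new sections known to the kernel; the pages still have to be onlined separately through online_pages(), whose changes appear earlier in this patch.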
- */ -#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ -#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ - -/* ...and for poisoning */ -#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ -#define POISON_FREE 0x6b /* for use-after-free poisoning */ -#define POISON_END 0xa5 /* end-byte of poisoning */ /* * memory layout of objects: @@ -1083,7 +1074,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) #endif -static int cpuup_callback(struct notifier_block *nfb, +static int __devinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1265,7 +1256,9 @@ bad: return NOTIFY_BAD; } -static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; +static struct notifier_block __cpuinitdata cpucache_notifier = { + &cpuup_callback, NULL, 0 +}; /* * swap the static kmem_list3 with kmalloced memory @@ -3405,7 +3398,7 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); - mutex_debug_check_no_locks_freed(objp, obj_size(c)); + debug_check_no_locks_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); } diff --git a/mm/sparse.c b/mm/sparse.c index e0a3fe48aa3..c7a2b3a0e46 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -45,7 +45,7 @@ static struct mem_section *sparse_index_alloc(int nid) static int sparse_index_init(unsigned long section_nr, int nid) { - static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(index_init_lock); unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; int ret = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 72babac71de..eeacb0d695c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -34,6 +34,7 @@ #include <linux/notifier.h> #include <linux/rwsem.h> #include <linux/delay.h> +#include <linux/kthread.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -1223,7 +1224,6 @@ static int kswapd(void *p) }; cpumask_t cpumask; - daemonize("kswapd%d", pgdat->node_id); cpumask = node_to_cpumask(pgdat->node_id); if (!cpus_empty(cpumask)) set_cpus_allowed(tsk, cpumask); @@ -1450,7 +1450,7 @@ out: not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int cpu_callback(struct notifier_block *nfb, +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { pg_data_t *pgdat; @@ -1468,20 +1468,35 @@ static int cpu_callback(struct notifier_block *nfb, } #endif /* CONFIG_HOTPLUG_CPU */ +/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
+ */ +int kswapd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kswapd) + return 0; + + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; + } + return ret; +} + static int __init kswapd_init(void) { - pg_data_t *pgdat; + int nid; swap_setup(); - for_each_online_pgdat(pgdat) { - pid_t pid; - - pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); - BUG_ON(pid < 0); - read_lock(&tasklist_lock); - pgdat->kswapd = find_task_by_pid(pid); - read_unlock(&tasklist_lock); - } + for_each_online_node(nid) + kswapd_run(nid); hotcpu_notifier(cpu_callback, 0); return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8a777932786..e728980160d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -349,7 +349,7 @@ static struct rt6_info *rt6_select(struct rt6_info **head, int oif, (strict & RT6_SELECT_F_REACHABLE) && last && last != rt0) { /* no entries matched; do round-robin */ - static spinlock_t lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(lock); spin_lock(&lock); *head = rt0->u.next; rt0->u.next = last->u.next; diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index f43311221a7..2f312164d6d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -70,7 +70,7 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(krb5_seq_lock); u32 gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 54128040a12..1bb75703f38 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -117,7 +117,7 @@ struct bclink { static struct bcbearer *bcbearer = NULL; static struct bclink *bclink = NULL; static struct link *bcl = NULL; -static spinlock_t bc_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(bc_lock); char tipc_bclink_name[] = "multicast-link"; @@ -796,7 +796,7 @@ int tipc_bclink_init(void) memset(bclink, 0, sizeof(struct bclink)); INIT_LIST_HEAD(&bcl->waiting_ports); bcl->next_out_no = 1; - bclink->node.lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&bclink->node.lock); bcl->owner = &bclink->node; bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 4fa24b5e891..7ef17a449cf 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -566,7 +566,7 @@ restart: b_ptr->link_req = tipc_disc_init_link_req(b_ptr, &m_ptr->bcast_addr, bcast_scope, 2); } - b_ptr->publ.lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&b_ptr->publ.lock); write_unlock_bh(&tipc_net_lock); info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, addr_string_fill(addr_string, bcast_scope), priority); diff --git a/net/tipc/config.c b/net/tipc/config.c index 3ec502fac8c..285e1bc2d88 100644 --- a/net/tipc/config.c +++ b/net/tipc/config.c @@ -63,7 +63,7 @@ struct manager { static struct manager mng = { 0}; -static spinlock_t config_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(config_lock); static const void *req_tlv_area; /* request message TLV area */ static int req_tlv_space; /* request message TLV area size */ diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c index 26ef95d5fe3..55130655e1e 100644 --- a/net/tipc/dbg.c +++ b/net/tipc/dbg.c @@ -41,7 +41,7 @@ #define MAX_STRING 512 static char print_string[MAX_STRING]; -static 
spinlock_t print_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(print_lock); static struct print_buf cons_buf = { NULL, 0, NULL, NULL }; struct print_buf *TIPC_CONS = &cons_buf; diff --git a/net/tipc/handler.c b/net/tipc/handler.c index 966f70a1b60..ae6ddf00a1a 100644 --- a/net/tipc/handler.c +++ b/net/tipc/handler.c @@ -44,7 +44,7 @@ struct queue_item { static kmem_cache_t *tipc_queue_item_cache; static struct list_head signal_queue_head; -static spinlock_t qitem_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(qitem_lock); static int handler_enabled = 0; static void process_signal_queue(unsigned long dummy); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 38571306aba..a6926ff07bc 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -101,7 +101,7 @@ struct name_table { static struct name_table table = { NULL } ; static atomic_t rsv_publ_ok = ATOMIC_INIT(0); -rwlock_t tipc_nametbl_lock = RW_LOCK_UNLOCKED; +DEFINE_RWLOCK(tipc_nametbl_lock); static int hash(int x) @@ -172,7 +172,7 @@ static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_hea } memset(nseq, 0, sizeof(*nseq)); - nseq->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&nseq->lock); nseq->type = type; nseq->sseqs = sseq; dbg("tipc_nameseq_create(): nseq = %p, type %u, ssseqs %p, ff: %u\n", diff --git a/net/tipc/net.c b/net/tipc/net.c index f7c8223ddf7..e5a359ab493 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -115,7 +115,7 @@ * - A local spin_lock protecting the queue of subscriber events. */ -rwlock_t tipc_net_lock = RW_LOCK_UNLOCKED; +DEFINE_RWLOCK(tipc_net_lock); struct network tipc_net = { NULL }; struct node *tipc_net_select_remote_node(u32 addr, u32 ref) diff --git a/net/tipc/node.c b/net/tipc/node.c index ce9678efa98..861322b935d 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -77,7 +77,7 @@ struct node *tipc_node_create(u32 addr) memset(n_ptr, 0, sizeof(*n_ptr)); n_ptr->addr = addr; - n_ptr->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&n_ptr->lock); INIT_LIST_HEAD(&n_ptr->nsub); n_ptr->owner = c_ptr; tipc_cltr_attach_node(c_ptr, n_ptr); diff --git a/net/tipc/port.c b/net/tipc/port.c index 47d97404e3e..3251c8d8e53 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -57,8 +57,8 @@ static struct sk_buff *msg_queue_head = NULL; static struct sk_buff *msg_queue_tail = NULL; -spinlock_t tipc_port_list_lock = SPIN_LOCK_UNLOCKED; -static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(tipc_port_list_lock); +static DEFINE_SPINLOCK(queue_lock); static LIST_HEAD(ports); static void port_handle_node_down(unsigned long ref); diff --git a/net/tipc/ref.c b/net/tipc/ref.c index d2f0cce10e2..596d3c8ff75 100644 --- a/net/tipc/ref.c +++ b/net/tipc/ref.c @@ -63,7 +63,7 @@ struct ref_table tipc_ref_table = { NULL }; -static rwlock_t ref_table_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(ref_table_lock); /** * tipc_ref_table_init - create reference table for objects @@ -87,7 +87,7 @@ int tipc_ref_table_init(u32 requested_size, u32 start) index_mask = sz - 1; for (i = sz - 1; i >= 0; i--) { table[i].object = NULL; - table[i].lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&table[i].lock); table[i].data.next_plus_upper = (start & ~index_mask) + i - 1; } tipc_ref_table.entries = table; diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index fc171875660..e19b4bcd67e 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -457,7 +457,7 @@ int tipc_subscr_start(void) int res = -1; memset(&topsrv, 0, sizeof (topsrv)); - topsrv.lock = SPIN_LOCK_UNLOCKED; + 
spin_lock_init(&topsrv.lock); INIT_LIST_HEAD(&topsrv.subscriber_list); spin_lock_bh(&topsrv.lock); diff --git a/net/tipc/user_reg.c b/net/tipc/user_reg.c index 3f3f933976e..1e3ae57c722 100644 --- a/net/tipc/user_reg.c +++ b/net/tipc/user_reg.c @@ -67,7 +67,7 @@ struct tipc_user { static struct tipc_user *users = NULL; static u32 next_free_user = MAX_USERID + 1; -static spinlock_t reg_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(reg_lock); /** * reg_init - create TIPC user registry (but don't activate it) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index ac5f275b028..b0d067be739 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -13,11 +13,6 @@ space := $(empty) $(empty) depfile = $(subst $(comma),_,$(@D)/.$(@F).d) ### -# basetarget equals the filename of the target with no extension. -# So 'foo/bar.o' becomes 'bar' -basetarget = $(basename $(notdir $@)) - -### # Escape single quote for use in echo statements escsq = $(subst $(squote),'\$(squote)',$1) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 3cb445cc743..02a7eea5fdb 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -117,7 +117,7 @@ $(real-objs-m:.o=.lst): quiet_modtag := [M] $(obj-m) : quiet_modtag := [M] # Default for not multi-part modules -modname = $(basetarget) +modname = $(*F) $(multi-objs-m) : modname = $(modname-multi) $(multi-objs-m:.o=.i) : modname = $(modname-multi) diff --git a/scripts/Makefile.host b/scripts/Makefile.host index 18ecd4d5df7..2b066d12af2 100644 --- a/scripts/Makefile.host +++ b/scripts/Makefile.host @@ -80,10 +80,8 @@ obj-dirs += $(host-objdirs) ##### # Handle options to gcc. Support building with separate output directory -_hostc_flags = $(HOSTCFLAGS) $(HOST_EXTRACFLAGS) \ - $(HOSTCFLAGS_$(basetarget).o) -_hostcxx_flags = $(HOSTCXXFLAGS) $(HOST_EXTRACXXFLAGS) \ - $(HOSTCXXFLAGS_$(basetarget).o) +_hostc_flags = $(HOSTCFLAGS) $(HOST_EXTRACFLAGS) $(HOSTCFLAGS_$(*F).o) +_hostcxx_flags = $(HOSTCXXFLAGS) $(HOST_EXTRACXXFLAGS) $(HOSTCXXFLAGS_$(*F).o) ifeq ($(KBUILD_SRC),) __hostc_flags = $(_hostc_flags) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index fc498fee68e..2cb4935e85d 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -82,12 +82,12 @@ obj-dirs := $(addprefix $(obj)/,$(obj-dirs)) # than one module. In that case KBUILD_MODNAME will be set to foo_bar, # where foo and bar are the name of the modules. 
name-fix = $(subst $(comma),_,$(subst -,_,$1)) -basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))" +basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(*F)))" modname_flags = $(if $(filter 1,$(words $(modname))),\ -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))") -_c_flags = $(CFLAGS) $(EXTRA_CFLAGS) $(CFLAGS_$(basetarget).o) -_a_flags = $(AFLAGS) $(EXTRA_AFLAGS) $(AFLAGS_$(basetarget).o) +_c_flags = $(CFLAGS) $(EXTRA_CFLAGS) $(CFLAGS_$(*F).o) +_a_flags = $(AFLAGS) $(EXTRA_AFLAGS) $(AFLAGS_$(*F).o) _cpp_flags = $(CPPFLAGS) $(EXTRA_CPPFLAGS) $(CPPFLAGS_$(@F)) # If building the kernel in a separate objtree expand all occurrences diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index e83613e0e82..576cce5e387 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -72,7 +72,7 @@ $(modules:.ko=.mod.c): __modpost ; # Step 5), compile all *.mod.c files # modname is set to make c_flags define KBUILD_MODNAME -modname = $(basetarget) +modname = $(*F) quiet_cmd_cc_o_c = CC $@ cmd_cc_o_c = $(CC) $(c_flags) $(CFLAGS_MODULE) \ diff --git a/scripts/rt-tester/check-all.sh b/scripts/rt-tester/check-all.sh new file mode 100644 index 00000000000..43098afe743 --- /dev/null +++ b/scripts/rt-tester/check-all.sh @@ -0,0 +1,22 @@ + + +function testit () +{ + printf "%-30s: " $1 + ./rt-tester.py $1 | grep Pass +} + +testit t2-l1-2rt-sameprio.tst +testit t2-l1-pi.tst +testit t2-l1-signal.tst +#testit t2-l2-2rt-deadlock.tst +testit t3-l1-pi-1rt.tst +testit t3-l1-pi-2rt.tst +testit t3-l1-pi-3rt.tst +testit t3-l1-pi-signal.tst +testit t3-l1-pi-steal.tst +testit t3-l2-pi.tst +testit t4-l2-pi-deboost.tst +testit t5-l4-pi-boost-deboost.tst +testit t5-l4-pi-boost-deboost-setsched.tst + diff --git a/scripts/rt-tester/rt-tester.py b/scripts/rt-tester/rt-tester.py new file mode 100644 index 00000000000..4c79660793c --- /dev/null +++ b/scripts/rt-tester/rt-tester.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python +# +# rt-mutex tester +# +# (C) 2006 Thomas Gleixner <tglx@linutronix.de> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+# +import os +import sys +import getopt +import shutil +import string + +# Globals +quiet = 0 +test = 0 +comments = 0 + +sysfsprefix = "/sys/devices/system/rttest/rttest" +statusfile = "/status" +commandfile = "/command" + +# Command opcodes +cmd_opcodes = { + "schedother" : "1", + "schedfifo" : "2", + "lock" : "3", + "locknowait" : "4", + "lockint" : "5", + "lockintnowait" : "6", + "lockcont" : "7", + "unlock" : "8", + "lockbkl" : "9", + "unlockbkl" : "10", + "signal" : "11", + "resetevent" : "98", + "reset" : "99", + } + +test_opcodes = { + "prioeq" : ["P" , "eq" , None], + "priolt" : ["P" , "lt" , None], + "priogt" : ["P" , "gt" , None], + "nprioeq" : ["N" , "eq" , None], + "npriolt" : ["N" , "lt" , None], + "npriogt" : ["N" , "gt" , None], + "unlocked" : ["M" , "eq" , 0], + "trylock" : ["M" , "eq" , 1], + "blocked" : ["M" , "eq" , 2], + "blockedwake" : ["M" , "eq" , 3], + "locked" : ["M" , "eq" , 4], + "opcodeeq" : ["O" , "eq" , None], + "opcodelt" : ["O" , "lt" , None], + "opcodegt" : ["O" , "gt" , None], + "eventeq" : ["E" , "eq" , None], + "eventlt" : ["E" , "lt" , None], + "eventgt" : ["E" , "gt" , None], + } + +# Print usage information +def usage(): + print "rt-tester.py <-c -h -q -t> <testfile>" + print " -c display comments after first command" + print " -h help" + print " -q quiet mode" + print " -t test mode (syntax check)" + print " testfile: read test specification from testfile" + print " otherwise from stdin" + return + +# Print progress when not in quiet mode +def progress(str): + if not quiet: + print str + +# Analyse a status value +def analyse(val, top, arg): + + intval = int(val) + + if top[0] == "M": + intval = intval / (10 ** int(arg)) + intval = intval % 10 + argval = top[2] + elif top[0] == "O": + argval = int(cmd_opcodes.get(arg, arg)) + else: + argval = int(arg) + + # progress("%d %s %d" %(intval, top[1], argval)) + + if top[1] == "eq" and intval == argval: + return 1 + if top[1] == "lt" and intval < argval: + return 1 + if top[1] == "gt" and intval > argval: + return 1 + return 0 + +# Parse the commandline +try: + (options, arguments) = getopt.getopt(sys.argv[1:],'chqt') +except getopt.GetoptError, ex: + usage() + sys.exit(1) + +# Parse commandline options +for option, value in options: + if option == "-c": + comments = 1 + elif option == "-q": + quiet = 1 + elif option == "-t": + test = 1 + elif option == '-h': + usage() + sys.exit(0) + +# Select the input source +if arguments: + try: + fd = open(arguments[0]) + except Exception,ex: + sys.stderr.write("File not found %s\n" %(arguments[0])) + sys.exit(1) +else: + fd = sys.stdin + +linenr = 0 + +# Read the test patterns +while 1: + + linenr = linenr + 1 + line = fd.readline() + if not len(line): + break + + line = line.strip() + parts = line.split(":") + + if not parts or len(parts) < 1: + continue + + if len(parts[0]) == 0: + continue + + if parts[0].startswith("#"): + if comments > 1: + progress(line) + continue + + if comments == 1: + comments = 2 + + progress(line) + + cmd = parts[0].strip().lower() + opc = parts[1].strip().lower() + tid = parts[2].strip() + dat = parts[3].strip() + + try: + # Test or wait for a status value + if cmd == "t" or cmd == "w": + testop = test_opcodes[opc] + + fname = "%s%s%s" %(sysfsprefix, tid, statusfile) + if test: + print fname + continue + + while 1: + query = 1 + fsta = open(fname, 'r') + status = fsta.readline().strip() + fsta.close() + stat = status.split(",") + for s in stat: + s = s.strip() + if s.startswith(testop[0]): + # Seperate status value + val = s[2:].strip() 
+ query = analyse(val, testop, dat) + break + if query or cmd == "t": + break + + progress(" " + status) + + if not query: + sys.stderr.write("Test failed in line %d\n" %(linenr)) + sys.exit(1) + + # Issue a command to the tester + elif cmd == "c": + cmdnr = cmd_opcodes[opc] + # Build command string and sys filename + cmdstr = "%s:%s" %(cmdnr, dat) + fname = "%s%s%s" %(sysfsprefix, tid, commandfile) + if test: + print fname + continue + fcmd = open(fname, 'w') + fcmd.write(cmdstr) + fcmd.close() + + except Exception,ex: + sys.stderr.write(str(ex)) + sys.stderr.write("\nSyntax error in line %d\n" %(linenr)) + if not test: + fd.close() + sys.exit(1) + +# Normal exit pass +print "Pass" +sys.exit(0) + + diff --git a/scripts/rt-tester/t2-l1-2rt-sameprio.tst b/scripts/rt-tester/t2-l1-2rt-sameprio.tst new file mode 100644 index 00000000000..8821f27cc8b --- /dev/null +++ b/scripts/rt-tester/t2-l1-2rt-sameprio.tst @@ -0,0 +1,99 @@ +# +# RT-Mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal 0 +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 2 threads 1 lock +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedfifo: 0: 80 +C: schedfifo: 1: 80 + +# T0 lock L0 +C: locknowait: 0: 0 +C: locknowait: 1: 0 +W: locked: 0: 0 +W: blocked: 1: 0 +T: prioeq: 0: 80 + +# T0 unlock L0 +C: unlock: 0: 0 +W: locked: 1: 0 + +# Verify T0 +W: unlocked: 0: 0 +T: prioeq: 0: 80 + +# Unlock +C: unlock: 1: 0 +W: unlocked: 1: 0 + +# T1,T0 lock L0 +C: locknowait: 1: 0 +C: locknowait: 0: 0 +W: locked: 1: 0 +W: blocked: 0: 0 +T: prioeq: 1: 80 + +# T1 unlock L0 +C: unlock: 1: 0 +W: locked: 0: 0 + +# Verify T1 +W: unlocked: 1: 0 +T: prioeq: 1: 80 + +# Unlock and exit +C: unlock: 0: 0 +W: unlocked: 0: 0 + diff --git a/scripts/rt-tester/t2-l1-pi.tst b/scripts/rt-tester/t2-l1-pi.tst new file mode 100644 index 00000000000..cde1f189a02 --- /dev/null +++ b/scripts/rt-tester/t2-l1-pi.tst @@ -0,0 +1,82 @@ +# +# RT-Mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal 0 +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# 
lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 2 threads 1 lock with priority inversion +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedfifo: 1: 80 + +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L0 +C: locknowait: 1: 0 +W: blocked: 1: 0 +T: prioeq: 0: 80 + +# T0 unlock L0 +C: unlock: 0: 0 +W: locked: 1: 0 + +# Verify T1 +W: unlocked: 0: 0 +T: priolt: 0: 1 + +# Unlock and exit +C: unlock: 1: 0 +W: unlocked: 1: 0 + diff --git a/scripts/rt-tester/t2-l1-signal.tst b/scripts/rt-tester/t2-l1-signal.tst new file mode 100644 index 00000000000..3ab0bfc4995 --- /dev/null +++ b/scripts/rt-tester/t2-l1-signal.tst @@ -0,0 +1,77 @@ +# +# RT-Mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal 0 +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 2 threads 1 lock with priority inversion +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedother: 1: 0 + +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L0 +C: lockintnowait: 1: 0 +W: blocked: 1: 0 + +# Interrupt T1 +C: signal: 1: 0 +W: unlocked: 1: 0 +T: opcodeeq: 1: -4 + +# Unlock and exit +C: unlock: 0: 0 +W: unlocked: 0: 0 diff --git a/scripts/rt-tester/t2-l2-2rt-deadlock.tst b/scripts/rt-tester/t2-l2-2rt-deadlock.tst new file mode 100644 index 00000000000..f4b5d5d6215 --- /dev/null +++ b/scripts/rt-tester/t2-l2-2rt-deadlock.tst @@ -0,0 +1,89 @@ +# +# RT-Mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal 0 +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 2 threads 2 lock +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedfifo: 0: 80 +C: schedfifo: 1: 80 + +# 
T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L1 +C: locknowait: 1: 1 +W: locked: 1: 1 + +# T0 lock L1 +C: lockintnowait: 0: 1 +W: blocked: 0: 1 + +# T1 lock L0 +C: lockintnowait: 1: 0 +W: blocked: 1: 0 + +# Make deadlock go away +C: signal: 1: 0 +W: unlocked: 1: 0 +C: signal: 0: 0 +W: unlocked: 0: 1 + +# Unlock and exit +C: unlock: 0: 0 +W: unlocked: 0: 0 +C: unlock: 1: 1 +W: unlocked: 1: 1 + diff --git a/scripts/rt-tester/t3-l1-pi-1rt.tst b/scripts/rt-tester/t3-l1-pi-1rt.tst new file mode 100644 index 00000000000..63440ca2cce --- /dev/null +++ b/scripts/rt-tester/t3-l1-pi-1rt.tst @@ -0,0 +1,92 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 3 threads 1 lock PI +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedother: 1: 0 +C: schedfifo: 2: 82 + +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L0 +C: locknowait: 1: 0 +W: blocked: 1: 0 +T: priolt: 0: 1 + +# T2 lock L0 +C: locknowait: 2: 0 +W: blocked: 2: 0 +T: prioeq: 0: 82 + +# T0 unlock L0 +C: unlock: 0: 0 + +# Wait until T2 got the lock +W: locked: 2: 0 +W: unlocked: 0: 0 +T: priolt: 0: 1 + +# T2 unlock L0 +C: unlock: 2: 0 + +W: unlocked: 2: 0 +W: locked: 1: 0 + +C: unlock: 1: 0 +W: unlocked: 1: 0 diff --git a/scripts/rt-tester/t3-l1-pi-2rt.tst b/scripts/rt-tester/t3-l1-pi-2rt.tst new file mode 100644 index 00000000000..e5816fe67df --- /dev/null +++ b/scripts/rt-tester/t3-l1-pi-2rt.tst @@ -0,0 +1,93 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 3 threads 1 lock PI +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedfifo: 1: 81 +C: schedfifo: 2: 82 
+ +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L0 +C: locknowait: 1: 0 +W: blocked: 1: 0 +T: prioeq: 0: 81 + +# T2 lock L0 +C: locknowait: 2: 0 +W: blocked: 2: 0 +T: prioeq: 0: 82 +T: prioeq: 1: 81 + +# T0 unlock L0 +C: unlock: 0: 0 + +# Wait until T2 got the lock +W: locked: 2: 0 +W: unlocked: 0: 0 +T: priolt: 0: 1 + +# T2 unlock L0 +C: unlock: 2: 0 + +W: unlocked: 2: 0 +W: locked: 1: 0 + +C: unlock: 1: 0 +W: unlocked: 1: 0 diff --git a/scripts/rt-tester/t3-l1-pi-3rt.tst b/scripts/rt-tester/t3-l1-pi-3rt.tst new file mode 100644 index 00000000000..718b82b5d3b --- /dev/null +++ b/scripts/rt-tester/t3-l1-pi-3rt.tst @@ -0,0 +1,92 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 3 threads 1 lock PI +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedfifo: 0: 80 +C: schedfifo: 1: 81 +C: schedfifo: 2: 82 + +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L0 +C: locknowait: 1: 0 +W: blocked: 1: 0 +T: prioeq: 0: 81 + +# T2 lock L0 +C: locknowait: 2: 0 +W: blocked: 2: 0 +T: prioeq: 0: 82 + +# T0 unlock L0 +C: unlock: 0: 0 + +# Wait until T2 got the lock +W: locked: 2: 0 +W: unlocked: 0: 0 +T: prioeq: 0: 80 + +# T2 unlock L0 +C: unlock: 2: 0 + +W: locked: 1: 0 +W: unlocked: 2: 0 + +C: unlock: 1: 0 +W: unlocked: 1: 0 diff --git a/scripts/rt-tester/t3-l1-pi-signal.tst b/scripts/rt-tester/t3-l1-pi-signal.tst new file mode 100644 index 00000000000..c6e21356349 --- /dev/null +++ b/scripts/rt-tester/t3-l1-pi-signal.tst @@ -0,0 +1,98 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# Reset event counter +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set priorities +C: schedother: 0: 0 
+C: schedfifo: 1: 80 +C: schedfifo: 2: 81 + +# T0 lock L0 +C: lock: 0: 0 +W: locked: 0: 0 + +# T1 lock L0, no wait in the wakeup path +C: locknowait: 1: 0 +W: blocked: 1: 0 +T: prioeq: 0: 80 +T: prioeq: 1: 80 + +# T2 lock L0 interruptible, no wait in the wakeup path +C: lockintnowait: 2: 0 +W: blocked: 2: 0 +T: prioeq: 0: 81 +T: prioeq: 1: 80 + +# Interrupt T2 +C: signal: 2: 2 +W: unlocked: 2: 0 +T: prioeq: 1: 80 +T: prioeq: 0: 80 + +T: locked: 0: 0 +T: blocked: 1: 0 + +# T0 unlock L0 +C: unlock: 0: 0 + +# Wait until T1 has locked L0 and exit +W: locked: 1: 0 +W: unlocked: 0: 0 +T: priolt: 0: 1 + +C: unlock: 1: 0 +W: unlocked: 1: 0 + + + diff --git a/scripts/rt-tester/t3-l1-pi-steal.tst b/scripts/rt-tester/t3-l1-pi-steal.tst new file mode 100644 index 00000000000..f53749d59d7 --- /dev/null +++ b/scripts/rt-tester/t3-l1-pi-steal.tst @@ -0,0 +1,96 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 3 threads 1 lock PI steal pending ownership +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedfifo: 1: 80 +C: schedfifo: 2: 81 + +# T0 lock L0 +C: lock: 0: 0 +W: locked: 0: 0 + +# T1 lock L0 +C: lock: 1: 0 +W: blocked: 1: 0 +T: prioeq: 0: 80 + +# T0 unlock L0 +C: unlock: 0: 0 + +# Wait until T1 is in the wakeup loop +W: blockedwake: 1: 0 +T: priolt: 0: 1 + +# T2 lock L0 +C: lock: 2: 0 +# T1 leave wakeup loop +C: lockcont: 1: 0 + +# T2 must have the lock and T1 must be blocked +W: locked: 2: 0 +W: blocked: 1: 0 + +# T2 unlock L0 +C: unlock: 2: 0 + +# Wait until T1 is in the wakeup loop and let it run +W: blockedwake: 1: 0 +C: lockcont: 1: 0 +W: locked: 1: 0 +C: unlock: 1: 0 +W: unlocked: 1: 0 diff --git a/scripts/rt-tester/t3-l2-pi.tst b/scripts/rt-tester/t3-l2-pi.tst new file mode 100644 index 00000000000..cdc3e4fd7ba --- /dev/null +++ b/scripts/rt-tester/t3-l2-pi.tst @@ -0,0 +1,92 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake 
lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 3 threads 2 lock PI +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedother: 1: 0 +C: schedfifo: 2: 82 + +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L0 +C: locknowait: 1: 0 +W: blocked: 1: 0 +T: priolt: 0: 1 + +# T2 lock L0 +C: locknowait: 2: 0 +W: blocked: 2: 0 +T: prioeq: 0: 82 + +# T0 unlock L0 +C: unlock: 0: 0 + +# Wait until T2 got the lock +W: locked: 2: 0 +W: unlocked: 0: 0 +T: priolt: 0: 1 + +# T2 unlock L0 +C: unlock: 2: 0 + +W: unlocked: 2: 0 +W: locked: 1: 0 + +C: unlock: 1: 0 +W: unlocked: 1: 0 diff --git a/scripts/rt-tester/t4-l2-pi-deboost.tst b/scripts/rt-tester/t4-l2-pi-deboost.tst new file mode 100644 index 00000000000..baa14137f47 --- /dev/null +++ b/scripts/rt-tester/t4-l2-pi-deboost.tst @@ -0,0 +1,123 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 4 threads 2 lock PI +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedother: 1: 0 +C: schedfifo: 2: 82 +C: schedfifo: 3: 83 + +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L1 +C: locknowait: 1: 1 +W: locked: 1: 1 + +# T3 lock L0 +C: lockintnowait: 3: 0 +W: blocked: 3: 0 +T: prioeq: 0: 83 + +# T0 lock L1 +C: lock: 0: 1 +W: blocked: 0: 1 +T: prioeq: 1: 83 + +# T1 unlock L1 +C: unlock: 1: 1 + +# Wait until T0 is in the wakeup code +W: blockedwake: 0: 1 + +# Verify that T1 is unboosted +W: unlocked: 1: 1 +T: priolt: 1: 1 + +# T2 lock L1 (T0 is boosted and pending owner !) 
+C: locknowait: 2: 1 +W: blocked: 2: 1 +T: prioeq: 0: 83 + +# Interrupt T3 and wait until T3 returned +C: signal: 3: 0 +W: unlocked: 3: 0 + +# Verify prio of T0 (still pending owner, +# but T2 is enqueued due to the previous boost by T3 +T: prioeq: 0: 82 + +# Let T0 continue +C: lockcont: 0: 1 +W: locked: 0: 1 + +# Unlock L1 and let T2 get L1 +C: unlock: 0: 1 +W: locked: 2: 1 + +# Verify that T0 is unboosted +W: unlocked: 0: 1 +T: priolt: 0: 1 + +# Unlock everything and exit +C: unlock: 2: 1 +W: unlocked: 2: 1 + +C: unlock: 0: 0 +W: unlocked: 0: 0 + diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst new file mode 100644 index 00000000000..e6ec0c81b54 --- /dev/null +++ b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst @@ -0,0 +1,183 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 5 threads 4 lock PI - modify priority of blocked threads +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedfifo: 1: 81 +C: schedfifo: 2: 82 +C: schedfifo: 3: 83 +C: schedfifo: 4: 84 + +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L1 +C: locknowait: 1: 1 +W: locked: 1: 1 + +# T1 lock L0 +C: lockintnowait: 1: 0 +W: blocked: 1: 0 +T: prioeq: 0: 81 + +# T2 lock L2 +C: locknowait: 2: 2 +W: locked: 2: 2 + +# T2 lock L1 +C: lockintnowait: 2: 1 +W: blocked: 2: 1 +T: prioeq: 0: 82 +T: prioeq: 1: 82 + +# T3 lock L3 +C: locknowait: 3: 3 +W: locked: 3: 3 + +# T3 lock L2 +C: lockintnowait: 3: 2 +W: blocked: 3: 2 +T: prioeq: 0: 83 +T: prioeq: 1: 83 +T: prioeq: 2: 83 + +# T4 lock L3 +C: lockintnowait: 4: 3 +W: blocked: 4: 3 +T: prioeq: 0: 84 +T: prioeq: 1: 84 +T: prioeq: 2: 84 +T: prioeq: 3: 84 + +# Reduce prio of T4 +C: schedfifo: 4: 80 +T: prioeq: 0: 83 +T: prioeq: 1: 83 +T: prioeq: 2: 83 +T: prioeq: 3: 83 +T: prioeq: 4: 80 + +# Increase prio of T4 +C: schedfifo: 4: 84 +T: prioeq: 0: 84 +T: prioeq: 1: 84 +T: prioeq: 2: 84 +T: prioeq: 3: 84 +T: prioeq: 4: 84 + +# Reduce prio of T3 +C: schedfifo: 3: 80 +T: prioeq: 0: 84 +T: prioeq: 1: 84 +T: prioeq: 2: 84 +T: prioeq: 3: 84 +T: prioeq: 4: 84 + +# Increase prio of T3 +C: schedfifo: 3: 85 +T: prioeq: 0: 85 +T: prioeq: 1: 85 +T: prioeq: 2: 85 +T: prioeq: 3: 85 +T: prioeq: 4: 84 + +# Reduce prio of T3 +C: schedfifo: 3: 83 +T: prioeq: 0: 84 +T: prioeq: 1: 84 +T: prioeq: 2: 84 +T: prioeq: 3: 84 +T: prioeq: 4: 84 + +# Signal T4 +C: signal: 4: 0 +W: unlocked: 4: 3 +T: prioeq: 0: 83 +T: prioeq: 1: 83 +T: prioeq: 2: 83 +T: prioeq: 3: 83 + +# Signal T3 +C: signal: 3: 0 +W: unlocked: 3: 2 +T: prioeq: 0: 82 +T: prioeq: 
1: 82 +T: prioeq: 2: 82 + +# Signal T2 +C: signal: 2: 0 +W: unlocked: 2: 1 +T: prioeq: 0: 81 +T: prioeq: 1: 81 + +# Signal T1 +C: signal: 1: 0 +W: unlocked: 1: 0 +T: priolt: 0: 1 + +# Unlock and exit +C: unlock: 3: 3 +C: unlock: 2: 2 +C: unlock: 1: 1 +C: unlock: 0: 0 + +W: unlocked: 3: 3 +W: unlocked: 2: 2 +W: unlocked: 1: 1 +W: unlocked: 0: 0 + diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst new file mode 100644 index 00000000000..ca64f8bbf4b --- /dev/null +++ b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst @@ -0,0 +1,143 @@ +# +# rt-mutex test +# +# Op: C(ommand)/T(est)/W(ait) +# | opcode +# | | threadid: 0-7 +# | | | opcode argument +# | | | | +# C: lock: 0: 0 +# +# Commands +# +# opcode opcode argument +# schedother nice value +# schedfifo priority +# lock lock nr (0-7) +# locknowait lock nr (0-7) +# lockint lock nr (0-7) +# lockintnowait lock nr (0-7) +# lockcont lock nr (0-7) +# unlock lock nr (0-7) +# lockbkl lock nr (0-7) +# unlockbkl lock nr (0-7) +# signal thread to signal (0-7) +# reset 0 +# resetevent 0 +# +# Tests / Wait +# +# opcode opcode argument +# +# prioeq priority +# priolt priority +# priogt priority +# nprioeq normal priority +# npriolt normal priority +# npriogt normal priority +# locked lock nr (0-7) +# blocked lock nr (0-7) +# blockedwake lock nr (0-7) +# unlocked lock nr (0-7) +# lockedbkl dont care +# blockedbkl dont care +# unlockedbkl dont care +# opcodeeq command opcode or number +# opcodelt number +# opcodegt number +# eventeq number +# eventgt number +# eventlt number + +# +# 5 threads 4 lock PI +# +C: resetevent: 0: 0 +W: opcodeeq: 0: 0 + +# Set schedulers +C: schedother: 0: 0 +C: schedfifo: 1: 81 +C: schedfifo: 2: 82 +C: schedfifo: 3: 83 +C: schedfifo: 4: 84 + +# T0 lock L0 +C: locknowait: 0: 0 +W: locked: 0: 0 + +# T1 lock L1 +C: locknowait: 1: 1 +W: locked: 1: 1 + +# T1 lock L0 +C: lockintnowait: 1: 0 +W: blocked: 1: 0 +T: prioeq: 0: 81 + +# T2 lock L2 +C: locknowait: 2: 2 +W: locked: 2: 2 + +# T2 lock L1 +C: lockintnowait: 2: 1 +W: blocked: 2: 1 +T: prioeq: 0: 82 +T: prioeq: 1: 82 + +# T3 lock L3 +C: locknowait: 3: 3 +W: locked: 3: 3 + +# T3 lock L2 +C: lockintnowait: 3: 2 +W: blocked: 3: 2 +T: prioeq: 0: 83 +T: prioeq: 1: 83 +T: prioeq: 2: 83 + +# T4 lock L3 +C: lockintnowait: 4: 3 +W: blocked: 4: 3 +T: prioeq: 0: 84 +T: prioeq: 1: 84 +T: prioeq: 2: 84 +T: prioeq: 3: 84 + +# Signal T4 +C: signal: 4: 0 +W: unlocked: 4: 3 +T: prioeq: 0: 83 +T: prioeq: 1: 83 +T: prioeq: 2: 83 +T: prioeq: 3: 83 + +# Signal T3 +C: signal: 3: 0 +W: unlocked: 3: 2 +T: prioeq: 0: 82 +T: prioeq: 1: 82 +T: prioeq: 2: 82 + +# Signal T2 +C: signal: 2: 0 +W: unlocked: 2: 1 +T: prioeq: 0: 81 +T: prioeq: 1: 81 + +# Signal T1 +C: signal: 1: 0 +W: unlocked: 1: 0 +T: priolt: 0: 1 + +# Unlock and exit +C: unlock: 3: 3 +C: unlock: 2: 2 +C: unlock: 1: 1 +C: unlock: 0: 0 + +W: unlocked: 3: 3 +W: unlocked: 2: 2 +W: unlocked: 1: 1 +W: unlocked: 0: 0 + diff --git a/security/keys/key.c b/security/keys/key.c index 43295ca37b5..80de8c3e9cc 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/init.h> +#include <linux/poison.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> @@ -988,7 +989,7 @@ void unregister_key_type(struct key_type *ktype) if (key->type == ktype) { if (ktype->destroy) ktype->destroy(key); - memset(&key->payload, 0xbd, sizeof(key->payload)); + memset(&key->payload, KEY_DESTROY, sizeof(key->payload)); } } diff --git 
a/security/selinux/hooks.c b/security/selinux/hooks.c
index ac7f2b2e392..28832e68980 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1532,8 +1532,9 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm)
 	/* Default to the current task SID. */
 	bsec->sid = tsec->sid;
-	/* Reset create and sockcreate SID on execve. */
+	/* Reset fs, key, and sock SIDs on execve. */
 	tsec->create_sid = 0;
+	tsec->keycreate_sid = 0;
 	tsec->sockcreate_sid = 0;
 	if (tsec->exec_sid) {
@@ -2586,9 +2587,10 @@ static int selinux_task_alloc_security(struct task_struct *tsk)
 	tsec2->osid = tsec1->osid;
 	tsec2->sid = tsec1->sid;
-	/* Retain the exec, create, and sock SIDs across fork */
+	/* Retain the exec, fs, key, and sock SIDs across fork */
 	tsec2->exec_sid = tsec1->exec_sid;
 	tsec2->create_sid = tsec1->create_sid;
+	tsec2->keycreate_sid = tsec1->keycreate_sid;
 	tsec2->sockcreate_sid = tsec1->sockcreate_sid;
 	/* Retain ptracer SID across fork, if any.
diff --git a/sound/oss/via82cxxx_audio.c b/sound/oss/via82cxxx_audio.c
index 1a921ee71ab..2343dedd44a 100644
--- a/sound/oss/via82cxxx_audio.c
+++ b/sound/oss/via82cxxx_audio.c
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
@@ -3522,7 +3523,7 @@ err_out_have_mixer:
 err_out_kfree:
 #ifndef VIA_NDEBUG
-	memset (card, 0xAB, sizeof (*card)); /* poison memory */
+	memset (card, OSS_POISON_FREE, sizeof (*card)); /* poison memory */
 #endif
 	kfree (card);
@@ -3559,7 +3560,7 @@ static void __devexit via_remove_one (struct pci_dev *pdev)
 	via_ac97_cleanup (card);
 #ifndef VIA_NDEBUG
-	memset (card, 0xAB, sizeof (*card)); /* poison memory */
+	memset (card, OSS_POISON_FREE, sizeof (*card)); /* poison memory */
 #endif
 	kfree (card);
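
The two hunks above (and the earlier key.c hunk) replace open-coded poison bytes such as 0xAB
and 0xbd with named constants; the added #include <linux/poison.h> lines indicate that
KEY_DESTROY and OSS_POISON_FREE are expected to be defined centrally in that header. As a
rough userspace analogue of the pattern (kernel code cannot be run standalone, and the
constant and struct names below are purely illustrative):

/*
 * Illustrative sketch, not part of the patch: give the "freed object"
 * poison byte one shared name instead of repeating a magic number at
 * every call site.
 */
#include <stdlib.h>
#include <string.h>

#define OBJ_POISON_FREE	0xAB	/* hypothetical "freed object" pattern */

struct card {			/* stand-in for the driver's private data */
	int irq;
	void *dma_ring;
};

static void card_free(struct card *c)
{
	/* Poison before freeing so use-after-free reads are obvious. */
	memset(c, OBJ_POISON_FREE, sizeof(*c));
	free(c);
}

int main(void)
{
	struct card *c = calloc(1, sizeof(*c));

	if (c)
		card_free(c);
	return 0;
}

The point of the cleanup is that a crash dump or debugger view full of one well-known byte
pattern is self-describing, rather than leaving the reader to guess which subsystem's ad-hoc
magic value 0xAB happened to be.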
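
The rt-tester scripts above drive rt-mutex priority inheritance inside the kernel; the same
behaviour reaches userspace through PI futexes when a mutex is created with the
PTHREAD_PRIO_INHERIT protocol. The sketch below is not part of the patch: it mirrors the core
of the t3-l1-pi scenario (a SCHED_FIFO waiter boosting a SCHED_OTHER lock owner) under the
assumption that the C library maps PTHREAD_PRIO_INHERIT onto the PI-futex support; the
priority value, the sleep-based ordering and the /proc observation hint are illustrative, and
SCHED_FIFO requires root.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock;

static void *low_prio_owner(void *arg)
{
	(void)arg;
	/* SCHED_OTHER thread takes the lock and holds it for a while. */
	pthread_mutex_lock(&lock);
	sleep(2);	/* during this window a blocked RT waiter boosts us */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void *rt_waiter(void *arg)
{
	(void)arg;
	/* SCHED_FIFO thread blocks on the lock; the owner inherits its prio. */
	pthread_mutex_lock(&lock);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_mutexattr_t ma;
	pthread_attr_t ta;
	struct sched_param sp = { .sched_priority = 82 };	/* illustrative */
	pthread_t t0, t2;

	/* PTHREAD_PRIO_INHERIT selects a priority-inheriting mutex. */
	pthread_mutexattr_init(&ma);
	pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&lock, &ma);

	pthread_create(&t0, NULL, low_prio_owner, NULL);
	sleep(1);	/* let the owner acquire the lock first */

	/* Create the waiter directly as SCHED_FIFO. */
	pthread_attr_init(&ta);
	pthread_attr_setinheritsched(&ta, PTHREAD_EXPLICIT_SCHED);
	pthread_attr_setschedpolicy(&ta, SCHED_FIFO);
	pthread_attr_setschedparam(&ta, &sp);
	if (pthread_create(&t2, &ta, rt_waiter, NULL)) {
		fprintf(stderr, "SCHED_FIFO needs root\n");
		pthread_join(t0, NULL);
		return 1;
	}

	/*
	 * While the waiter is blocked, the owner's effective priority should
	 * be visible as boosted in, e.g., /proc/<tid>/stat; the .tst files
	 * above check the equivalent in-kernel state with "T: prioeq".
	 */
	pthread_join(t2, NULL);
	pthread_join(t0, NULL);
	return 0;
}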
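
Each non-comment line in the .tst files follows the four-field form documented in their
headers: an op letter (C, T or W), an opcode, a thread id (0-7) and an argument, separated by
": ". In mainline kernels the files are consumed by the Python harness in the same
scripts/rt-tester/ directory; purely to illustrate the line format, a hypothetical standalone
syntax checker (not part of the patch) could look like this:

/* Hypothetical .tst line checker; the real driver is rt-tester.py. */
#include <stdio.h>

int main(int argc, char **argv)
{
	char line[256], opcode[64], op;
	int tid, arg, ln = 0;
	FILE *f = argc > 1 ? fopen(argv[1], "r") : stdin;

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		ln++;
		/* Skip comments and blank lines. */
		if (line[0] == '#' || line[0] == '\n')
			continue;
		/* e.g. "C: locknowait: 0: 0" or "T: prioeq: 0: 82" */
		if (sscanf(line, " %c: %63[^:]: %d: %d",
			   &op, opcode, &tid, &arg) != 4 ||
		    (op != 'C' && op != 'T' && op != 'W') ||
		    tid < 0 || tid > 7) {
			fprintf(stderr, "line %d is malformed: %s", ln, line);
			return 1;
		}
		printf("%c %s thread=%d arg=%d\n", op, opcode, tid, arg);
	}
	return 0;
}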