diff options
376 files changed, 12192 insertions, 4941 deletions
diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt new file mode 100644 index 00000000000..bb775fbe43d --- /dev/null +++ b/Documentation/controllers/cpuacct.txt @@ -0,0 +1,32 @@ +CPU Accounting Controller +------------------------- + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem. + +# mkdir /cgroups +# mount -t cgroup -ocpuacct none /cgroups + +With the above step, the initial or the parent accounting group +becomes visible at /cgroups. At bootup, this group includes all the +tasks in the system. /cgroups/tasks lists the tasks in this cgroup. +/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by +this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /cgroups. + +# cd /cgroups +# mkdir g1 +# echo $$ > g1 + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/cgroups/cpuacct.usage also. diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 9cc4d685dde..803b1318b13 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -82,7 +82,7 @@ of ftrace. Here is a list of some of the key files: tracer is not adding more data, they will display the same information every time they are read. - iter_ctrl: This file lets the user control the amount of data + trace_options: This file lets the user control the amount of data that is displayed in one of the above output files. @@ -94,10 +94,10 @@ of ftrace. Here is a list of some of the key files: only be recorded if the latency is greater than the value in this file. (in microseconds) - trace_entries: This sets or displays the number of bytes each CPU + buffer_size_kb: This sets or displays the number of kilobytes each CPU buffer can hold. The tracer buffers are the same size for each CPU. The displayed number is the size of the - CPU buffer and not total size of all buffers. The + CPU buffer and not total size of all buffers. The trace buffers are allocated in pages (blocks of memory that the kernel uses for allocation, usually 4 KB in size). If the last page allocated has room for more bytes @@ -127,6 +127,8 @@ of ftrace. Here is a list of some of the key files: be traced. If a function exists in both set_ftrace_filter and set_ftrace_notrace, the function will _not_ be traced. + set_ftrace_pid: Have the function tracer only trace a single thread. + available_filter_functions: This lists the functions that ftrace has processed and can trace. These are the function names that you can pass to "set_ftrace_filter" or @@ -316,23 +318,23 @@ The above is mostly meaningful for kernel developers. The rest is the same as the 'trace' file. -iter_ctrl ---------- +trace_options +------------- -The iter_ctrl file is used to control what gets printed in the trace +The trace_options file is used to control what gets printed in the trace output. To see what is available, simply cat the file: - cat /debug/tracing/iter_ctrl + cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree + noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj To disable one of the options, echo in the option prepended with "no". - echo noprint-parent > /debug/tracing/iter_ctrl + echo noprint-parent > /debug/tracing/trace_options To enable an option, leave off the "no". - echo sym-offset > /debug/tracing/iter_ctrl + echo sym-offset > /debug/tracing/trace_options Here are the available options: @@ -378,6 +380,20 @@ Here are the available options: When a trace is recorded, so is the stack of functions. This allows for back traces of trace sites. + userstacktrace - This option changes the trace. + It records a stacktrace of the current userspace thread. + + sym-userobj - when user stacktrace are enabled, look up which object the + address belongs to, and print a relative address + This is especially useful when ASLR is on, otherwise you don't + get a chance to resolve the address to object/file/line after the app is no + longer running + + The lookup is performed when you read trace,trace_pipe,latency_trace. Example: + + a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 +x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] + sched-tree - TBD (any users??) @@ -1059,6 +1075,83 @@ For simple one time traces, the above is sufficent. For anything else, a search through /proc/mounts may be needed to find where the debugfs file-system is mounted. + +Single thread tracing +--------------------- + +By writing into /debug/tracing/set_ftrace_pid you can trace a +single thread. For example: + +# cat /debug/tracing/set_ftrace_pid +no pid +# echo 3111 > /debug/tracing/set_ftrace_pid +# cat /debug/tracing/set_ftrace_pid +3111 +# echo function > /debug/tracing/current_tracer +# cat /debug/tracing/trace | head + # tracer: function + # + # TASK-PID CPU# TIMESTAMP FUNCTION + # | | | | | + yum-updatesd-3111 [003] 1637.254676: finish_task_switch <-thread_return + yum-updatesd-3111 [003] 1637.254681: hrtimer_cancel <-schedule_hrtimeout_range + yum-updatesd-3111 [003] 1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel + yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel + yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll + yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll +# echo -1 > /debug/tracing/set_ftrace_pid +# cat /debug/tracing/trace |head + # tracer: function + # + # TASK-PID CPU# TIMESTAMP FUNCTION + # | | | | | + ##### CPU 3 buffer started #### + yum-updatesd-3111 [003] 1701.957688: free_poll_entry <-poll_freewait + yum-updatesd-3111 [003] 1701.957689: remove_wait_queue <-free_poll_entry + yum-updatesd-3111 [003] 1701.957691: fput <-free_poll_entry + yum-updatesd-3111 [003] 1701.957692: audit_syscall_exit <-sysret_audit + yum-updatesd-3111 [003] 1701.957693: path_put <-audit_syscall_exit + +If you want to trace a function when executing, you could use +something like this simple program: + +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +int main (int argc, char **argv) +{ + if (argc < 1) + exit(-1); + + if (fork() > 0) { + int fd, ffd; + char line[64]; + int s; + + ffd = open("/debug/tracing/current_tracer", O_WRONLY); + if (ffd < 0) + exit(-1); + write(ffd, "nop", 3); + + fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY); + s = sprintf(line, "%d\n", getpid()); + write(fd, line, s); + + write(ffd, "function", 8); + + close(fd); + close(ffd); + + execvp(argv[1], argv+1); + } + + return 0; +} + dynamic ftrace -------------- @@ -1158,7 +1251,11 @@ These are the only wild cards which are supported. <match>*<match> will not work. - # echo hrtimer_* > /debug/tracing/set_ftrace_filter +Note: It is better to use quotes to enclose the wild cards, otherwise + the shell may expand the parameters into names of files in the local + directory. + + # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter Produces: @@ -1213,7 +1310,7 @@ Again, now we want to append. # echo sys_nanosleep > /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter sys_nanosleep - # echo hrtimer_* >> /debug/tracing/set_ftrace_filter + # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter hrtimer_run_queues hrtimer_run_pending @@ -1299,41 +1396,29 @@ trace entries ------------- Having too much or not enough data can be troublesome in diagnosing -an issue in the kernel. The file trace_entries is used to modify +an issue in the kernel. The file buffer_size_kb is used to modify the size of the internal trace buffers. The number listed is the number of entries that can be recorded per CPU. To know the full size, multiply the number of possible CPUS with the number of entries. - # cat /debug/tracing/trace_entries -65620 + # cat /debug/tracing/buffer_size_kb +1408 (units kilobytes) Note, to modify this, you must have tracing completely disabled. To do that, echo "nop" into the current_tracer. If the current_tracer is not set to "nop", an EINVAL error will be returned. # echo nop > /debug/tracing/current_tracer - # echo 100000 > /debug/tracing/trace_entries - # cat /debug/tracing/trace_entries -100045 - - -Notice that we echoed in 100,000 but the size is 100,045. The entries -are held in individual pages. It allocates the number of pages it takes -to fulfill the request. If more entries may fit on the last page -then they will be added. - - # echo 1 > /debug/tracing/trace_entries - # cat /debug/tracing/trace_entries -85 - -This shows us that 85 entries can fit in a single page. + # echo 10000 > /debug/tracing/buffer_size_kb + # cat /debug/tracing/buffer_size_kb +10000 (units kilobytes) The number of pages which will be allocated is limited to a percentage of available memory. Allocating too much will produce an error. - # echo 1000000000000 > /debug/tracing/trace_entries + # echo 1000000000000 > /debug/tracing/buffer_size_kb -bash: echo: write error: Cannot allocate memory - # cat /debug/tracing/trace_entries + # cat /debug/tracing/buffer_size_kb 85 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e0f346d201e..2919a2e9193 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -750,6 +750,14 @@ and is between 256 and 4096 characters. It is defined in the file parameter will force ia64_sal_cache_flush to call ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. + ftrace=[tracer] + [ftrace] will set and start the specified tracer + as early as possible in order to facilitate early + boot debugging. + + ftrace_dump_on_oops + [ftrace] will dump the trace buffers on oops. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 4ba4664ce5c..9cb9138f7a7 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -71,35 +71,50 @@ Look at the current lock statistics: # less /proc/lock_stat -01 lock_stat version 0.2 +01 lock_stat version 0.3 02 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 03 class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total 04 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 05 -06 &inode->i_data.tree_lock-W: 15 21657 0.18 1093295.30 11547131054.85 58 10415 0.16 87.51 6387.60 -07 &inode->i_data.tree_lock-R: 0 0 0.00 0.00 0.00 23302 231198 0.25 8.45 98023.38 -08 -------------------------- -09 &inode->i_data.tree_lock 0 [<ffffffff8027c08f>] add_to_page_cache+0x5f/0x190 -10 -11 ............................................................................................................................................................................................... -12 -13 dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24 -14 ----------- -15 dcache_lock 180 [<ffffffff802c0d7e>] sys_getcwd+0x11e/0x230 -16 dcache_lock 165 [<ffffffff802c002a>] d_alloc+0x15a/0x210 -17 dcache_lock 33 [<ffffffff8035818d>] _atomic_dec_and_lock+0x4d/0x70 -18 dcache_lock 1 [<ffffffff802beef8>] shrink_dcache_parent+0x18/0x130 +06 &mm->mmap_sem-W: 233 538 18446744073708 22924.27 607243.51 1342 45806 1.71 8595.89 1180582.34 +07 &mm->mmap_sem-R: 205 587 18446744073708 28403.36 731975.00 1940 412426 0.58 187825.45 6307502.88 +08 --------------- +09 &mm->mmap_sem 487 [<ffffffff8053491f>] do_page_fault+0x466/0x928 +10 &mm->mmap_sem 179 [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d +11 &mm->mmap_sem 279 [<ffffffff80210a57>] sys_mmap+0x75/0xce +12 &mm->mmap_sem 76 [<ffffffff802a490b>] sys_munmap+0x32/0x59 +13 --------------- +14 &mm->mmap_sem 270 [<ffffffff80210a57>] sys_mmap+0x75/0xce +15 &mm->mmap_sem 431 [<ffffffff8053491f>] do_page_fault+0x466/0x928 +16 &mm->mmap_sem 138 [<ffffffff802a490b>] sys_munmap+0x32/0x59 +17 &mm->mmap_sem 145 [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d +18 +19 ............................................................................................................................................................................................... +20 +21 dcache_lock: 621 623 0.52 118.26 1053.02 6745 91930 0.29 316.29 118423.41 +22 ----------- +23 dcache_lock 179 [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54 +24 dcache_lock 113 [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb +25 dcache_lock 99 [<ffffffff802ca0dc>] d_rehash+0x1b/0x44 +26 dcache_lock 104 [<ffffffff802cbca0>] d_instantiate+0x36/0x8a +27 ----------- +28 dcache_lock 192 [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54 +29 dcache_lock 98 [<ffffffff802ca0dc>] d_rehash+0x1b/0x44 +30 dcache_lock 72 [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb +31 dcache_lock 112 [<ffffffff802cbca0>] d_instantiate+0x36/0x8a This excerpt shows the first two lock class statistics. Line 01 shows the output version - each time the format changes this will be updated. Line 02-04 -show the header with column descriptions. Lines 05-10 and 13-18 show the actual +show the header with column descriptions. Lines 05-18 and 20-31 show the actual statistics. These statistics come in two parts; the actual stats separated by a -short separator (line 08, 14) from the contention points. +short separator (line 08, 13) from the contention points. -The first lock (05-10) is a read/write lock, and shows two lines above the +The first lock (05-18) is a read/write lock, and shows two lines above the short separator. The contention points don't match the column descriptors, -they have two: contentions and [<IP>] symbol. +they have two: contentions and [<IP>] symbol. The second set of contention +points are the points we're contending with. +The integer part of the time values is in us. View the top contending locks: diff --git a/Documentation/markers.txt b/Documentation/markers.txt index 089f6138fcd..d2b3d0e91b2 100644 --- a/Documentation/markers.txt +++ b/Documentation/markers.txt @@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be activated by calling marker_arm(). Marker deactivation can be done by calling marker_disarm() as many times as marker_arm() has been called. Removing a probe is done through marker_probe_unregister(); it will disarm the probe. -marker_synchronize_unregister() must be called before the end of the module exit -function to make sure there is no caller left using the probe. This, and the -fact that preemption is disabled around the probe call, make sure that probe -removal and module unload are safe. See the "Probe example" section below for a -sample probe module. + +marker_synchronize_unregister() must be called between probe unregistration and +the first occurrence of +- the end of module exit function, + to make sure there is no caller left using the probe; +- the free of any resource used by the probes, + to make sure the probes wont be accessing invalid data. +This, and the fact that preemption is disabled around the probe call, make sure +that probe removal and module unload are safe. See the "Probe example" section +below for a sample probe module. The marker mechanism supports inserting multiple instances of the same marker. Markers can be put in inline functions, inlined static functions, and @@ -70,6 +75,20 @@ a printk warning which identifies the inconsistency: "Format mismatch for probe probe_name (format), marker (format)" +Another way to use markers is to simply define the marker without generating any +function call to actually call into the marker. This is useful in combination +with tracepoint probes in a scheme like this : + +void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk); + +DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name, + "arg1 %u pid %d"); + +notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk) +{ + struct marker *marker = &GET_MARKER(kernel_irq_entry); + /* write data to trace buffers ... */ +} * Probe / marker example diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt index 941615a9769..d43dbcbd163 100644 --- a/Documentation/scheduler/sched-arch.txt +++ b/Documentation/scheduler/sched-arch.txt @@ -8,7 +8,7 @@ Context switch By default, the switch_to arch function is called with the runqueue locked. This is usually not a problem unless switch_to may need to take the runqueue lock. This is usually due to a wake up operation in -the context switch. See include/asm-ia64/system.h for an example. +the context switch. See arch/ia64/include/asm/system.h for an example. To request the scheduler call switch_to with the runqueue unlocked, you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file @@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to introduce a significant interrupt latency by adding the line `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for unlocked context switches. This define also implies -`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an +`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an example. diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 5d354e16749..6f0a044f5b5 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -3,28 +3,30 @@ Mathieu Desnoyers -This document introduces Linux Kernel Tracepoints and their use. It provides -examples of how to insert tracepoints in the kernel and connect probe functions -to them and provides some examples of probe functions. +This document introduces Linux Kernel Tracepoints and their use. It +provides examples of how to insert tracepoints in the kernel and +connect probe functions to them and provides some examples of probe +functions. * Purpose of tracepoints -A tracepoint placed in code provides a hook to call a function (probe) that you -can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or -"off" (no probe is attached). When a tracepoint is "off" it has no effect, -except for adding a tiny time penalty (checking a condition for a branch) and -space penalty (adding a few bytes for the function call at the end of the -instrumented function and adds a data structure in a separate section). When a -tracepoint is "on", the function you provide is called each time the tracepoint -is executed, in the execution context of the caller. When the function provided -ends its execution, it returns to the caller (continuing from the tracepoint -site). +A tracepoint placed in code provides a hook to call a function (probe) +that you can provide at runtime. A tracepoint can be "on" (a probe is +connected to it) or "off" (no probe is attached). When a tracepoint is +"off" it has no effect, except for adding a tiny time penalty +(checking a condition for a branch) and space penalty (adding a few +bytes for the function call at the end of the instrumented function +and adds a data structure in a separate section). When a tracepoint +is "on", the function you provide is called each time the tracepoint +is executed, in the execution context of the caller. When the function +provided ends its execution, it returns to the caller (continuing from +the tracepoint site). You can put tracepoints at important locations in the code. They are lightweight hooks that can pass an arbitrary number of parameters, -which prototypes are described in a tracepoint declaration placed in a header -file. +which prototypes are described in a tracepoint declaration placed in a +header file. They can be used for tracing and performance accounting. @@ -42,14 +44,16 @@ In include/trace/subsys.h : #include <linux/tracepoint.h> -DEFINE_TRACE(subsys_eventname, - TPPTOTO(int firstarg, struct task_struct *p), +DECLARE_TRACE(subsys_eventname, + TPPROTO(int firstarg, struct task_struct *p), TPARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : #include <trace/subsys.h> +DEFINE_TRACE(subsys_eventname); + void somefct(void) { ... @@ -61,31 +65,41 @@ Where : - subsys_eventname is an identifier unique to your event - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function - called by this tracepoint. -- TPARGS(firstarg, p) are the parameters names, same as found in the prototype. -Connecting a function (probe) to a tracepoint is done by providing a probe -(function to call) for the specific tracepoint through -register_trace_subsys_eventname(). Removing a probe is done through -unregister_trace_subsys_eventname(); it will remove the probe sure there is no -caller left using the probe when it returns. Probe removal is preempt-safe -because preemption is disabled around the probe call. See the "Probe example" -section below for a sample probe module. - -The tracepoint mechanism supports inserting multiple instances of the same -tracepoint, but a single definition must be made of a given tracepoint name over -all the kernel to make sure no type conflict will occur. Name mangling of the -tracepoints is done using the prototypes to make sure typing is correct. -Verification of probe type correctness is done at the registration site by the -compiler. Tracepoints can be put in inline functions, inlined static functions, -and unrolled loops as well as regular functions. - -The naming scheme "subsys_event" is suggested here as a convention intended -to limit collisions. Tracepoint names are global to the kernel: they are -considered as being the same whether they are in the core kernel image or in -modules. +- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the + function called by this tracepoint. +- TPARGS(firstarg, p) are the parameters names, same as found in the + prototype. + +Connecting a function (probe) to a tracepoint is done by providing a +probe (function to call) for the specific tracepoint through +register_trace_subsys_eventname(). Removing a probe is done through +unregister_trace_subsys_eventname(); it will remove the probe. + +tracepoint_synchronize_unregister() must be called before the end of +the module exit function to make sure there is no caller left using +the probe. This, and the fact that preemption is disabled around the +probe call, make sure that probe removal and module unload are safe. +See the "Probe example" section below for a sample probe module. + +The tracepoint mechanism supports inserting multiple instances of the +same tracepoint, but a single definition must be made of a given +tracepoint name over all the kernel to make sure no type conflict will +occur. Name mangling of the tracepoints is done using the prototypes +to make sure typing is correct. Verification of probe type correctness +is done at the registration site by the compiler. Tracepoints can be +put in inline functions, inlined static functions, and unrolled loops +as well as regular functions. + +The naming scheme "subsys_event" is suggested here as a convention +intended to limit collisions. Tracepoint names are global to the +kernel: they are considered as being the same whether they are in the +core kernel image or in modules. + +If the tracepoint has to be used in kernel modules, an +EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be +used to export the defined tracepoints. * Probe / tracepoint example diff --git a/arch/alpha/include/asm/smp.h b/arch/alpha/include/asm/smp.h index 544c69af816..547e90951ce 100644 --- a/arch/alpha/include/asm/smp.h +++ b/arch/alpha/include/asm/smp.h @@ -45,7 +45,6 @@ extern struct cpuinfo_alpha cpu_data[NR_CPUS]; #define raw_smp_processor_id() (current_thread_info()->cpu) extern int smp_num_cpus; -#define cpu_possible_map cpu_present_map extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi(cpumask_t mask); diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c index c626a821cdc..d0f1620007f 100644 --- a/arch/alpha/kernel/irq.c +++ b/arch/alpha/kernel/irq.c @@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq) last_cpu = cpu; irq_desc[irq].affinity = cpumask_of_cpu(cpu); - irq_desc[irq].chip->set_affinity(irq, cpumask_of_cpu(cpu)); + irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu)); return 0; } #endif /* CONFIG_SMP */ diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 351407e07e7..f238370c907 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -94,6 +94,7 @@ common_shutdown_1(void *generic_ptr) flags |= 0x00040000UL; /* "remain halted" */ *pflags = flags; cpu_clear(cpuid, cpu_present_map); + cpu_clear(cpuid, cpu_possible_map); halt(); } #endif @@ -120,6 +121,7 @@ common_shutdown_1(void *generic_ptr) #ifdef CONFIG_SMP /* Wait for the secondaries to halt. */ cpu_clear(boot_cpuid, cpu_present_map); + cpu_clear(boot_cpuid, cpu_possible_map); while (cpus_weight(cpu_present_map)) barrier(); #endif diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index cf7da10097b..d953e510f68 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -70,11 +70,6 @@ enum ipi_message_type { /* Set to a secondary's cpuid when it comes online. */ static int smp_secondary_alive __devinitdata = 0; -/* Which cpus ids came online. */ -cpumask_t cpu_online_map; - -EXPORT_SYMBOL(cpu_online_map); - int smp_num_probed; /* Internal processor count */ int smp_num_cpus = 1; /* Number that came online. */ EXPORT_SYMBOL(smp_num_cpus); @@ -440,6 +435,7 @@ setup_smp(void) ((char *)cpubase + i*hwrpb->processor_size); if ((cpu->flags & 0x1cc) == 0x1cc) { smp_num_probed++; + cpu_set(i, cpu_possible_map); cpu_set(i, cpu_present_map); cpu->pal_revision = boot_cpu_palrev; } @@ -473,6 +469,7 @@ smp_prepare_cpus(unsigned int max_cpus) /* Nothing to do on a UP box, or when told not to. */ if (smp_num_probed == 1 || max_cpus == 0) { + cpu_possible_map = cpumask_of_cpu(boot_cpuid); cpu_present_map = cpumask_of_cpu(boot_cpuid); printk(KERN_INFO "SMP mode deactivated.\n"); return; diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index c71b0fd7a61..ab44c164d9d 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -177,19 +177,19 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) } static void -dp264_set_affinity(unsigned int irq, cpumask_t affinity) +dp264_set_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&dp264_irq_lock); - cpu_set_irq_affinity(irq, affinity); + cpu_set_irq_affinity(irq, *affinity); tsunami_update_irq_hw(cached_irq_mask); spin_unlock(&dp264_irq_lock); } static void -clipper_set_affinity(unsigned int irq, cpumask_t affinity) +clipper_set_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&dp264_irq_lock); - cpu_set_irq_affinity(irq - 16, affinity); + cpu_set_irq_affinity(irq - 16, *affinity); tsunami_update_irq_hw(cached_irq_mask); spin_unlock(&dp264_irq_lock); } diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index 52c91ccc164..27f840a4ad3 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -158,10 +158,10 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) } static void -titan_set_irq_affinity(unsigned int irq, cpumask_t affinity) +titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&titan_irq_lock); - titan_cpu_set_irq_affinity(irq - 16, affinity); + titan_cpu_set_irq_affinity(irq - 16, *affinity); titan_update_irq_hw(titan_cached_irq_mask); spin_unlock(&titan_irq_lock); } diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index 7fc9860a97d..c6884ba1d5e 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -109,11 +109,11 @@ static void gic_unmask_irq(unsigned int irq) } #ifdef CONFIG_SMP -static void gic_set_cpu(unsigned int irq, cpumask_t mask_val) +static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) { void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3); unsigned int shift = (irq % 4) * 8; - unsigned int cpu = first_cpu(mask_val); + unsigned int cpu = cpumask_first(mask_val); u32 val; spin_lock(&irq_controller_lock); diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index 2f3eb795fa6..7141cee1fab 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -174,7 +174,7 @@ static void route_irq(struct irq_desc *desc, unsigned int irq, unsigned int cpu) pr_debug("IRQ%u: moving from cpu%u to cpu%u\n", irq, desc->cpu, cpu); spin_lock_irq(&desc->lock); - desc->chip->set_affinity(irq, cpumask_of_cpu(cpu)); + desc->chip->set_affinity(irq, cpumask_of(cpu)); spin_unlock_irq(&desc->lock); } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index e42a749a56d..bd905c0a736 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -34,16 +34,6 @@ #include <asm/ptrace.h> /* - * bitmask of present and online CPUs. - * The present bitmask indicates that the CPU is physically present. - * The online bitmask indicates that the CPU is up and running. - */ -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); -cpumask_t cpu_online_map; -EXPORT_SYMBOL(cpu_online_map); - -/* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core * where to place its SVC stack diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c index a72e798a2a4..72f51d39202 100644 --- a/arch/arm/mach-at91/at91rm9200_time.c +++ b/arch/arm/mach-at91/at91rm9200_time.c @@ -169,7 +169,6 @@ static struct clock_event_device clkevt = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 150, - .cpumask = CPU_MASK_CPU0, .set_next_event = clkevt32k_next_event, .set_mode = clkevt32k_mode, }; @@ -197,7 +196,7 @@ void __init at91rm9200_timer_init(void) clkevt.mult = div_sc(AT91_SLOW_CLOCK, NSEC_PER_SEC, clkevt.shift); clkevt.max_delta_ns = clockevent_delta2ns(AT91_ST_ALMV, &clkevt); clkevt.min_delta_ns = clockevent_delta2ns(2, &clkevt) + 1; - clkevt.cpumask = cpumask_of_cpu(0); + clkevt.cpumask = cpumask_of(0); clockevents_register_device(&clkevt); /* register clocksource */ diff --git a/arch/arm/mach-at91/at91sam926x_time.c b/arch/arm/mach-at91/at91sam926x_time.c index 122fd77ed58..b63e1d5f1ba 100644 --- a/arch/arm/mach-at91/at91sam926x_time.c +++ b/arch/arm/mach-at91/at91sam926x_time.c @@ -91,7 +91,6 @@ static struct clock_event_device pit_clkevt = { .features = CLOCK_EVT_FEAT_PERIODIC, .shift = 32, .rating = 100, - .cpumask = CPU_MASK_CPU0, .set_mode = pit_clkevt_mode, }; @@ -173,6 +172,7 @@ static void __init at91sam926x_pit_init(void) /* Set up and register clockevents */ pit_clkevt.mult = div_sc(pit_rate, NSEC_PER_SEC, pit_clkevt.shift); + pit_clkevt.cpumask = cpumask_of(0); clockevents_register_device(&pit_clkevt); } diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c index 3b9a296b5c4..f8bcd29d17a 100644 --- a/arch/arm/mach-davinci/time.c +++ b/arch/arm/mach-davinci/time.c @@ -322,7 +322,7 @@ static void __init davinci_timer_init(void) clockevent_davinci.min_delta_ns = clockevent_delta2ns(1, &clockevent_davinci); - clockevent_davinci.cpumask = cpumask_of_cpu(0); + clockevent_davinci.cpumask = cpumask_of(0); clockevents_register_device(&clockevent_davinci); } diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c index a11765f5f23..aff0ebcfa84 100644 --- a/arch/arm/mach-imx/time.c +++ b/arch/arm/mach-imx/time.c @@ -184,7 +184,7 @@ static int __init imx_clockevent_init(unsigned long rate) clockevent_imx.min_delta_ns = clockevent_delta2ns(0xf, &clockevent_imx); - clockevent_imx.cpumask = cpumask_of_cpu(0); + clockevent_imx.cpumask = cpumask_of(0); clockevents_register_device(&clockevent_imx); diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 7766f469456..f4656d2ac8a 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -487,7 +487,7 @@ static int __init ixp4xx_clockevent_init(void) clockevent_delta2ns(0xfffffffe, &clockevent_ixp4xx); clockevent_ixp4xx.min_delta_ns = clockevent_delta2ns(0xf, &clockevent_ixp4xx); - clockevent_ixp4xx.cpumask = cpumask_of_cpu(0); + clockevent_ixp4xx.cpumask = cpumask_of(0); clockevents_register_device(&clockevent_ixp4xx); return 0; diff --git a/arch/arm/mach-msm/timer.c b/arch/arm/mach-msm/timer.c index 345a14cb73c..444d9c0f5ca 100644 --- a/arch/arm/mach-msm/timer.c +++ b/arch/arm/mach-msm/timer.c @@ -182,7 +182,7 @@ static void __init msm_timer_init(void) clockevent_delta2ns(0xf0000000 >> clock->shift, ce); /* 4 gets rounded down to 3 */ ce->min_delta_ns = clockevent_delta2ns(4, ce); - ce->cpumask = cpumask_of_cpu(0); + ce->cpumask = cpumask_of(0); cs->mult = clocksource_hz2mult(clock->freq, cs->shift); res = clocksource_register(cs); diff --git a/arch/arm/mach-ns9xxx/time-ns9360.c b/arch/arm/mach-ns9xxx/time-ns9360.c index a63424d083d..41df6972176 100644 --- a/arch/arm/mach-ns9xxx/time-ns9360.c +++ b/arch/arm/mach-ns9xxx/time-ns9360.c @@ -173,7 +173,7 @@ static void __init ns9360_timer_init(void) ns9360_clockevent_device.min_delta_ns = clockevent_delta2ns(1, &ns9360_clockevent_device); - ns9360_clockevent_device.cpumask = cpumask_of_cpu(0); + ns9360_clockevent_device.cpumask = cpumask_of(0); clockevents_register_device(&ns9360_clockevent_device); setup_irq(IRQ_NS9360_TIMER0 + TIMER_CLOCKEVENT, diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c index 2cf7e32bd29..495a32c287b 100644 --- a/arch/arm/mach-omap1/time.c +++ b/arch/arm/mach-omap1/time.c @@ -173,7 +173,7 @@ static __init void omap_init_mpu_timer(unsigned long rate) clockevent_mpu_timer1.min_delta_ns = clockevent_delta2ns(1, &clockevent_mpu_timer1); - clockevent_mpu_timer1.cpumask = cpumask_of_cpu(0); + clockevent_mpu_timer1.cpumask = cpumask_of(0); clockevents_register_device(&clockevent_mpu_timer1); } diff --git a/arch/arm/mach-omap1/timer32k.c b/arch/arm/mach-omap1/timer32k.c index 705367ece17..fd3f7396e16 100644 --- a/arch/arm/mach-omap1/timer32k.c +++ b/arch/arm/mach-omap1/timer32k.c @@ -187,7 +187,7 @@ static __init void omap_init_32k_timer(void) clockevent_32k_timer.min_delta_ns = clockevent_delta2ns(1, &clockevent_32k_timer); - clockevent_32k_timer.cpumask = cpumask_of_cpu(0); + clockevent_32k_timer.cpumask = cpumask_of(0); clockevents_register_device(&clockevent_32k_timer); } diff --git a/arch/arm/mach-omap2/timer-gp.c b/arch/arm/mach-omap2/timer-gp.c index 589393bedad..ae6036300f6 100644 --- a/arch/arm/mach-omap2/timer-gp.c +++ b/arch/arm/mach-omap2/timer-gp.c @@ -120,7 +120,7 @@ static void __init omap2_gp_clockevent_init(void) clockevent_gpt.min_delta_ns = clockevent_delta2ns(1, &clockevent_gpt); - clockevent_gpt.cpumask = cpumask_of_cpu(0); + clockevent_gpt.cpumask = cpumask_of(0); clockevents_register_device(&clockevent_gpt); } diff --git a/arch/arm/mach-pxa/time.c b/arch/arm/mach-pxa/time.c index f8a9a62959e..bf3c9a4aad5 100644 --- a/arch/arm/mach-pxa/time.c +++ b/arch/arm/mach-pxa/time.c @@ -122,7 +122,6 @@ static struct clock_event_device ckevt_pxa_osmr0 = { .features = CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 200, - .cpumask = CPU_MASK_CPU0, .set_next_event = pxa_osmr0_set_next_event, .set_mode = pxa_osmr0_set_mode, }; @@ -170,6 +169,7 @@ static void __init pxa_timer_init(void) clockevent_delta2ns(0x7fffffff, &ckevt_pxa_osmr0); ckevt_pxa_osmr0.min_delta_ns = clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_pxa_osmr0) + 1; + ckevt_pxa_osmr0.cpumask = cpumask_of(0); cksrc_pxa_oscr0.mult = clocksource_hz2mult(clock_tick_rate, cksrc_pxa_oscr0.shift); diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c index 2f04d54711e..b07cb9b7adb 100644 --- a/arch/arm/mach-realview/core.c +++ b/arch/arm/mach-realview/core.c @@ -511,7 +511,7 @@ static struct clock_event_device timer0_clockevent = { .set_mode = timer_set_mode, .set_next_event = timer_set_next_event, .rating = 300, - .cpumask = CPU_MASK_ALL, + .cpumask = cpu_all_mask, }; static void __init realview_clockevents_init(unsigned int timer_irq) diff --git a/arch/arm/mach-realview/localtimer.c b/arch/arm/mach-realview/localtimer.c index 44d178cd573..504961ef343 100644 --- a/arch/arm/mach-realview/localtimer.c +++ b/arch/arm/mach-realview/localtimer.c @@ -161,7 +161,7 @@ void __cpuinit local_timer_setup(unsigned int cpu) clk->set_mode = local_timer_set_mode; clk->set_next_event = local_timer_set_next_event; clk->irq = IRQ_LOCALTIMER; - clk->cpumask = cpumask_of_cpu(cpu); + clk->cpumask = cpumask_of(cpu); clk->shift = 20; clk->mult = div_sc(mpcore_timer_rate, NSEC_PER_SEC, clk->shift); clk->max_delta_ns = clockevent_delta2ns(0xffffffff, clk); @@ -199,7 +199,7 @@ void __cpuinit local_timer_setup(unsigned int cpu) clk->rating = 200; clk->set_mode = dummy_timer_set_mode; clk->broadcast = smp_timer_broadcast; - clk->cpumask = cpumask_of_cpu(cpu); + clk->cpumask = cpumask_of(cpu); clockevents_register_device(clk); } diff --git a/arch/arm/mach-sa1100/time.c b/arch/arm/mach-sa1100/time.c index 24c0a4bae85..1cac4ac0b4b 100644 --- a/arch/arm/mach-sa1100/time.c +++ b/arch/arm/mach-sa1100/time.c @@ -73,7 +73,6 @@ static struct clock_event_device ckevt_sa1100_osmr0 = { .features = CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 200, - .cpumask = CPU_MASK_CPU0, .set_next_event = sa1100_osmr0_set_next_event, .set_mode = sa1100_osmr0_set_mode, }; @@ -110,6 +109,7 @@ static void __init sa1100_timer_init(void) clockevent_delta2ns(0x7fffffff, &ckevt_sa1100_osmr0); ckevt_sa1100_osmr0.min_delta_ns = clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_sa1100_osmr0) + 1; + ckevt_sa1100_osmr0.cpumask = cpumask_of(0); cksrc_sa1100_oscr.mult = clocksource_hz2mult(CLOCK_TICK_RATE, cksrc_sa1100_oscr.shift); diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c index 565e0ba0d67..a3f1933434e 100644 --- a/arch/arm/mach-versatile/core.c +++ b/arch/arm/mach-versatile/core.c @@ -965,7 +965,7 @@ static void __init versatile_timer_init(void) timer0_clockevent.min_delta_ns = clockevent_delta2ns(0xf, &timer0_clockevent); - timer0_clockevent.cpumask = cpumask_of_cpu(0); + timer0_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&timer0_clockevent); } diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c index 4de366e8b4c..6d6bd589924 100644 --- a/arch/arm/oprofile/op_model_mpcore.c +++ b/arch/arm/oprofile/op_model_mpcore.c @@ -260,10 +260,10 @@ static void em_stop(void) static void em_route_irq(int irq, unsigned int cpu) { struct irq_desc *desc = irq_desc + irq; - cpumask_t mask = cpumask_of_cpu(cpu); + const struct cpumask *mask = cpumask_of(cpu); spin_lock_irq(&desc->lock); - desc->affinity = mask; + desc->affinity = *mask; desc->chip->set_affinity(irq, mask); spin_unlock_irq(&desc->lock); } diff --git a/arch/arm/plat-mxc/time.c b/arch/arm/plat-mxc/time.c index fd28f5194f7..758a1293bcf 100644 --- a/arch/arm/plat-mxc/time.c +++ b/arch/arm/plat-mxc/time.c @@ -190,7 +190,7 @@ static int __init mxc_clockevent_init(void) clockevent_mxc.min_delta_ns = clockevent_delta2ns(0xff, &clockevent_mxc); - clockevent_mxc.cpumask = cpumask_of_cpu(0); + clockevent_mxc.cpumask = cpumask_of(0); clockevents_register_device(&clockevent_mxc); diff --git a/arch/arm/plat-orion/time.c b/arch/arm/plat-orion/time.c index 544d6b327f3..6fa2923e6dc 100644 --- a/arch/arm/plat-orion/time.c +++ b/arch/arm/plat-orion/time.c @@ -149,7 +149,6 @@ static struct clock_event_device orion_clkevt = { .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, .shift = 32, .rating = 300, - .cpumask = CPU_MASK_CPU0, .set_next_event = orion_clkevt_next_event, .set_mode = orion_clkevt_mode, }; @@ -199,5 +198,6 @@ void __init orion_time_init(unsigned int irq, unsigned int tclk) orion_clkevt.mult = div_sc(tclk, NSEC_PER_SEC, orion_clkevt.shift); orion_clkevt.max_delta_ns = clockevent_delta2ns(0xfffffffe, &orion_clkevt); orion_clkevt.min_delta_ns = clockevent_delta2ns(1, &orion_clkevt); + orion_clkevt.cpumask = cpumask_of(0); clockevents_register_device(&orion_clkevt); } diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index 283481d74a5..0ff46bf873b 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -106,7 +106,6 @@ static struct clock_event_device comparator = { .features = CLOCK_EVT_FEAT_ONESHOT, .shift = 16, .rating = 50, - .cpumask = CPU_MASK_CPU0, .set_next_event = comparator_next_event, .set_mode = comparator_mode, }; @@ -134,6 +133,7 @@ void __init time_init(void) comparator.mult = div_sc(counter_hz, NSEC_PER_SEC, comparator.shift); comparator.max_delta_ns = clockevent_delta2ns((u32)~0, &comparator); comparator.min_delta_ns = clockevent_delta2ns(50, &comparator) + 1; + comparator.cpumask = cpumask_of(0); sysreg_write(COMPARE, 0); timer_irqaction.dev_id = &comparator; diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c index e887efc86c2..0ed2badfd74 100644 --- a/arch/blackfin/kernel/time-ts.c +++ b/arch/blackfin/kernel/time-ts.c @@ -162,7 +162,6 @@ static struct clock_event_device clockevent_bfin = { .name = "bfin_core_timer", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .shift = 32, - .cpumask = CPU_MASK_CPU0, .set_next_event = bfin_timer_set_next_event, .set_mode = bfin_timer_set_mode, }; @@ -193,6 +192,7 @@ static int __init bfin_clockevent_init(void) clockevent_bfin.mult = div_sc(timer_clk, NSEC_PER_SEC, clockevent_bfin.shift); clockevent_bfin.max_delta_ns = clockevent_delta2ns(-1, &clockevent_bfin); clockevent_bfin.min_delta_ns = clockevent_delta2ns(100, &clockevent_bfin); + clockevent_bfin.cpumask = cpumask_of(0); clockevents_register_device(&clockevent_bfin); return 0; diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c index 173c141ac9b..295131fee71 100644 --- a/arch/cris/arch-v32/kernel/irq.c +++ b/arch/cris/arch-v32/kernel/irq.c @@ -325,11 +325,11 @@ static void end_crisv32_irq(unsigned int irq) { } -void set_affinity_crisv32_irq(unsigned int irq, cpumask_t dest) +void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest) { unsigned long flags; spin_lock_irqsave(&irq_lock, flags); - irq_allocations[irq - FIRST_IRQ].mask = dest; + irq_allocations[irq - FIRST_IRQ].mask = *dest; spin_unlock_irqrestore(&irq_lock, flags); } diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 52e16c6436f..9dac1733464 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -29,11 +29,7 @@ spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED}; /* CPU masks */ -cpumask_t cpu_online_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_online_map); cpumask_t phys_cpu_present_map = CPU_MASK_NONE; -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); EXPORT_SYMBOL(phys_cpu_present_map); /* Variables used during SMP boot */ diff --git a/arch/cris/include/asm/smp.h b/arch/cris/include/asm/smp.h index dba33aba3e9..c615a06dd75 100644 --- a/arch/cris/include/asm/smp.h +++ b/arch/cris/include/asm/smp.h @@ -4,7 +4,6 @@ #include <linux/cpumask.h> extern cpumask_t phys_cpu_present_map; -extern cpumask_t cpu_possible_map; #define raw_smp_processor_id() (current_thread_info()->cpu) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 6bd91ed7cd0..7fa8f615ba6 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -99,7 +99,7 @@ config GENERIC_IOMAP bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c index c2f58ff364e..cc0a3182db3 100644 --- a/arch/ia64/hp/sim/hpsim_irq.c +++ b/arch/ia64/hp/sim/hpsim_irq.c @@ -22,7 +22,7 @@ hpsim_irq_noop (unsigned int irq) } static void -hpsim_set_affinity_noop (unsigned int a, cpumask_t b) +hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b) { } diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h index 12d96e0cd51..21c402365d0 100644 --- a/arch/ia64/include/asm/smp.h +++ b/arch/ia64/include/asm/smp.h @@ -57,7 +57,6 @@ extern struct smp_boot_data { extern char no_int_routing __devinitdata; -extern cpumask_t cpu_online_map; extern cpumask_t cpu_core_map[NR_CPUS]; DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); extern int smp_num_siblings; diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 35bcb641c9e..a3cc9f65f95 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -55,7 +55,6 @@ void build_cpu_to_node_map(void); #define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ @@ -80,7 +79,6 @@ void build_cpu_to_node_map(void); /* sched_domains SD_NODE_INIT for IA64 NUMA machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 5c4674ae8ae..c8adecd5b41 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -330,25 +330,25 @@ unmask_irq (unsigned int irq) static void -iosapic_set_affinity (unsigned int irq, cpumask_t mask) +iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) { #ifdef CONFIG_SMP u32 high32, low32; - int dest, rte_index; + int cpu, dest, rte_index; int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0; struct iosapic_rte_info *rte; struct iosapic *iosapic; irq &= (~IA64_IRQ_REDIRECTED); - cpus_and(mask, mask, cpu_online_map); - if (cpus_empty(mask)) + cpu = cpumask_first_and(cpu_online_mask, mask); + if (cpu >= nr_cpu_ids) return; - if (irq_prepare_move(irq, first_cpu(mask))) + if (irq_prepare_move(irq, cpu)) return; - dest = cpu_physical_id(first_cpu(mask)); + dest = cpu_physical_id(cpu); if (!iosapic_intr_info[irq].count) return; /* not an IOSAPIC interrupt */ diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index 7fd18f54c05..0b6db53fedc 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -133,7 +133,6 @@ unsigned int vectors_in_migration[NR_IRQS]; */ static void migrate_irqs(void) { - cpumask_t mask; irq_desc_t *desc; int irq, new_cpu; @@ -152,15 +151,14 @@ static void migrate_irqs(void) if (desc->status == IRQ_PER_CPU) continue; - cpus_and(mask, irq_desc[irq].affinity, cpu_online_map); - if (any_online_cpu(mask) == NR_CPUS) { + if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask) + >= nr_cpu_ids) { /* * Save it for phase 2 processing */ vectors_in_migration[irq] = irq; new_cpu = any_online_cpu(cpu_online_map); - mask = cpumask_of_cpu(new_cpu); /* * Al three are essential, currently WARN_ON.. maybe panic? @@ -168,7 +166,8 @@ static void migrate_irqs(void) if (desc->chip && desc->chip->disable && desc->chip->enable && desc->chip->set_affinity) { desc->chip->disable(irq); - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, + cpumask_of(new_cpu)); desc->chip->enable(irq); } else { WARN_ON((!(desc->chip) || !(desc->chip->disable) || diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 702a09c1323..89033933903 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -49,11 +49,12 @@ static struct irq_chip ia64_msi_chip; #ifdef CONFIG_SMP -static void ia64_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask) +static void ia64_set_msi_irq_affinity(unsigned int irq, + const cpumask_t *cpu_mask) { struct msi_msg msg; u32 addr, data; - int cpu = first_cpu(cpu_mask); + int cpu = first_cpu(*cpu_mask); if (!cpu_online(cpu)) return; @@ -166,12 +167,11 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_cfg *cfg = irq_cfg + irq; struct msi_msg msg; - int cpu = first_cpu(mask); - + int cpu = cpumask_first(mask); if (!cpu_online(cpu)) return; @@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu)); dmar_msi_write(irq, &msg); - irq_desc[irq].affinity = mask; + irq_desc[irq].affinity = *mask; } #endif /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 1dcbb85fc4e..11463994a7d 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -131,12 +131,6 @@ struct task_struct *task_for_booting_cpu; */ DEFINE_PER_CPU(int, cpu_state); -/* Bitmasks of currently online, and possible CPUs */ -cpumask_t cpu_online_map; -EXPORT_SYMBOL(cpu_online_map); -cpumask_t cpu_possible_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_possible_map); - cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_core_map); DEFINE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map); @@ -688,7 +682,7 @@ int migrate_platform_irqs(unsigned int cpu) { int new_cpei_cpu; irq_desc_t *desc = NULL; - cpumask_t mask; + const struct cpumask *mask; int retval = 0; /* @@ -701,7 +695,7 @@ int migrate_platform_irqs(unsigned int cpu) * Now re-target the CPEI to a different processor */ new_cpei_cpu = any_online_cpu(cpu_online_map); - mask = cpumask_of_cpu(new_cpei_cpu); + mask = cpumask_of(new_cpei_cpu); set_cpei_target_cpu(new_cpei_cpu); desc = irq_desc + ia64_cpe_irq; /* diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index c75b914f2d6..a8d61a3e9a9 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -219,7 +219,7 @@ static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf) cpumask_t shared_cpu_map; cpus_and(shared_cpu_map, this_leaf->shared_cpu_map, cpu_online_map); - len = cpumask_scnprintf(buf, NR_CPUS+1, shared_cpu_map); + len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map); len += sprintf(buf+len, "\n"); return len; } diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index 0c66dbdd1d7..66fd705e82c 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -227,14 +227,14 @@ finish_up: return new_irq_info; } -static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask) +static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct sn_irq_info *sn_irq_info, *sn_irq_info_safe; nasid_t nasid; int slice; - nasid = cpuid_to_nasid(first_cpu(mask)); - slice = cpuid_to_slice(first_cpu(mask)); + nasid = cpuid_to_nasid(cpumask_first(mask)); + slice = cpuid_to_slice(cpumask_first(mask)); list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe, sn_irq_lh[irq], list) diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c index 83f190ffe35..ca553b0429c 100644 --- a/arch/ia64/sn/kernel/msi_sn.c +++ b/arch/ia64/sn/kernel/msi_sn.c @@ -151,7 +151,8 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry) } #ifdef CONFIG_SMP -static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask) +static void sn_set_msi_irq_affinity(unsigned int irq, + const struct cpumask *cpu_mask) { struct msi_msg msg; int slice; @@ -164,7 +165,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask) struct sn_pcibus_provider *provider; unsigned int cpu; - cpu = first_cpu(cpu_mask); + cpu = cpumask_first(cpu_mask); sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) return; @@ -204,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask) msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = cpu_mask; + irq_desc[irq].affinity = *cpu_mask; } #endif /* CONFIG_SMP */ diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index dbaed4a6381..cabba332cc4 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -10,6 +10,7 @@ config M32R default y select HAVE_IDE select HAVE_OPROFILE + select INIT_ALL_POSSIBLE config SBUS bool @@ -273,7 +274,7 @@ config GENERIC_CALIBRATE_DELAY bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index 39cb6da72dc..0f06b3722e9 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -73,17 +73,11 @@ static unsigned int bsp_phys_id = -1; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; -/* Bitmask of currently online CPUs */ -cpumask_t cpu_online_map; -EXPORT_SYMBOL(cpu_online_map); - cpumask_t cpu_bootout_map; cpumask_t cpu_bootin_map; static cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); -cpumask_t cpu_possible_map = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); /* Per CPU bogomips and other parameters */ struct cpuinfo_m32r cpu_data[NR_CPUS] __cacheline_aligned; diff --git a/arch/m68knommu/platform/coldfire/pit.c b/arch/m68knommu/platform/coldfire/pit.c index c5b916700b2..2a12e7fa974 100644 --- a/arch/m68knommu/platform/coldfire/pit.c +++ b/arch/m68knommu/platform/coldfire/pit.c @@ -156,7 +156,7 @@ void hw_timer_init(void) { u32 imr; - cf_pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + cf_pit_clockevent.cpumask = cpumask_of(smp_processor_id()); cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32); cf_pit_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFF, &cf_pit_clockevent); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f4af967a6b3..a5255e7c79e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index a58f0eecc68..abc62aa744a 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -49,7 +49,8 @@ static inline void smtc_im_ack_irq(unsigned int irq) #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF #include <linux/cpumask.h> -extern void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity); +extern void plat_set_irq_affinity(unsigned int irq, + const struct cpumask *affinity); extern void smtc_forward_irq(unsigned int irq); /* diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index 7785bec732f..1fb959f9898 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -37,7 +37,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; /* sched_domains SD_NODE_INIT for SGI IP27 machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index 0ff5b523ea7..86557b5d1b3 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -38,9 +38,6 @@ extern int __cpu_logical_map[NR_CPUS]; #define SMP_RESCHEDULE_YOURSELF 0x1 /* XXX braindead */ #define SMP_CALL_FUNCTION 0x2 -extern cpumask_t phys_cpu_present_map; -#define cpu_possible_map phys_cpu_present_map - extern void asmlinkage smp_bootstrap(void); /* diff --git a/arch/mips/jazz/irq.c b/arch/mips/jazz/irq.c index d7f8a782aae..03965cb1b25 100644 --- a/arch/mips/jazz/irq.c +++ b/arch/mips/jazz/irq.c @@ -146,7 +146,7 @@ void __init plat_time_init(void) BUG_ON(HZ != 100); - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); clockevents_register_device(cd); action->dev_id = cd; setup_irq(JAZZ_TIMER_IRQ, action); diff --git a/arch/mips/kernel/cevt-bcm1480.c b/arch/mips/kernel/cevt-bcm1480.c index 0a57f86945f..b820661678b 100644 --- a/arch/mips/kernel/cevt-bcm1480.c +++ b/arch/mips/kernel/cevt-bcm1480.c @@ -126,7 +126,7 @@ void __cpuinit sb1480_clockevent_init(void) cd->min_delta_ns = clockevent_delta2ns(2, cd); cd->rating = 200; cd->irq = irq; - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); cd->set_next_event = sibyte_next_event; cd->set_mode = sibyte_set_mode; clockevents_register_device(cd); @@ -148,6 +148,6 @@ void __cpuinit sb1480_clockevent_init(void) action->name = name; action->dev_id = cd; - irq_set_affinity(irq, cpumask_of_cpu(cpu)); + irq_set_affinity(irq, cpumask_of(cpu)); setup_irq(irq, action); } diff --git a/arch/mips/kernel/cevt-ds1287.c b/arch/mips/kernel/cevt-ds1287.c index df4acb68bfb..1ada45ea070 100644 --- a/arch/mips/kernel/cevt-ds1287.c +++ b/arch/mips/kernel/cevt-ds1287.c @@ -88,7 +88,6 @@ static void ds1287_event_handler(struct clock_event_device *dev) static struct clock_event_device ds1287_clockevent = { .name = "ds1287", .features = CLOCK_EVT_FEAT_PERIODIC, - .cpumask = CPU_MASK_CPU0, .set_next_event = ds1287_set_next_event, .set_mode = ds1287_set_mode, .event_handler = ds1287_event_handler, @@ -122,6 +121,7 @@ int __init ds1287_clockevent_init(int irq) clockevent_set_clock(cd, 32768); cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd); cd->min_delta_ns = clockevent_delta2ns(0x300, cd); + cd->cpumask = cpumask_of(0); clockevents_register_device(&ds1287_clockevent); diff --git a/arch/mips/kernel/cevt-gt641xx.c b/arch/mips/kernel/cevt-gt641xx.c index 6e2f58520af..e9b787feedc 100644 --- a/arch/mips/kernel/cevt-gt641xx.c +++ b/arch/mips/kernel/cevt-gt641xx.c @@ -96,7 +96,6 @@ static void gt641xx_timer0_event_handler(struct clock_event_device *dev) static struct clock_event_device gt641xx_timer0_clockevent = { .name = "gt641xx-timer0", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .cpumask = CPU_MASK_CPU0, .irq = GT641XX_TIMER0_IRQ, .set_next_event = gt641xx_timer0_set_next_event, .set_mode = gt641xx_timer0_set_mode, @@ -132,6 +131,7 @@ static int __init gt641xx_timer0_clockevent_init(void) clockevent_set_clock(cd, gt641xx_base_clock); cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd); cd->min_delta_ns = clockevent_delta2ns(0x300, cd); + cd->cpumask = cpumask_of(0); clockevents_register_device(>641xx_timer0_clockevent); diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c index 4a4c59f2737..e1ec83b6803 100644 --- a/arch/mips/kernel/cevt-r4k.c +++ b/arch/mips/kernel/cevt-r4k.c @@ -195,7 +195,7 @@ int __cpuinit mips_clockevent_init(void) cd->rating = 300; cd->irq = irq; - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); cd->set_next_event = mips_next_event; cd->set_mode = mips_set_clock_mode; cd->event_handler = mips_event_handler; diff --git a/arch/mips/kernel/cevt-sb1250.c b/arch/mips/kernel/cevt-sb1250.c index 63ac3ad462b..a2eebaafda5 100644 --- a/arch/mips/kernel/cevt-sb1250.c +++ b/arch/mips/kernel/cevt-sb1250.c @@ -125,7 +125,7 @@ void __cpuinit sb1250_clockevent_init(void) cd->min_delta_ns = clockevent_delta2ns(2, cd); cd->rating = 200; cd->irq = irq; - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); cd->set_next_event = sibyte_next_event; cd->set_mode = sibyte_set_mode; clockevents_register_device(cd); @@ -147,6 +147,6 @@ void __cpuinit sb1250_clockevent_init(void) action->name = name; action->dev_id = cd; - irq_set_affinity(irq, cpumask_of_cpu(cpu)); + irq_set_affinity(irq, cpumask_of(cpu)); setup_irq(irq, action); } diff --git a/arch/mips/kernel/cevt-smtc.c b/arch/mips/kernel/cevt-smtc.c index 5162fe4b595..6d45e24db5b 100644 --- a/arch/mips/kernel/cevt-smtc.c +++ b/arch/mips/kernel/cevt-smtc.c @@ -292,7 +292,7 @@ int __cpuinit mips_clockevent_init(void) cd->rating = 300; cd->irq = irq; - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); cd->set_next_event = mips_next_event; cd->set_mode = mips_set_clock_mode; cd->event_handler = mips_event_handler; diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c index b5fc4eb412d..eccf7d6096b 100644 --- a/arch/mips/kernel/cevt-txx9.c +++ b/arch/mips/kernel/cevt-txx9.c @@ -112,7 +112,6 @@ static struct clock_event_device txx9tmr_clock_event_device = { .name = "TXx9", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .rating = 200, - .cpumask = CPU_MASK_CPU0, .set_mode = txx9tmr_set_mode, .set_next_event = txx9tmr_set_next_event, }; @@ -150,6 +149,7 @@ void __init txx9_clockevent_init(unsigned long baseaddr, int irq, clockevent_delta2ns(0xffffffff >> (32 - TXX9_TIMER_BITS), cd); cd->min_delta_ns = clockevent_delta2ns(0xf, cd); cd->irq = irq; + cd->cpumask = cpumask_of(0), clockevents_register_device(cd); setup_irq(irq, &txx9tmr_irq); printk(KERN_INFO "TXx9: clockevent device at 0x%lx, irq %d\n", diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index b6ac55162b9..f4d187825f9 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -115,7 +115,7 @@ void __init setup_pit_timer(void) * Start pit with the boot cpu mask and make it global after the * IO_APIC has been initialized. */ - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); clockevent_set_clock(cd, CLOCK_TICK_RATE); cd->max_delta_ns = clockevent_delta2ns(0x7FFF, cd); cd->min_delta_ns = clockevent_delta2ns(0xF, cd); diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c index f0a4bb19e09..494a49a317e 100644 --- a/arch/mips/kernel/irq-gic.c +++ b/arch/mips/kernel/irq-gic.c @@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq) static DEFINE_SPINLOCK(gic_lock); -static void gic_set_affinity(unsigned int irq, cpumask_t cpumask) +static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { cpumask_t tmp = CPU_MASK_NONE; unsigned long flags; @@ -164,7 +164,7 @@ static void gic_set_affinity(unsigned int irq, cpumask_t cpumask) pr_debug(KERN_DEBUG "%s called\n", __func__); irq -= _irqbase; - cpus_and(tmp, cpumask, cpu_online_map); + cpumask_and(&tmp, cpumask, cpu_online_mask); if (cpus_empty(tmp)) return; @@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned int irq, cpumask_t cpumask) set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask); } - irq_desc[irq].affinity = cpumask; + irq_desc[irq].affinity = *cpumask; spin_unlock_irqrestore(&gic_lock, flags); } diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c index ca476c4f62a..f27beca4b26 100644 --- a/arch/mips/kernel/smp-cmp.c +++ b/arch/mips/kernel/smp-cmp.c @@ -51,10 +51,10 @@ static int __init allowcpus(char *str) int len; cpus_clear(cpu_allow_map); - if (cpulist_parse(str, cpu_allow_map) == 0) { + if (cpulist_parse(str, &cpu_allow_map) == 0) { cpu_set(0, cpu_allow_map); cpus_and(cpu_possible_map, cpu_possible_map, cpu_allow_map); - len = cpulist_scnprintf(buf, sizeof(buf)-1, cpu_possible_map); + len = cpulist_scnprintf(buf, sizeof(buf)-1, &cpu_possible_map); buf[len] = '\0'; pr_debug("Allowable CPUs: %s\n", buf); return 1; @@ -226,7 +226,7 @@ void __init cmp_smp_setup(void) for (i = 1; i < NR_CPUS; i++) { if (amon_cpu_avail(i)) { - cpu_set(i, phys_cpu_present_map); + cpu_set(i, cpu_possible_map); __cpu_number_map[i] = ++ncpu; __cpu_logical_map[ncpu] = i; } diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c index 87a1816c1f4..6f7ee5ac46e 100644 --- a/arch/mips/kernel/smp-mt.c +++ b/arch/mips/kernel/smp-mt.c @@ -70,7 +70,7 @@ static unsigned int __init smvp_vpe_init(unsigned int tc, unsigned int mvpconf0, write_vpe_c0_vpeconf0(tmp); /* Record this as available CPU */ - cpu_set(tc, phys_cpu_present_map); + cpu_set(tc, cpu_possible_map); __cpu_number_map[tc] = ++ncpu; __cpu_logical_map[ncpu] = tc; } diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 8bf88faf5af..3da94704f81 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -44,15 +44,10 @@ #include <asm/mipsmtregs.h> #endif /* CONFIG_MIPS_MT_SMTC */ -cpumask_t phys_cpu_present_map; /* Bitmask of available CPUs */ volatile cpumask_t cpu_callin_map; /* Bitmask of started secondaries */ -cpumask_t cpu_online_map; /* Bitmask of currently online CPUs */ int __cpu_number_map[NR_CPUS]; /* Map physical to logical */ int __cpu_logical_map[NR_CPUS]; /* Map logical to physical */ -EXPORT_SYMBOL(phys_cpu_present_map); -EXPORT_SYMBOL(cpu_online_map); - extern void cpu_idle(void); /* Number of TCs (or siblings in Intel speak) per CPU core */ @@ -195,7 +190,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) /* preload SMP state for boot cpu */ void __devinit smp_prepare_boot_cpu(void) { - cpu_set(0, phys_cpu_present_map); + cpu_set(0, cpu_possible_map); cpu_set(0, cpu_online_map); cpu_set(0, cpu_callin_map); } diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 897fb2b4751..b6cca01ff82 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -290,7 +290,7 @@ static void smtc_configure_tlb(void) * possibly leave some TCs/VPEs as "slave" processors. * * Use c0_MVPConf0 to find out how many TCs are available, setting up - * phys_cpu_present_map and the logical/physical mappings. + * cpu_possible_map and the logical/physical mappings. */ int __init smtc_build_cpu_map(int start_cpu_slot) @@ -304,7 +304,7 @@ int __init smtc_build_cpu_map(int start_cpu_slot) */ ntcs = ((read_c0_mvpconf0() & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1; for (i=start_cpu_slot; i<NR_CPUS && i<ntcs; i++) { - cpu_set(i, phys_cpu_present_map); + cpu_set(i, cpu_possible_map); __cpu_number_map[i] = i; __cpu_logical_map[i] = i; } @@ -521,7 +521,7 @@ void smtc_prepare_cpus(int cpus) * Pull any physically present but unused TCs out of circulation. */ while (tc < (((val & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1)) { - cpu_clear(tc, phys_cpu_present_map); + cpu_clear(tc, cpu_possible_map); cpu_clear(tc, cpu_present_map); tc++; } diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c index f84a46a8ae6..aabd7274507 100644 --- a/arch/mips/mti-malta/malta-smtc.c +++ b/arch/mips/mti-malta/malta-smtc.c @@ -114,9 +114,9 @@ struct plat_smp_ops msmtc_smp_ops = { */ -void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity) +void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) { - cpumask_t tmask = affinity; + cpumask_t tmask = *affinity; int cpu = 0; void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff); @@ -139,7 +139,7 @@ void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity) * be made to forward to an offline "CPU". */ - for_each_cpu_mask(cpu, affinity) { + for_each_cpu(cpu, affinity) { if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu)) cpu_clear(cpu, tmask); } diff --git a/arch/mips/nxp/pnx8550/common/time.c b/arch/mips/nxp/pnx8550/common/time.c index 62f495b57f9..cf293b27909 100644 --- a/arch/mips/nxp/pnx8550/common/time.c +++ b/arch/mips/nxp/pnx8550/common/time.c @@ -102,6 +102,7 @@ __init void plat_time_init(void) unsigned int p; unsigned int pow2p; + pnx8xxx_clockevent.cpumask = cpu_none_mask; clockevents_register_device(&pnx8xxx_clockevent); clocksource_register(&pnx_clocksource); diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index 3a7df647ca7..f78c29b68d7 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -141,7 +141,7 @@ static void __cpuinit yos_boot_secondary(int cpu, struct task_struct *idle) } /* - * Detect available CPUs, populate phys_cpu_present_map before smp_init + * Detect available CPUs, populate cpu_possible_map before smp_init * * We don't want to start the secondary CPU yet nor do we have a nice probing * feature in PMON so we just assume presence of the secondary core. @@ -150,10 +150,10 @@ static void __init yos_smp_setup(void) { int i; - cpus_clear(phys_cpu_present_map); + cpus_clear(cpu_possible_map); for (i = 0; i < 2; i++) { - cpu_set(i, phys_cpu_present_map); + cpu_set(i, cpu_possible_map); __cpu_number_map[i] = i; __cpu_logical_map[i] = i; } diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c index ba5cdebeaf0..5b47d6b6527 100644 --- a/arch/mips/sgi-ip27/ip27-smp.c +++ b/arch/mips/sgi-ip27/ip27-smp.c @@ -76,7 +76,7 @@ static int do_cpumask(cnodeid_t cnode, nasid_t nasid, int highest) /* Only let it join in if it's marked enabled */ if ((acpu->cpu_info.flags & KLINFO_ENABLE) && (tot_cpus_found != NR_CPUS)) { - cpu_set(cpuid, phys_cpu_present_map); + cpu_set(cpuid, cpu_possible_map); alloc_cpupda(cpuid, tot_cpus_found); cpus_found++; tot_cpus_found++; diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c index 1327c2746fb..f024057a35f 100644 --- a/arch/mips/sgi-ip27/ip27-timer.c +++ b/arch/mips/sgi-ip27/ip27-timer.c @@ -134,7 +134,7 @@ void __cpuinit hub_rt_clock_event_init(void) cd->min_delta_ns = clockevent_delta2ns(0x300, cd); cd->rating = 200; cd->irq = irq; - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); cd->set_next_event = rt_next_event; cd->set_mode = rt_set_mode; clockevents_register_device(cd); diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c index a35818ed426..12b465d404d 100644 --- a/arch/mips/sibyte/bcm1480/irq.c +++ b/arch/mips/sibyte/bcm1480/irq.c @@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq); static void disable_bcm1480_irq(unsigned int irq); static void ack_bcm1480_irq(unsigned int irq); #ifdef CONFIG_SMP -static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask); +static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask); #endif #ifdef CONFIG_PCI @@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq) } #ifdef CONFIG_SMP -static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask) +static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) { int i = 0, old_cpu, cpu, int_on, k; u64 cur_ints; @@ -117,11 +117,11 @@ static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask) unsigned long flags; unsigned int irq_dirty; - if (cpus_weight(mask) != 1) { + if (cpumask_weight(mask) != 1) { printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); return; } - i = first_cpu(mask); + i = cpumask_first(mask); /* Convert logical CPU to physical CPU */ cpu = cpu_logical_map(i); diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index bd9eeb43ed0..dddfda8e829 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -136,7 +136,7 @@ static void __cpuinit bcm1480_boot_secondary(int cpu, struct task_struct *idle) /* * Use CFE to find out how many CPUs are available, setting up - * phys_cpu_present_map and the logical/physical mappings. + * cpu_possible_map and the logical/physical mappings. * XXXKW will the boot CPU ever not be physical 0? * * Common setup before any secondaries are started @@ -145,14 +145,14 @@ static void __init bcm1480_smp_setup(void) { int i, num; - cpus_clear(phys_cpu_present_map); - cpu_set(0, phys_cpu_present_map); + cpus_clear(cpu_possible_map); + cpu_set(0, cpu_possible_map); __cpu_number_map[0] = 0; __cpu_logical_map[0] = 0; for (i = 1, num = 0; i < NR_CPUS; i++) { if (cfe_cpu_stop(i) == 0) { - cpu_set(i, phys_cpu_present_map); + cpu_set(i, cpu_possible_map); __cpu_number_map[i] = ++num; __cpu_logical_map[num] = i; } diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c index a5158483986..808ac2959b8 100644 --- a/arch/mips/sibyte/sb1250/irq.c +++ b/arch/mips/sibyte/sb1250/irq.c @@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq); static void disable_sb1250_irq(unsigned int irq); static void ack_sb1250_irq(unsigned int irq); #ifdef CONFIG_SMP -static void sb1250_set_affinity(unsigned int irq, cpumask_t mask); +static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask); #endif #ifdef CONFIG_SIBYTE_HAS_LDT @@ -103,16 +103,16 @@ void sb1250_unmask_irq(int cpu, int irq) } #ifdef CONFIG_SMP -static void sb1250_set_affinity(unsigned int irq, cpumask_t mask) +static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) { int i = 0, old_cpu, cpu, int_on; u64 cur_ints; struct irq_desc *desc = irq_desc + irq; unsigned long flags; - i = first_cpu(mask); + i = cpumask_first(mask); - if (cpus_weight(mask) > 1) { + if (cpumask_weight(mask) > 1) { printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); return; } diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index 0734b933e96..5950a288a7d 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c @@ -124,7 +124,7 @@ static void __cpuinit sb1250_boot_secondary(int cpu, struct task_struct *idle) /* * Use CFE to find out how many CPUs are available, setting up - * phys_cpu_present_map and the logical/physical mappings. + * cpu_possible_map and the logical/physical mappings. * XXXKW will the boot CPU ever not be physical 0? * * Common setup before any secondaries are started @@ -133,14 +133,14 @@ static void __init sb1250_smp_setup(void) { int i, num; - cpus_clear(phys_cpu_present_map); - cpu_set(0, phys_cpu_present_map); + cpus_clear(cpu_possible_map); + cpu_set(0, cpu_possible_map); __cpu_number_map[0] = 0; __cpu_logical_map[0] = 0; for (i = 1, num = 0; i < NR_CPUS; i++) { if (cfe_cpu_stop(i) == 0) { - cpu_set(i, phys_cpu_present_map); + cpu_set(i, cpu_possible_map); __cpu_number_map[i] = ++num; __cpu_logical_map[num] = i; } diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index 796e3ce2872..69f5f88711c 100644 --- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -80,7 +80,7 @@ static void __init sni_a20r_timer_setup(void) struct irqaction *action = &a20r_irqaction; unsigned int cpu = smp_processor_id(); - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); clockevents_register_device(cd); action->dev_id = cd; setup_irq(SNI_A20R_IRQ_TIMER, &a20r_irqaction); diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 644a70b1b04..aacf11d3372 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -11,6 +11,7 @@ config PARISC select HAVE_OPROFILE select RTC_CLASS select RTC_DRV_PARISC + select INIT_ALL_POSSIBLE help The PA-RISC microprocessor is designed by Hewlett-Packard and used in many of their workstations & servers (HP9000 700 and 800 series, diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 23ef950df00..4cea935e2f9 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -131,12 +131,12 @@ int cpu_check_affinity(unsigned int irq, cpumask_t *dest) return 0; } -static void cpu_set_affinity_irq(unsigned int irq, cpumask_t dest) +static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest) { - if (cpu_check_affinity(irq, &dest)) + if (cpu_check_affinity(irq, dest)) return; - irq_desc[irq].affinity = dest; + irq_desc[irq].affinity = *dest; } #endif diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index d47f3975c9c..80bc000523f 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -67,21 +67,6 @@ static volatile int cpu_now_booting __read_mostly = 0; /* track which CPU is boo static int parisc_max_cpus __read_mostly = 1; -/* online cpus are ones that we've managed to bring up completely - * possible cpus are all valid cpu - * present cpus are all detected cpu - * - * On startup we bring up the "possible" cpus. Since we discover - * CPUs later, we add them as hotplug, so the possible cpu mask is - * empty in the beginning. - */ - -cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE; /* Bitmap of online CPUs */ -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; /* Bitmap of Present CPUs */ - -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(cpu_possible_map); - DEFINE_PER_CPU(spinlock_t, ipi_lock) = SPIN_LOCK_UNLOCKED; enum ipi_message_type { diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 525c13a4de9..adb23ea1c1e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -141,7 +141,7 @@ config GENERIC_NVRAM bool default y if PPC32 -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index b298f7a631e..e5f2ae8362f 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -7,7 +7,19 @@ #ifndef __ASSEMBLY__ extern void _mcount(void); -#endif + +#ifdef CONFIG_DYNAMIC_FTRACE +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* reloction of mcount call site is the same as the address */ + return addr; +} + +struct dyn_arch_ftrace { + struct module *mod; +}; +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index e5f14b13ccf..08454880a2c 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -34,11 +34,19 @@ struct mod_arch_specific { #ifdef __powerpc64__ unsigned int stubs_section; /* Index of stubs section in module */ unsigned int toc_section; /* What section is the TOC? */ -#else +#ifdef CONFIG_DYNAMIC_FTRACE + unsigned long toc; + unsigned long tramp; +#endif + +#else /* powerpc64 */ /* Indices of PLT sections within module. */ unsigned int core_plt_section; unsigned int init_plt_section; +#ifdef CONFIG_DYNAMIC_FTRACE + unsigned long tramp; #endif +#endif /* powerpc64 */ /* List of BUG addresses, source line numbers and filenames */ struct list_head bug_list; @@ -68,6 +76,12 @@ struct mod_arch_specific { # endif /* MODULE */ #endif +#ifdef CONFIG_DYNAMIC_FTRACE +# ifdef MODULE + asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous"); +# endif /* MODULE */ +#endif + struct exception_table_entry; void sort_ex_table(struct exception_table_entry *start, diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index c32da6f9799..373fca394a5 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -48,7 +48,6 @@ static inline int pcibus_to_node(struct pci_bus *bus) /* sched_domains SD_NODE_INIT for PPC64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 92673b43858..d17edb4a2f9 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -17,6 +17,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog +CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog ifdef CONFIG_DYNAMIC_FTRACE # dynamic ftrace setup. diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 7ecc0d1855c..6f7eb7e00c7 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1162,39 +1162,17 @@ machine_check_in_rtas: #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) - stwu r1,-48(r1) - stw r3, 12(r1) - stw r4, 16(r1) - stw r5, 20(r1) - stw r6, 24(r1) - mflr r3 - stw r7, 28(r1) - mfcr r5 - stw r8, 32(r1) - stw r9, 36(r1) - stw r10,40(r1) - stw r3, 44(r1) - stw r5, 8(r1) - subi r3, r3, MCOUNT_INSN_SIZE - .globl mcount_call -mcount_call: - bl ftrace_stub - nop - lwz r6, 8(r1) - lwz r0, 44(r1) - lwz r3, 12(r1) + /* + * It is required that _mcount on PPC32 must preserve the + * link register. But we have r0 to play with. We use r0 + * to push the return address back to the caller of mcount + * into the ctr register, restore the link register and + * then jump back using the ctr register. + */ + mflr r0 mtctr r0 - lwz r4, 16(r1) - mtcr r6 - lwz r5, 20(r1) - lwz r6, 24(r1) - lwz r0, 52(r1) - lwz r7, 28(r1) - lwz r8, 32(r1) + lwz r0, 4(r1) mtlr r0 - lwz r9, 36(r1) - lwz r10,40(r1) - addi r1, r1, 48 bctr _GLOBAL(ftrace_caller) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e0bcf935428..383ed6eb008 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -894,18 +894,6 @@ _GLOBAL(enter_prom) #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) - /* Taken from output of objdump from lib64/glibc */ - mflr r3 - stdu r1, -112(r1) - std r3, 128(r1) - subi r3, r3, MCOUNT_INSN_SIZE - .globl mcount_call -mcount_call: - bl ftrace_stub - nop - ld r0, 128(r1) - mtlr r0 - addi r1, r1, 112 blr _GLOBAL(ftrace_caller) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index f4b006ed0ab..5355244c99f 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -9,22 +9,30 @@ #include <linux/spinlock.h> #include <linux/hardirq.h> +#include <linux/uaccess.h> +#include <linux/module.h> #include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/list.h> #include <asm/cacheflush.h> +#include <asm/code-patching.h> #include <asm/ftrace.h> +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) do { } while (0) +#endif -static unsigned int ftrace_nop = 0x60000000; +static unsigned int ftrace_nop = PPC_NOP_INSTR; #ifdef CONFIG_PPC32 # define GET_ADDR(addr) addr #else /* PowerPC64's functions are data that points to the functions */ -# define GET_ADDR(addr) *(unsigned long *)addr +# define GET_ADDR(addr) (*(unsigned long *)addr) #endif @@ -33,12 +41,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr) return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) +static unsigned char *ftrace_nop_replace(void) { return (char *)&ftrace_nop; } -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static unsigned int op; @@ -68,49 +76,422 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) # define _ASM_PTR " .long " #endif -int +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned replaced; - unsigned old = *(unsigned *)old_code; - unsigned new = *(unsigned *)new_code; - int faulted = 0; + unsigned char replaced[MCOUNT_INSN_SIZE]; /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting - * as well as code changing. + * as well as code changing. We do this by using the + * probe_kernel_* functions. * * No real locking needed, this code is run through - * kstop_machine. + * kstop_machine, or before SMP starts. */ - asm volatile ( - "1: lwz %1, 0(%2)\n" - " cmpw %1, %5\n" - " bne 2f\n" - " stwu %3, 0(%2)\n" - "2:\n" - ".section .fixup, \"ax\"\n" - "3: li %0, 1\n" - " b 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - _ASM_ALIGN "\n" - _ASM_PTR "1b, 3b\n" - ".previous" - : "=r"(faulted), "=r"(replaced) - : "r"(ip), "r"(new), - "0"(faulted), "r"(old) - : "memory"); - - if (replaced != old && replaced != new) - faulted = 2; - - if (!faulted) - flush_icache_range(ip, ip + 8); - - return faulted; + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + /* replace the text with the new text */ + if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + + flush_icache_range(ip, ip + 8); + + return 0; +} + +/* + * Helper functions that are the same for both PPC64 and PPC32. + */ +static int test_24bit_addr(unsigned long ip, unsigned long addr) +{ + + /* use the create_branch to verify that this offset can be branched */ + return create_branch((unsigned int *)ip, addr, 0); +} + +static int is_bl_op(unsigned int op) +{ + return (op & 0xfc000003) == 0x48000001; +} + +static unsigned long find_bl_target(unsigned long ip, unsigned int op) +{ + static int offset; + + offset = (op & 0x03fffffc); + /* make it signed */ + if (offset & 0x02000000) + offset |= 0xfe000000; + + return ip + (long)offset; +} + +#ifdef CONFIG_PPC64 +static int +__ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + unsigned int jmp[5]; + unsigned long ptr; + unsigned long ip = rec->ip; + unsigned long tramp; + int offset; + + /* read where this goes */ + if (probe_kernel_read(&op, (void *)ip, sizeof(int))) + return -EFAULT; + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(op)) { + printk(KERN_ERR "Not expected bl: opcode is %x\n", op); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, op); + + /* + * On PPC64 the trampoline looks like: + * 0x3d, 0x82, 0x00, 0x00, addis r12,r2, <high> + * 0x39, 0x8c, 0x00, 0x00, addi r12,r12, <low> + * Where the bytes 2,3,6 and 7 make up the 32bit offset + * to the TOC that holds the pointer. + * to jump to. + * 0xf8, 0x41, 0x00, 0x28, std r2,40(r1) + * 0xe9, 0x6c, 0x00, 0x20, ld r11,32(r12) + * The actually address is 32 bytes from the offset + * into the TOC. + * 0xe8, 0x4c, 0x00, 0x28, ld r2,40(r12) + */ + + DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc); + + /* Find where the trampoline jumps to */ + if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { + printk(KERN_ERR "Failed to read %lx\n", tramp); + return -EFAULT; + } + + DEBUGP(" %08x %08x", jmp[0], jmp[1]); + + /* verify that this is what we expect it to be */ + if (((jmp[0] & 0xffff0000) != 0x3d820000) || + ((jmp[1] & 0xffff0000) != 0x398c0000) || + (jmp[2] != 0xf8410028) || + (jmp[3] != 0xe96c0020) || + (jmp[4] != 0xe84c0028)) { + printk(KERN_ERR "Not a trampoline\n"); + return -EINVAL; + } + + offset = (unsigned)((unsigned short)jmp[0]) << 16 | + (unsigned)((unsigned short)jmp[1]); + + DEBUGP(" %x ", offset); + + /* get the address this jumps too */ + tramp = mod->arch.toc + offset + 32; + DEBUGP("toc: %lx", tramp); + + if (probe_kernel_read(jmp, (void *)tramp, 8)) { + printk(KERN_ERR "Failed to read %lx\n", tramp); + return -EFAULT; + } + + DEBUGP(" %08x %08x\n", jmp[0], jmp[1]); + + ptr = ((unsigned long)jmp[0] << 32) + jmp[1]; + + /* This should match what was called */ + if (ptr != GET_ADDR(addr)) { + printk(KERN_ERR "addr does not match %lx\n", ptr); + return -EINVAL; + } + + /* + * We want to nop the line, but the next line is + * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1) + * This needs to be turned to a nop too. + */ + if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (op != 0xe8410028) { + printk(KERN_ERR "Next line is not ld! (%08x)\n", op); + return -EINVAL; + } + + /* + * Milton Miller pointed out that we can not blindly do nops. + * If a task was preempted when calling a trace function, + * the nops will remove the way to restore the TOC in r2 + * and the r2 TOC will get corrupted. + */ + + /* + * Replace: + * bl <tramp> <==== will be replaced with "b 1f" + * ld r2,40(r1) + * 1: + */ + op = 0x48000008; /* b +8 */ + + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) + return -EPERM; + + + flush_icache_range(ip, ip + 8); + + return 0; +} + +#else /* !PPC64 */ +static int +__ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + unsigned int jmp[4]; + unsigned long ip = rec->ip; + unsigned long tramp; + + if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(op)) { + printk(KERN_ERR "Not expected bl: opcode is %x\n", op); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, op); + + /* + * On PPC32 the trampoline looks like: + * 0x3d, 0x60, 0x00, 0x00 lis r11,sym@ha + * 0x39, 0x6b, 0x00, 0x00 addi r11,r11,sym@l + * 0x7d, 0x69, 0x03, 0xa6 mtctr r11 + * 0x4e, 0x80, 0x04, 0x20 bctr + */ + + DEBUGP("ip:%lx jumps to %lx", ip, tramp); + + /* Find where the trampoline jumps to */ + if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { + printk(KERN_ERR "Failed to read %lx\n", tramp); + return -EFAULT; + } + + DEBUGP(" %08x %08x ", jmp[0], jmp[1]); + + /* verify that this is what we expect it to be */ + if (((jmp[0] & 0xffff0000) != 0x3d600000) || + ((jmp[1] & 0xffff0000) != 0x396b0000) || + (jmp[2] != 0x7d6903a6) || + (jmp[3] != 0x4e800420)) { + printk(KERN_ERR "Not a trampoline\n"); + return -EINVAL; + } + + tramp = (jmp[1] & 0xffff) | + ((jmp[0] & 0xffff) << 16); + if (tramp & 0x8000) + tramp -= 0x10000; + + DEBUGP(" %x ", tramp); + + if (tramp != addr) { + printk(KERN_ERR + "Trampoline location %08lx does not match addr\n", + tramp); + return -EINVAL; + } + + op = PPC_NOP_INSTR; + + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) + return -EPERM; + + flush_icache_range(ip, ip + 8); + + return 0; +} +#endif /* PPC64 */ + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *old, *new; + unsigned long ip = rec->ip; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + return ftrace_modify_code(ip, old, new); + } + + /* + * Out of range jumps are called from modules. + * We should either already have a pointer to the module + * or it has been passed in. + */ + if (!rec->arch.mod) { + if (!mod) { + printk(KERN_ERR "No module loaded addr=%lx\n", + addr); + return -EFAULT; + } + rec->arch.mod = mod; + } else if (mod) { + if (mod != rec->arch.mod) { + printk(KERN_ERR + "Record mod %p not equal to passed in mod %p\n", + rec->arch.mod, mod); + return -EINVAL; + } + /* nothing to do if mod == rec->arch.mod */ + } else + mod = rec->arch.mod; + + return __ftrace_make_nop(mod, rec, addr); + +} + +#ifdef CONFIG_PPC64 +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op[2]; + unsigned long ip = rec->ip; + + /* read where this goes */ + if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2)) + return -EFAULT; + + /* + * It should be pointing to two nops or + * b +8; ld r2,40(r1) + */ + if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) && + ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) { + printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]); + return -EINVAL; + } + + /* If we never set up a trampoline to ftrace_caller, then bail */ + if (!rec->arch.mod->arch.tramp) { + printk(KERN_ERR "No ftrace trampoline\n"); + return -EINVAL; + } + + /* create the branch to the trampoline */ + op[0] = create_branch((unsigned int *)ip, + rec->arch.mod->arch.tramp, BRANCH_SET_LINK); + if (!op[0]) { + printk(KERN_ERR "REL24 out of range!\n"); + return -EINVAL; + } + + /* ld r2,40(r1) */ + op[1] = 0xe8410028; + + DEBUGP("write to %lx\n", rec->ip); + + if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2)) + return -EPERM; + + flush_icache_range(ip, ip + 8); + + return 0; +} +#else +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + unsigned long ip = rec->ip; + + /* read where this goes */ + if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* It should be pointing to a nop */ + if (op != PPC_NOP_INSTR) { + printk(KERN_ERR "Expected NOP but have %x\n", op); + return -EINVAL; + } + + /* If we never set up a trampoline to ftrace_caller, then bail */ + if (!rec->arch.mod->arch.tramp) { + printk(KERN_ERR "No ftrace trampoline\n"); + return -EINVAL; + } + + /* create the branch to the trampoline */ + op = create_branch((unsigned int *)ip, + rec->arch.mod->arch.tramp, BRANCH_SET_LINK); + if (!op) { + printk(KERN_ERR "REL24 out of range!\n"); + return -EINVAL; + } + + DEBUGP("write to %lx\n", rec->ip); + + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) + return -EPERM; + + flush_icache_range(ip, ip + 8); + + return 0; +} +#endif /* CONFIG_PPC64 */ + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *old, *new; + unsigned long ip = rec->ip; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + return ftrace_modify_code(ip, old, new); + } + + /* + * Out of range jumps are called from modules. + * Being that we are converting from nop, it had better + * already have a module defined. + */ + if (!rec->arch.mod) { + printk(KERN_ERR "No module loaded\n"); + return -EINVAL; + } + + return __ftrace_make_call(rec, addr); } int ftrace_update_ftrace_func(ftrace_func_t func) @@ -128,10 +509,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - /* This is running in kstop_machine */ + /* caller expects data to be zero */ + unsigned long *p = data; - ftrace_mcount_set(data); + *p = 0; return 0; } - diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 31982d05d81..88d9c1d5e5f 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -69,10 +69,15 @@ void cpu_idle(void) smp_mb(); local_irq_disable(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); + /* check again after disabling irqs */ if (!need_resched() && !cpu_should_die()) ppc_md.power_save(); + start_critical_timings(); + local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ac222d0ab12..23b8b5e36f9 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -237,7 +237,7 @@ void fixup_irqs(cpumask_t map) mask = map; } if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); + irq_desc[irq].chip->set_affinity(irq, &mask); else if (irq_desc[irq].action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 2df91a03462..f832773fc28 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/ftrace.h> #include <linux/cache.h> #include <linux/bug.h> #include <linux/sort.h> @@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num) r_addend = rela[i].r_addend; } +#ifdef CONFIG_DYNAMIC_FTRACE + _count_relocs++; /* add one for ftrace_caller */ +#endif return _count_relocs; } @@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, return -ENOEXEC; } } +#ifdef CONFIG_DYNAMIC_FTRACE + module->arch.tramp = + do_plt_call(module->module_core, + (unsigned long)ftrace_caller, + sechdrs, module); +#endif return 0; } diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 1af2377e499..8992b031a7b 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -20,6 +20,7 @@ #include <linux/moduleloader.h> #include <linux/err.h> #include <linux/vmalloc.h> +#include <linux/ftrace.h> #include <linux/bug.h> #include <asm/module.h> #include <asm/firmware.h> @@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, } } +#ifdef CONFIG_DYNAMIC_FTRACE + /* make the trampoline to the ftrace_caller */ + relocs++; +#endif + DEBUGP("Looks like a total of %lu stubs, max\n", relocs); return relocs * sizeof(struct ppc64_stub_entry); } @@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, } } +#ifdef CONFIG_DYNAMIC_FTRACE + me->arch.toc = my_r2(sechdrs, me); + me->arch.tramp = stub_for_addr(sechdrs, + (unsigned long)ftrace_caller, + me); +#endif + return 0; } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ff9f7010097..d1165566f06 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -60,13 +60,9 @@ int smp_hw_index[NR_CPUS]; struct thread_info *secondary_ti; -cpumask_t cpu_possible_map = CPU_MASK_NONE; -cpumask_t cpu_online_map = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_core_map) = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(cpu_possible_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e2ee66b5831..6f39d35d6f5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -869,7 +869,7 @@ static void register_decrementer_clockevent(int cpu) struct clock_event_device *dec = &per_cpu(decrementers, cpu).event; *dec = decrementer_clockevent; - dec->cpumask = cpumask_of_cpu(cpu); + dec->cpumask = cpumask_of(cpu); printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n", dec->name, dec->mult, dec->shift, cpu); diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index d69912c07ce..8db35278a4b 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -6,6 +6,9 @@ ifeq ($(CONFIG_PPC64),y) EXTRA_CFLAGS += -mno-minimal-toc endif +CFLAGS_REMOVE_code-patching.o = -pg +CFLAGS_REMOVE_feature-fixups.o = -pg + obj-y := string.o alloc.o \ checksum_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index e1904774a70..424b335a71c 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -332,7 +332,7 @@ static void xics_eoi_lpar(unsigned int virq) lpar_xirr_info_set((0xff << 24) | irq); } -static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) +static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) { unsigned int irq; int status; @@ -845,7 +845,7 @@ void xics_migrate_irqs_away(void) /* Reset affinity to all cpus */ irq_desc[virq].affinity = CPU_MASK_ALL; - desc->chip->set_affinity(virq, CPU_MASK_ALL); + desc->chip->set_affinity(virq, cpu_all_mask); unlock: spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 1890fb085cd..5d7f9f0c93c 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -817,7 +817,7 @@ static void mpic_end_ipi(unsigned int irq) #endif /* CONFIG_SMP */ -void mpic_set_affinity(unsigned int irq, cpumask_t cpumask) +void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct mpic *mpic = mpic_from_irq(irq); unsigned int src = mpic_irq_to_hw(irq); @@ -829,7 +829,7 @@ void mpic_set_affinity(unsigned int irq, cpumask_t cpumask) } else { cpumask_t tmp; - cpus_and(tmp, cpumask, cpu_online_map); + cpumask_and(&tmp, cpumask, cpu_online_mask); mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), mpic_physmask(cpus_addr(tmp)[0])); diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h index 6209c62a426..3cef2af10f4 100644 --- a/arch/powerpc/sysdev/mpic.h +++ b/arch/powerpc/sysdev/mpic.h @@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic) extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type); extern void mpic_set_vector(unsigned int virq, unsigned int vector); -extern void mpic_set_affinity(unsigned int irq, cpumask_t cpumask); +extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask); #endif /* _POWERPC_SYSDEV_MPIC_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8116a3328a1..b4aa5869c7f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,6 +75,7 @@ config S390 select HAVE_KRETPROBES select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK + select INIT_ALL_POSSIBLE source "init/Kconfig" diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index b5595688a47..f03914b8ed2 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -52,12 +52,6 @@ struct _lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); -cpumask_t cpu_online_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_online_map); - -cpumask_t cpu_possible_map = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); - static struct task_struct *current_set[NR_CPUS]; static u8 smp_cpu_type; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index eccefbbff88..f5bd141c844 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -154,7 +154,7 @@ void init_cpu_timer(void) cd->min_delta_ns = 1; cd->max_delta_ns = LONG_MAX; cd->rating = 400; - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); cd->set_next_event = s390_next_event; cd->set_mode = s390_set_mode; diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index a947899dcba..bf96f1b5c6e 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -212,7 +212,7 @@ static void update_cpu_core_map(void) cpu_core_map[cpu] = cpu_coregroup_map(cpu); } -void arch_update_cpu_topology(void) +int arch_update_cpu_topology(void) { struct tl_info *info = tl_info; struct sys_device *sysdev; @@ -221,7 +221,7 @@ void arch_update_cpu_topology(void) if (!machine_has_topology) { update_cpu_core_map(); topology_update_polarization_simple(); - return; + return 0; } stsi(info, 15, 1, 2); tl_to_cores(info); @@ -230,6 +230,7 @@ void arch_update_cpu_topology(void) sysdev = get_cpu_sysdev(cpu); kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); } + return 1; } static void topology_work_fn(struct work_struct *work) diff --git a/arch/sh/include/asm/smp.h b/arch/sh/include/asm/smp.h index 85b660c17eb..c24e9c6a173 100644 --- a/arch/sh/include/asm/smp.h +++ b/arch/sh/include/asm/smp.h @@ -31,7 +31,7 @@ enum { }; void smp_message_recv(unsigned int msg); -void smp_timer_broadcast(cpumask_t mask); +void smp_timer_broadcast(const struct cpumask *mask); void local_timer_interrupt(void); void local_timer_setup(unsigned int cpu); diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index 95f0085e098..279d9cc4a00 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -5,7 +5,6 @@ /* sched_domains SD_NODE_INIT for sh machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 3c5ad1660bb..8f402741261 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -31,12 +31,6 @@ int __cpu_number_map[NR_CPUS]; /* Map physical to logical */ int __cpu_logical_map[NR_CPUS]; /* Map logical to physical */ -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); - -cpumask_t cpu_online_map; -EXPORT_SYMBOL(cpu_online_map); - static inline void __init smp_store_cpu_info(unsigned int cpu) { struct sh_cpuinfo *c = cpu_data + cpu; @@ -190,11 +184,11 @@ void arch_send_call_function_single_ipi(int cpu) plat_send_ipi(cpu, SMP_MSG_FUNCTION_SINGLE); } -void smp_timer_broadcast(cpumask_t mask) +void smp_timer_broadcast(const struct cpumask *mask) { int cpu; - for_each_cpu_mask(cpu, mask) + for_each_cpu(cpu, mask) plat_send_ipi(cpu, SMP_MSG_TIMER); } diff --git a/arch/sh/kernel/timers/timer-broadcast.c b/arch/sh/kernel/timers/timer-broadcast.c index c2317635230..96e8eaea1e6 100644 --- a/arch/sh/kernel/timers/timer-broadcast.c +++ b/arch/sh/kernel/timers/timer-broadcast.c @@ -51,7 +51,7 @@ void __cpuinit local_timer_setup(unsigned int cpu) clk->mult = 1; clk->set_mode = dummy_timer_set_mode; clk->broadcast = smp_timer_broadcast; - clk->cpumask = cpumask_of_cpu(cpu); + clk->cpumask = cpumask_of(cpu); clockevents_register_device(clk); } diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c index 3c61ddd4d43..0db3f951033 100644 --- a/arch/sh/kernel/timers/timer-tmu.c +++ b/arch/sh/kernel/timers/timer-tmu.c @@ -263,7 +263,7 @@ static int tmu_timer_init(void) tmu0_clockevent.min_delta_ns = clockevent_delta2ns(1, &tmu0_clockevent); - tmu0_clockevent.cpumask = cpumask_of_cpu(0); + tmu0_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&tmu0_clockevent); diff --git a/arch/sparc/include/asm/smp_32.h b/arch/sparc/include/asm/smp_32.h index a8180e546a4..8408d9d2a66 100644 --- a/arch/sparc/include/asm/smp_32.h +++ b/arch/sparc/include/asm/smp_32.h @@ -29,8 +29,6 @@ */ extern unsigned char boot_cpu_id; -extern cpumask_t phys_cpu_present_map; -#define cpu_possible_map phys_cpu_present_map typedef void (*smpfunc_t)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c index e396c1f17a9..1e5ac4e282e 100644 --- a/arch/sparc/kernel/smp.c +++ b/arch/sparc/kernel/smp.c @@ -39,8 +39,6 @@ volatile unsigned long cpu_callin_map[NR_CPUS] __cpuinitdata = {0,}; unsigned char boot_cpu_id = 0; unsigned char boot_cpu_id4 = 0; /* boot_cpu_id << 2 */ -cpumask_t cpu_online_map = CPU_MASK_NONE; -cpumask_t phys_cpu_present_map = CPU_MASK_NONE; cpumask_t smp_commenced_mask = CPU_MASK_NONE; /* The only guaranteed locking primitive available on all Sparc @@ -334,7 +332,7 @@ void __init smp_setup_cpu_possible_map(void) instance = 0; while (!cpu_find_by_instance(instance, NULL, &mid)) { if (mid < NR_CPUS) { - cpu_set(mid, phys_cpu_present_map); + cpu_set(mid, cpu_possible_map); cpu_set(mid, cpu_present_map); } instance++; @@ -354,7 +352,7 @@ void __init smp_prepare_boot_cpu(void) current_thread_info()->cpu = cpuid; cpu_set(cpuid, cpu_online_map); - cpu_set(cpuid, phys_cpu_present_map); + cpu_set(cpuid, cpu_possible_map); } int __cpuinit __cpu_up(unsigned int cpu) diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c index b0dfff84865..32d11a5fe3a 100644 --- a/arch/sparc/kernel/sparc_ksyms.c +++ b/arch/sparc/kernel/sparc_ksyms.c @@ -113,10 +113,6 @@ EXPORT_PER_CPU_SYMBOL(__cpu_data); #ifdef CONFIG_SMP /* IRQ implementation. */ EXPORT_SYMBOL(synchronize_irq); - -/* CPU online map and active count. */ -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(phys_cpu_present_map); #endif EXPORT_SYMBOL(__udelay); diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index 52fc836f464..4aaf18e83c8 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -312,7 +312,8 @@ static void sun4u_irq_enable(unsigned int virt_irq) } } -static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask) +static void sun4u_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { sun4u_irq_enable(virt_irq); } @@ -362,7 +363,8 @@ static void sun4v_irq_enable(unsigned int virt_irq) ino, err); } -static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask) +static void sun4v_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { unsigned int ino = virt_irq_table[virt_irq].dev_ino; unsigned long cpuid = irq_choose_cpu(virt_irq); @@ -429,7 +431,8 @@ static void sun4v_virq_enable(unsigned int virt_irq) dev_handle, dev_ino, err); } -static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask) +static void sun4v_virt_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { unsigned long cpuid, dev_handle, dev_ino; int err; @@ -788,7 +791,7 @@ void fixup_irqs(void) !(irq_desc[irq].status & IRQ_PER_CPU)) { if (irq_desc[irq].chip->set_affinity) irq_desc[irq].chip->set_affinity(irq, - irq_desc[irq].affinity); + &irq_desc[irq].affinity); } spin_unlock_irqrestore(&irq_desc[irq].lock, flags); } diff --git a/arch/sparc64/kernel/of_device.c b/arch/sparc64/kernel/of_device.c index 0f616ae3246..df2efb7fc14 100644 --- a/arch/sparc64/kernel/of_device.c +++ b/arch/sparc64/kernel/of_device.c @@ -780,7 +780,7 @@ out: if (nid != -1) { cpumask_t numa_mask = node_to_cpumask(nid); - irq_set_affinity(irq, numa_mask); + irq_set_affinity(irq, &numa_mask); } return irq; diff --git a/arch/sparc64/kernel/pci_msi.c b/arch/sparc64/kernel/pci_msi.c index 2e680f34f72..0d0cd815e83 100644 --- a/arch/sparc64/kernel/pci_msi.c +++ b/arch/sparc64/kernel/pci_msi.c @@ -288,7 +288,7 @@ static int bringup_one_msi_queue(struct pci_pbm_info *pbm, if (nid != -1) { cpumask_t numa_mask = node_to_cpumask(nid); - irq_set_affinity(irq, numa_mask); + irq_set_affinity(irq, &numa_mask); } err = request_irq(irq, sparc64_msiq_interrupt, 0, "MSIQ", diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index f500b0618bb..a97b8822c22 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -49,14 +49,10 @@ int sparc64_multi_core __read_mostly; -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE; -cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE; cpumask_t cpu_core_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = CPU_MASK_NONE }; -EXPORT_SYMBOL(cpu_possible_map); -EXPORT_SYMBOL(cpu_online_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_SYMBOL(cpu_core_map); diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c index 141da375909..9df8f095a8b 100644 --- a/arch/sparc64/kernel/time.c +++ b/arch/sparc64/kernel/time.c @@ -763,7 +763,7 @@ void __devinit setup_sparc64_timer(void) sevt = &__get_cpu_var(sparc64_events); memcpy(sevt, &sparc64_clockevent, sizeof(*sevt)); - sevt->cpumask = cpumask_of_cpu(smp_processor_id()); + sevt->cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(sevt); } diff --git a/arch/um/include/asm/system.h b/arch/um/include/asm/system.h index 753346e2cdf..ae5f94d6317 100644 --- a/arch/um/include/asm/system.h +++ b/arch/um/include/asm/system.h @@ -11,21 +11,21 @@ extern int get_signals(void); extern void block_signals(void); extern void unblock_signals(void); -#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ +#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \ (flags) = get_signals(); } while(0) -#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ +#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \ set_signals(flags); } while(0) -#define local_irq_save(flags) do { local_save_flags(flags); \ - local_irq_disable(); } while(0) +#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \ + raw_local_irq_disable(); } while(0) -#define local_irq_enable() unblock_signals() -#define local_irq_disable() block_signals() +#define raw_local_irq_enable() unblock_signals() +#define raw_local_irq_disable() block_signals() #define irqs_disabled() \ ({ \ unsigned long flags; \ - local_save_flags(flags); \ + raw_local_save_flags(flags); \ (flags == 0); \ }) diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 04577214284..98351c78bc8 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -25,13 +25,6 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); #include "irq_user.h" #include "os.h" -/* CPU online map, set by smp_boot_cpus */ -cpumask_t cpu_online_map = CPU_MASK_NONE; -cpumask_t cpu_possible_map = CPU_MASK_NONE; - -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(cpu_possible_map); - /* Per CPU bogomips and other parameters * The only piece used here is the ipi pipe, which is set before SMP is * started and never changed. diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 47f04f4a346..b13a87a3ec9 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -50,7 +50,7 @@ static int itimer_next_event(unsigned long delta, static struct clock_event_device itimer_clockevent = { .name = "itimer", .rating = 250, - .cpumask = CPU_MASK_ALL, + .cpumask = cpu_all_mask, .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = itimer_set_mode, .set_next_event = itimer_next_event, diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..0ca2eb7573c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,11 +29,14 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select USER_STACKTRACE_SUPPORT config ARCH_DEFCONFIG string @@ -238,6 +241,25 @@ config X86_HAS_BOOT_CPU_ID def_bool y depends on X86_VOYAGER +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on PCI_MSI || HT_IRQ + default y + help + This enables support for sparse irq, esp for msi/msi-x. You may need + if you have lots of cards supports msi-x installed. + + If you don't know what to do here, say Y. + +config NUMA_MIGRATE_IRQ_DESC + bool "Move irq desc when changing irq smp_affinity" + depends on SPARSE_IRQ && SMP + default n + help + This enables moving irq_desc to cpu/node that irq will use handled. + + If you don't know what to do here, say N. + config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER @@ -367,10 +389,10 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" - depends on X86_32 + depends on X86 help Calculate simpler /proc/<PID>/wchan values. If this option is disabled then wchan values will recurse back to the @@ -465,10 +487,6 @@ config X86_CYCLONE_TIMER def_bool y depends on X86_GENERICARCH -config ES7000_CLUSTERED_APIC - def_bool y - depends on SMP && X86_ES7000 && MPENTIUMIII - source "arch/x86/Kconfig.cpu" config HPET_TIMER @@ -582,19 +600,20 @@ config IOMMU_HELPER config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && BROKEN + depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + select CPUMASK_OFFSTACK default n help Configure maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. config NR_CPUS - int "Maximum number of CPUs (2-512)" if !MAXSMP - range 2 512 - depends on SMP + int "Maximum number of CPUs" if SMP && !MAXSMP + range 2 512 if SMP && !MAXSMP + default "1" if !SMP default "4096" if MAXSMP - default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 - default "8" + default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "8" if SMP help This allows you to specify the maximum number of CPUs which this kernel will support. The maximum supported value is 512 and the @@ -1632,13 +1651,6 @@ config APM_ALLOW_INTS many of the newer IBM Thinkpads. If you experience hangs when you suspend, try setting this to Y. Otherwise, say N. -config APM_REAL_MODE_POWER_OFF - bool "Use real mode APM BIOS call to power off" - help - Use real mode APM BIOS calls to switch off the computer. This is - a work-around for a number of buggy BIOSes. Switch this option on if - your computer crashes instead of powering off properly. - endif # APM source "arch/x86/kernel/cpu/cpufreq/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664fe37..85a78575956 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -515,6 +515,7 @@ config CPU_SUP_UMC_32 config X86_DS def_bool X86_PTRACE_BTS depends on X86_DEBUGCTLMSR + select HAVE_HW_BRANCH_TRACER config X86_PTRACE_BTS bool "Branch Trace Store" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e67..fa013f529b7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -186,14 +186,10 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE_HOOKS - bool - config MMIOTRACE bool "Memory mapped IO tracing" depends on DEBUG_KERNEL && PCI select TRACING - select MMIOTRACE_HOOKS help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3b1510b4fc5..25caa0738af 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -193,6 +193,7 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h index 1d9543b9d35..976399debb3 100644 --- a/arch/x86/include/asm/bigsmp/apic.h +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -9,12 +9,12 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { #ifdef CONFIG_SMP - return cpu_online_map; + return &cpu_online_map; #else - return cpumask_of_cpu(0); + return &cpumask_of_cpu(0); #endif } @@ -24,8 +24,6 @@ static inline cpumask_t target_cpus(void) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target proc */ #define NO_BALANCE_IRQ (0) -#define WAKE_SECONDARY_VIA_INIT - static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { @@ -81,7 +79,7 @@ static inline int apicid_to_node(int logical_apicid) static inline int cpu_present_to_apicid(int mps_cpu) { - if (mps_cpu < NR_CPUS) + if (mps_cpu < nr_cpu_ids) return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); return BAD_APICID; @@ -96,7 +94,7 @@ extern u8 cpu_2_logical_apicid[]; /* Mapping from cpu number to logical apicid */ static inline int cpu_to_logical_apicid(int cpu) { - if (cpu >= NR_CPUS) + if (cpu >= nr_cpu_ids) return BAD_APICID; return cpu_physical_id(cpu); } @@ -121,16 +119,32 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) } /* As we are using single CPU as destination, pick only one CPU here */ -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) { int cpu; int apicid; - cpu = first_cpu(cpumask); + cpu = first_cpu(*cpumask); apicid = cpu_to_logical_apicid(cpu); return apicid; } +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_any_and(cpumask, andmask); + if (cpu < nr_cpu_ids) + return cpu_to_logical_apicid(cpu); + + return BAD_APICID; +} + static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h index 9404c535b7e..27fcd01b3ae 100644 --- a/arch/x86/include/asm/bigsmp/ipi.h +++ b/arch/x86/include/asm/bigsmp/ipi.h @@ -1,25 +1,22 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_MACH_IPI_H */ diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a95008457ea..99b6c39774a 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -7,13 +7,12 @@ * * It manages: * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done) * - buffer access * * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks * * * Copyright (C) 2007-2008 Intel Corporation. @@ -26,11 +25,18 @@ #include <linux/types.h> #include <linux/init.h> +#include <linux/err.h> #ifdef CONFIG_X86_DS struct task_struct; +struct ds_tracer; +struct bts_tracer; +struct pebs_tracer; + +typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); +typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); /* * Request BTS or PEBS @@ -38,60 +44,62 @@ struct task_struct; * Due to alignement constraints, the actual buffer may be slightly * smaller than the requested or provided buffer. * - * Returns 0 on success; -Eerrno otherwise + * Returns a pointer to a tracer structure on success, or + * ERR_PTR(errcode) on failure. + * + * The interrupt threshold is independent from the overflow callback + * to allow users to use their own overflow interrupt handling mechanism. * * task: the task to request recording for; * NULL for per-cpu recording on the current cpu * base: the base pointer for the (non-pageable) buffer; - * NULL if buffer allocation requested - * size: the size of the requested or provided buffer + * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; * NULL if cyclic buffer requested + * th: the interrupt threshold in records from the end of the buffer; + * -1 if no interrupt threshold is requested. */ -typedef void (*ds_ovfl_callback_t)(struct task_struct *); -extern int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); -extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); +extern struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th); +extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th); /* * Release BTS or PEBS resources * - * Frees buffers allocated on ds_request. - * * Returns 0 on success; -Eerrno otherwise * - * task: the task to release resources for; - * NULL to release resources for the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct task_struct *task); -extern int ds_release_pebs(struct task_struct *task); +extern int ds_release_bts(struct bts_tracer *tracer); +extern int ds_release_pebs(struct pebs_tracer *tracer); /* - * Return the (array) index of the write pointer. + * Get the (array) index of the write pointer. * (assuming an array of BTS/PEBS records) * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result */ -extern int ds_get_bts_index(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); +extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); /* - * Return the (array) index one record beyond the end of the array. + * Get the (array) index one record beyond the end of the array. * (assuming an array of BTS/PEBS records) * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result */ -extern int ds_get_bts_end(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); +extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); /* * Provide a pointer to the BTS/PEBS record at parameter index. @@ -102,14 +110,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); * * Returns the size of a single record on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() * index: the index of the requested record * record (out): pointer to the requested record */ -extern int ds_access_bts(struct task_struct *task, +extern int ds_access_bts(struct bts_tracer *tracer, size_t index, const void **record); -extern int ds_access_pebs(struct task_struct *task, +extern int ds_access_pebs(struct pebs_tracer *tracer, size_t index, const void **record); /* @@ -129,38 +136,24 @@ extern int ds_access_pebs(struct task_struct *task, * * Returns the number of bytes written or -Eerrno. * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() * buffer: the buffer to write * size: the size of the buffer */ -extern int ds_write_bts(struct task_struct *task, +extern int ds_write_bts(struct bts_tracer *tracer, const void *buffer, size_t size); -extern int ds_write_pebs(struct task_struct *task, +extern int ds_write_pebs(struct pebs_tracer *tracer, const void *buffer, size_t size); /* - * Same as ds_write_bts/pebs, but omit ownership checks. - * - * This is needed to have some other task than the owner of the - * BTS/PEBS buffer or the parameter task itself write into the - * respective buffer. - */ -extern int ds_unchecked_write_bts(struct task_struct *task, - const void *buffer, size_t size); -extern int ds_unchecked_write_pebs(struct task_struct *task, - const void *buffer, size_t size); - -/* * Reset the write pointer of the BTS/PEBS buffer. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_reset_bts(struct task_struct *task); -extern int ds_reset_pebs(struct task_struct *task); +extern int ds_reset_bts(struct bts_tracer *tracer); +extern int ds_reset_pebs(struct pebs_tracer *tracer); /* * Clear the BTS/PEBS buffer and reset the write pointer. @@ -168,33 +161,30 @@ extern int ds_reset_pebs(struct task_struct *task); * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_clear_bts(struct task_struct *task); -extern int ds_clear_pebs(struct task_struct *task); +extern int ds_clear_bts(struct bts_tracer *tracer); +extern int ds_clear_pebs(struct pebs_tracer *tracer); /* * Provide the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value (out): the counter reset value */ -extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); +extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value); /* * Set the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct task_struct *task, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); /* * Initialization @@ -207,17 +197,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); /* * The DS context - part of struct thread_struct. */ +#define MAX_SIZEOF_DS (12 * 8) + struct ds_context { /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char *ds; + unsigned char ds[MAX_SIZEOF_DS]; /* the owner of the BTS and PEBS configuration, respectively */ - struct task_struct *owner[2]; - /* buffer overflow notification function for BTS and PEBS */ - ds_ovfl_callback_t callback[2]; - /* the original buffer address */ - void *buffer[2]; - /* the number of allocated pages for on-request allocated buffers */ - unsigned int pages[2]; + struct ds_tracer *owner[2]; /* use count */ unsigned long count; /* a pointer to the context location inside the thread_struct diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index 94826cf8745..cc70c1c78ca 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -8,7 +8,9 @@ enum reboot_type { BOOT_BIOS = 'b', #endif BOOT_ACPI = 'a', - BOOT_EFI = 'e' + BOOT_EFI = 'e', + BOOT_CF9 = 'p', + BOOT_CF9_COND = 'q', }; extern enum reboot_type reboot_type; diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index 380f0b4f17e..ba8423c5363 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -9,31 +9,27 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus_cluster(void) { -#if defined CONFIG_ES7000_CLUSTERED_APIC - return CPU_MASK_ALL; -#else - return cpumask_of_cpu(smp_processor_id()); -#endif + return &CPU_MASK_ALL; } -#if defined CONFIG_ES7000_CLUSTERED_APIC -#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -#define INT_DELIVERY_MODE (dest_LowestPrio) -#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ -#define NO_BALANCE_IRQ (1) -#undef WAKE_SECONDARY_VIA_INIT -#define WAKE_SECONDARY_VIA_MIP -#else +static inline const cpumask_t *target_cpus(void) +{ + return &cpumask_of_cpu(smp_processor_id()); +} + +#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER) +#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio) +#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */ +#define NO_BALANCE_IRQ_CLUSTER (1) + #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target procs */ #define NO_BALANCE_IRQ (0) #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0x0 -#define WAKE_SECONDARY_VIA_INIT -#endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { @@ -60,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ +static inline void init_apic_ldr_cluster(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + static inline void init_apic_ldr(void) { unsigned long val; @@ -70,17 +76,14 @@ static inline void init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifndef CONFIG_X86_GENERICARCH -extern void enable_apic_mode(void); -#endif - extern int apic_version [MAX_APICS]; static inline void setup_apic_routing(void) { int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? - "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]); + "Physical Cluster" : "Logical Cluster", + nr_ioapics, cpus_addr(*target_cpus())[0]); } static inline int multi_timer_check(int apic, int irq) @@ -98,7 +101,7 @@ static inline int cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) return boot_cpu_physical_apicid; - else if (mps_cpu < NR_CPUS) + else if (mps_cpu < nr_cpu_ids) return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); else return BAD_APICID; @@ -118,9 +121,9 @@ extern u8 cpu_2_logical_apicid[]; static inline int cpu_to_logical_apicid(int cpu) { #ifdef CONFIG_SMP - if (cpu >= NR_CPUS) - return BAD_APICID; - return (int)cpu_2_logical_apicid[cpu]; + if (cpu >= nr_cpu_ids) + return BAD_APICID; + return (int)cpu_2_logical_apicid[cpu]; #else return logical_smp_processor_id(); #endif @@ -144,16 +147,87 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid) return (1); } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int +cpu_mask_to_apicid_cluster(const struct cpumask *cpumask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid; + + num_bits_set = cpumask_weight(cpumask); + /* Return id to all */ + if (num_bits_set == NR_CPUS) + return 0xFF; + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = cpumask_first(cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpumask_test_cpu(cpu, cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); + return 0xFF; + } + apicid = new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = cpus_weight(cpumask); + num_bits_set = cpus_weight(*cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) + return cpu_to_logical_apicid(0); + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = first_cpu(*cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpu_isset(cpu, *cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); + return cpu_to_logical_apicid(0); + } + apicid = new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int num_bits_set; + int num_bits_set2; + int cpus_found = 0; + int cpu; + int apicid = 0; + + num_bits_set = cpumask_weight(cpumask); + num_bits_set2 = cpumask_weight(andmask); + num_bits_set = min(num_bits_set, num_bits_set2); + /* Return id to all */ + if (num_bits_set >= nr_cpu_ids) #if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; #else @@ -163,14 +237,17 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first_and(cpumask, andmask); apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { - if (cpu_isset(cpu, cpumask)) { + if (cpumask_test_cpu(cpu, cpumask) && + cpumask_test_cpu(cpu, andmask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != - apicid_cluster(new_apicid)){ - printk ("%s: Not a valid mask!\n", __func__); + apicid_cluster(new_apicid)) { + printk(KERN_WARNING + "%s: Not a valid mask!\n", __func__); #if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; #else diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h index 632a955fcc0..7e8ed24d4b8 100644 --- a/arch/x86/include/asm/es7000/ipi.h +++ b/arch/x86/include/asm/es7000/ipi.h @@ -1,24 +1,22 @@ #ifndef __ASM_ES7000_IPI_H #define __ASM_ES7000_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_ES7000_IPI_H */ diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h index 39849346191..78f0daaee43 100644 --- a/arch/x86/include/asm/es7000/wakecpu.h +++ b/arch/x86/include/asm/es7000/wakecpu.h @@ -1,36 +1,12 @@ #ifndef __ASM_ES7000_WAKECPU_H #define __ASM_ES7000_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. - */ - -#ifdef CONFIG_ES7000_CLUSTERED_APIC -#define WAKE_SECONDARY_VIA_MIP -#else -#define WAKE_SECONDARY_VIA_INIT -#endif - -#ifdef WAKE_SECONDARY_VIA_MIP -extern int es7000_start_cpu(int cpu, unsigned long eip); -static inline int -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) -{ - int boot_error = 0; - boot_error = es7000_start_cpu(phys_apicid, start_eip); - return boot_error; -} -#endif - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW 0x467 +#define TRAMPOLINE_PHYS_HIGH 0x469 static inline void wait_for_init_deassert(atomic_t *deassert) { -#ifdef WAKE_SECONDARY_VIA_INIT +#ifndef CONFIG_ES7000_CLUSTERED_APIC while (!atomic_read(deassert)) cpu_relax(); #endif @@ -50,9 +26,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { } -#define inquire_remote_apic(apicid) do { \ - if (apic_verbosity >= APIC_DEBUG) \ - __inquire_remote_apic(apicid); \ - } while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ + if (apic_verbosity >= APIC_DEBUG) + __inquire_remote_apic(apicid); +} #endif /* __ASM_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9e8bc29b8b1..7e61b4ceb9a 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,8 +17,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } -#endif +#ifdef CONFIG_DYNAMIC_FTRACE + +struct dyn_arch_ftrace { + /* No extra data needed for x86 */ +}; + +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifndef __ASSEMBLY__ + +/* + * Stack of return addresses for functions + * of a thread. + * Used in struct thread_info + */ +struct ftrace_ret_stack { + unsigned long ret; + unsigned long func; + unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relays on ftrace_return_to_handler. + * Defined in entry32.S + */ +extern void return_to_handler(void); + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 5cbd4fcc06f..746f37a7963 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -2,6 +2,7 @@ #define _ASM_X86_GENAPIC_32_H #include <asm/mpspec.h> +#include <asm/atomic.h> /* * Generic APIC driver interface. @@ -23,7 +24,7 @@ struct genapic { int (*probe)(void); int (*apic_id_registered)(void); - cpumask_t (*target_cpus)(void); + const struct cpumask *(*target_cpus)(void); int int_delivery_mode; int int_dest_mode; int ESR_DISABLE; @@ -56,15 +57,27 @@ struct genapic { unsigned (*get_apic_id)(unsigned long x); unsigned long apic_id_mask; - unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); - cpumask_t (*vector_allocation_domain)(int cpu); + unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); + unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); #ifdef CONFIG_SMP /* ipi */ - void (*send_IPI_mask)(cpumask_t mask, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *mask, + int vector); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); #endif + int (*wakeup_cpu)(int apicid, unsigned long start_eip); + int trampoline_phys_low; + int trampoline_phys_high; + void (*wait_for_init_deassert)(atomic_t *deassert); + void (*smp_callin_clear_local_apic)(void); + void (*store_NMI_vector)(unsigned short *high, unsigned short *low); + void (*restore_NMI_vector)(unsigned short *high, unsigned short *low); + void (*inquire_remote_apic)(int apicid); }; #define APICFUNC(x) .x = x, @@ -105,16 +118,25 @@ struct genapic { APICFUNC(get_apic_id) \ .apic_id_mask = APIC_ID_MASK, \ APICFUNC(cpu_mask_to_apicid) \ - APICFUNC(vector_allocation_domain) \ + APICFUNC(cpu_mask_to_apicid_and) \ + APICFUNC(vector_allocation_domain) \ APICFUNC(acpi_madt_oem_check) \ IPIFUNC(send_IPI_mask) \ IPIFUNC(send_IPI_allbutself) \ IPIFUNC(send_IPI_all) \ APICFUNC(enable_apic_mode) \ APICFUNC(phys_pkg_id) \ + .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \ + .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \ + APICFUNC(wait_for_init_deassert) \ + APICFUNC(smp_callin_clear_local_apic) \ + APICFUNC(store_NMI_vector) \ + APICFUNC(restore_NMI_vector) \ + APICFUNC(inquire_remote_apic) \ } extern struct genapic *genapic; +extern void es7000_update_genapic_to_cluster(void); enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; #define get_uv_system_type() UV_NONE diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h index 13c4e96199e..adf32fb56aa 100644 --- a/arch/x86/include/asm/genapic_64.h +++ b/arch/x86/include/asm/genapic_64.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_GENAPIC_64_H #define _ASM_X86_GENAPIC_64_H +#include <linux/cpumask.h> + /* * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 @@ -18,20 +20,26 @@ struct genapic { u32 int_delivery_mode; u32 int_dest_mode; int (*apic_id_registered)(void); - cpumask_t (*target_cpus)(void); - cpumask_t (*vector_allocation_domain)(int cpu); + const struct cpumask *(*target_cpus)(void); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); /* ipi */ - void (*send_IPI_mask)(cpumask_t mask, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *mask, + int vector); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); /* */ - unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); + unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); + unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask); unsigned int (*phys_pkg_id)(int index_msb); unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; + /* wakeup_secondary_cpu */ + int (*wakeup_cpu)(int apicid, unsigned long start_eip); }; extern struct genapic *genapic; diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 6afd9933a7d..25d527ca136 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -188,17 +188,14 @@ extern void restore_IO_APIC_setup(void); extern void reinit_intr_remapped_IO_APIC(int); #endif -extern int probe_nr_irqs(void); +extern void probe_nr_irqs_gsi(void); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; -static inline void ioapic_init_mappings(void) { } +static inline void ioapic_init_mappings(void) { } -static inline int probe_nr_irqs(void) -{ - return NR_IRQS; -} +static inline void probe_nr_irqs_gsi(void) { } #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index f89dffb28aa..c745a306f7d 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -117,7 +117,8 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector, native_apic_mem_write(APIC_ICR, cfg); } -static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) +static inline void send_IPI_mask_sequence(const struct cpumask *mask, + int vector) { unsigned long flags; unsigned long query_cpu; @@ -128,11 +129,29 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) * - mbligh */ local_irq_save(flags); - for_each_cpu_mask_nr(query_cpu, mask) { + for_each_cpu(query_cpu, mask) { __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } +static inline void send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) + __send_IPI_dest_field( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + #endif /* _ASM_X86_IPI_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index bae0eda9548..8766d30fb74 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -37,7 +37,7 @@ extern int irqbalance_disable(char *str); #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> -extern void fixup_irqs(cpumask_t map); +extern void fixup_irqs(void); #endif extern unsigned int do_IRQ(struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f94..f7ff65032b9 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -101,12 +101,23 @@ #define LAST_VM86_IRQ 15 #define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) +#define NR_IRQS_LEGACY 16 + #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) + +#ifndef CONFIG_SPARSE_IRQ # if NR_CPUS < MAX_IO_APICS # define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) # else # define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) # endif +#else +# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) +# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) +# else +# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +# endif +#endif #elif defined(CONFIG_X86_VOYAGER) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a1f22771a15..c61d8b2ab8b 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -5,21 +5,8 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_PGD 2 -# define VA_PGD 3 -# define PA_PTE_0 4 -# define VA_PTE_0 5 -# define PA_PTE_1 6 -# define VA_PTE_1 7 -# define PA_SWAP_PAGE 8 -# ifdef CONFIG_X86_PAE -# define PA_PMD_0 9 -# define VA_PMD_0 10 -# define PA_PMD_1 11 -# define VA_PMD_1 12 -# define PAGES_NR 13 -# else -# define PAGES_NR 9 -# endif +# define PA_SWAP_PAGE 3 +# define PAGES_NR 4 #else # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 @@ -170,6 +157,20 @@ relocate_kernel(unsigned long indirection_page, unsigned long start_address) ATTRIB_NORET; #endif +#ifdef CONFIG_X86_32 +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + pgd_t *pgd; +#ifdef CONFIG_X86_PAE + pmd_t *pmd0; + pmd_t *pmd1; +#endif + pte_t *pte0; + pte_t *pte1; +}; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h index ff3a6c236c0..8863d978cb9 100644 --- a/arch/x86/include/asm/mach-default/mach_apic.h +++ b/arch/x86/include/asm/mach-default/mach_apic.h @@ -8,12 +8,12 @@ #define APIC_DFR_VALUE (APIC_DFR_FLAT) -static inline cpumask_t target_cpus(void) +static inline const struct cpumask *target_cpus(void) { #ifdef CONFIG_SMP - return cpu_online_map; + return cpu_online_mask; #else - return cpumask_of_cpu(0); + return cpumask_of(0); #endif } @@ -28,15 +28,18 @@ static inline cpumask_t target_cpus(void) #define apic_id_registered (genapic->apic_id_registered) #define init_apic_ldr (genapic->init_apic_ldr) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and) #define phys_pkg_id (genapic->phys_pkg_id) #define vector_allocation_domain (genapic->vector_allocation_domain) #define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) #define send_IPI_self (genapic->send_IPI_self) +#define wakeup_secondary_cpu (genapic->wakeup_cpu) extern void setup_apic_routing(void); #else #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ #define TARGET_CPUS (target_cpus()) +#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init /* * Set up the logical destination ID. * @@ -59,9 +62,18 @@ static inline int apic_id_registered(void) return physid_isset(read_apic_id(), phys_cpu_present_map); } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask) { - return cpus_addr(cpumask)[0]; + return cpumask_bits(cpumask)[0]; +} + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + unsigned long mask1 = cpumask_bits(cpumask)[0]; + unsigned long mask2 = cpumask_bits(andmask)[0]; + + return (unsigned int)(mask1 & mask2); } static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) @@ -86,7 +98,7 @@ static inline int apicid_to_node(int logical_apicid) #endif } -static inline cpumask_t vector_allocation_domain(int cpu) +static inline void vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -96,8 +108,7 @@ static inline cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } }; } #endif @@ -129,7 +140,7 @@ static inline int cpu_to_logical_apicid(int cpu) static inline int cpu_present_to_apicid(int mps_cpu) { - if (mps_cpu < NR_CPUS && cpu_present(mps_cpu)) + if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); else return BAD_APICID; diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h index fabca01ebac..191312d155d 100644 --- a/arch/x86/include/asm/mach-default/mach_ipi.h +++ b/arch/x86/include/asm/mach-default/mach_ipi.h @@ -4,7 +4,8 @@ /* Avoid include hell */ #define NMI_VECTOR 0x02 -void send_IPI_mask_bitmask(cpumask_t mask, int vector); +void send_IPI_mask_bitmask(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); void __send_IPI_shortcut(unsigned int shortcut, int vector); extern int no_broadcast; @@ -12,28 +13,27 @@ extern int no_broadcast; #ifdef CONFIG_X86_64 #include <asm/genapic.h> #define send_IPI_mask (genapic->send_IPI_mask) +#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself) #else -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_bitmask(mask, vector); } +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); #endif static inline void __local_send_IPI_allbutself(int vector) { - if (no_broadcast || vector == NMI_VECTOR) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); - send_IPI_mask(mask, vector); - } else + if (no_broadcast || vector == NMI_VECTOR) + send_IPI_mask_allbutself(cpu_online_mask, vector); + else __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); } static inline void __local_send_IPI_all(int vector) { if (no_broadcast || vector == NMI_VECTOR) - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); else __send_IPI_shortcut(APIC_DEST_ALLINC, vector); } diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h index 9d80db91e99..ceb01366014 100644 --- a/arch/x86/include/asm/mach-default/mach_wakecpu.h +++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h @@ -1,17 +1,8 @@ #ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H #define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. - */ - -#define WAKE_SECONDARY_VIA_INIT - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW (0x467) +#define TRAMPOLINE_PHYS_HIGH (0x469) static inline void wait_for_init_deassert(atomic_t *deassert) { @@ -33,9 +24,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { } -#define inquire_remote_apic(apicid) do { \ - if (apic_verbosity >= APIC_DEBUG) \ - __inquire_remote_apic(apicid); \ - } while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ + if (apic_verbosity >= APIC_DEBUG) + __inquire_remote_apic(apicid); +} #endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h index dbab36d64d4..23bf52103b8 100644 --- a/arch/x86/include/asm/mach-default/smpboot_hooks.h +++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h @@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) CMOS_WRITE(0xa, 0xf); local_flush_tlb(); pr_debug("1.\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; pr_debug("2.\n"); - *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; pr_debug("3.\n"); } @@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void) */ CMOS_WRITE(0, 0xf); - *((volatile long *) phys_to_virt(0x467)) = 0; + *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } static inline void __init smpboot_setup_io_apic(void) diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h index 5180bd7478f..48553e958ad 100644 --- a/arch/x86/include/asm/mach-generic/mach_apic.h +++ b/arch/x86/include/asm/mach-generic/mach_apic.h @@ -24,9 +24,11 @@ #define check_phys_apicid_present (genapic->check_phys_apicid_present) #define check_apicid_used (genapic->check_apicid_used) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and) #define vector_allocation_domain (genapic->vector_allocation_domain) #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) +#define wakeup_secondary_cpu (genapic->wakeup_cpu) extern void generic_bigsmp_probe(void); diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h new file mode 100644 index 00000000000..1ab16b168c8 --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_wakecpu.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H +#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H + +#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low) +#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high) +#define wait_for_init_deassert (genapic->wait_for_init_deassert) +#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic) +#define store_NMI_vector (genapic->store_NMI_vector) +#define restore_NMI_vector (genapic->restore_NMI_vector) +#define inquire_remote_apic (genapic->inquire_remote_apic) + +#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */ diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h index 0bf2a06b7a4..c80f00d2996 100644 --- a/arch/x86/include/asm/numaq/apic.h +++ b/arch/x86/include/asm/numaq/apic.h @@ -7,9 +7,9 @@ #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { - return CPU_MASK_ALL; + return &CPU_MASK_ALL; } #define NO_BALANCE_IRQ (1) @@ -122,7 +122,13 @@ static inline void enable_apic_mode(void) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) +{ + return (int) 0xF; +} + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) { return (int) 0xF; } diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h index 935588d286c..a8374c65277 100644 --- a/arch/x86/include/asm/numaq/ipi.h +++ b/arch/x86/include/asm/numaq/ipi.h @@ -1,25 +1,22 @@ #ifndef __ASM_NUMAQ_IPI_H #define __ASM_NUMAQ_IPI_H -void send_IPI_mask_sequence(cpumask_t, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_NUMAQ_IPI_H */ diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h index c577bda5b1c..6f499df8edd 100644 --- a/arch/x86/include/asm/numaq/wakecpu.h +++ b/arch/x86/include/asm/numaq/wakecpu.h @@ -3,12 +3,8 @@ /* This file copes with machines that wakeup secondary CPUs by NMIs */ -#define WAKE_SECONDARY_VIA_NMI - -#define TRAMPOLINE_LOW phys_to_virt(0x8) -#define TRAMPOLINE_HIGH phys_to_virt(0xa) - -#define boot_cpu_apicid boot_cpu_logical_apicid +#define TRAMPOLINE_PHYS_LOW (0x8) +#define TRAMPOLINE_PHYS_HIGH (0xa) /* We don't do anything here because we use NMI's to boot instead */ static inline void wait_for_init_deassert(atomic_t *deassert) @@ -27,17 +23,23 @@ static inline void smp_callin_clear_local_apic(void) static inline void store_NMI_vector(unsigned short *high, unsigned short *low) { printk("Storing NMI vector\n"); - *high = *((volatile unsigned short *) TRAMPOLINE_HIGH); - *low = *((volatile unsigned short *) TRAMPOLINE_LOW); + *high = + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)); + *low = + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)); } static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { printk("Restoring NMI vector\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high; - *((volatile unsigned short *) TRAMPOLINE_LOW) = *low; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + *high; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + *low; } -#define inquire_remote_apic(apicid) {} +static inline void inquire_remote_apic(int apicid) +{ +} #endif /* __ASM_NUMAQ_WAKECPU_H */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index df7710354f8..562d4fd31ba 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_REBOOT_H #define _ASM_X86_REBOOT_H +#include <linux/kdebug.h> + struct pt_regs; struct machine_ops { @@ -18,4 +20,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); void machine_real_restart(const unsigned char *code, int length); +typedef void (*nmi_shootdown_cb)(int, struct die_args*); +void nmi_shootdown_cpus(nmi_shootdown_cb callback); + #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f12d3723746..294daeb3a00 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,6 +16,8 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif +extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); +extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip); /* * Any setup quirks to be performed? */ @@ -39,6 +41,7 @@ struct x86_quirks { void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, unsigned short oemsize); int (*setup_ioapic_ids)(void); + int (*update_genapic)(void); }; extern struct x86_quirks *x86_quirks; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index d12811ce51d..830b9fcb642 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -60,7 +60,7 @@ struct smp_ops { void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); - void (*send_call_func_ipi)(cpumask_t mask); + void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); }; @@ -125,7 +125,7 @@ static inline void arch_send_call_function_single_ipi(int cpu) static inline void arch_send_call_function_ipi(cpumask_t mask) { - smp_ops.send_call_func_ipi(mask); + smp_ops.send_call_func_ipi(&mask); } void cpu_disable_common(void); @@ -138,7 +138,7 @@ void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); -void native_send_call_func_ipi(cpumask_t mask); +void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); extern void prefill_possible_map(void); diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h index 9b3070f1c2a..651a9384934 100644 --- a/arch/x86/include/asm/summit/apic.h +++ b/arch/x86/include/asm/summit/apic.h @@ -14,13 +14,13 @@ #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { /* CPU_MASK_ALL (0xff) has undefined behaviour with * dest_LowestPrio mode logical clustered apic interrupt routing * Just start on cpu 0. IRQ balancing will spread load */ - return cpumask_of_cpu(0); + return &cpumask_of_cpu(0); } #define INT_DELIVERY_MODE (dest_LowestPrio) @@ -137,14 +137,14 @@ static inline void enable_apic_mode(void) { } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = cpus_weight(cpumask); + num_bits_set = cpus_weight(*cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) return (int) 0xFF; @@ -152,10 +152,10 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = first_cpu(cpumask); + cpu = first_cpu(*cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpu_isset(cpu, cpumask)) { + if (cpu_isset(cpu, *cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ @@ -170,6 +170,45 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) return apicid; } +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int num_bits_set; + int num_bits_set2; + int cpus_found = 0; + int cpu; + int apicid = 0; + + num_bits_set = cpumask_weight(cpumask); + num_bits_set2 = cpumask_weight(andmask); + num_bits_set = min(num_bits_set, num_bits_set2); + /* Return id to all */ + if (num_bits_set >= nr_cpu_ids) + return 0xFF; + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = cpumask_first_and(cpumask, andmask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpumask_test_cpu(cpu, cpumask) + && cpumask_test_cpu(cpu, andmask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)) { + printk(KERN_WARNING + "%s: Not a valid mask!\n", __func__); + return 0xFF; + } + apicid = apicid | new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + /* cpuid returns the value latched in the HW at reset, not the APIC ID * register's value. For any box whose BIOS changes APIC IDs, like * clustered APIC systems, we must use hard_smp_processor_id. diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h index 53bd1e7bd7b..a8a2c24f50c 100644 --- a/arch/x86/include/asm/summit/ipi.h +++ b/arch/x86/include/asm/summit/ipi.h @@ -1,9 +1,10 @@ #ifndef __ASM_SUMMIT_IPI_H #define __ASM_SUMMIT_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const cpumask_t *mask, int vector); +void send_IPI_mask_allbutself(const cpumask_t *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const cpumask_t *mask, int vector) { send_IPI_mask_sequence(mask, vector); } @@ -14,12 +15,12 @@ static inline void send_IPI_allbutself(int vector) cpu_clear(smp_processor_id(), mask); if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask(&mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(&cpu_online_map, vector); } #endif /* __ASM_SUMMIT_IPI_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2ed3f0f44ff..07c3e404899 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -314,6 +314,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end); void default_idle(void); +void stop_this_cpu(void *dummy); + /* * Force strict CPU ordering. * And yes, this is required on UP too when we're talking diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad..0921b4018c1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,6 +20,8 @@ struct task_struct; struct exec_domain; #include <asm/processor.h> +#include <asm/ftrace.h> +#include <asm/atomic.h> struct thread_info { struct task_struct *task; /* main task structure */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index ff386ff50ed..79e31e9dcdd 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -226,6 +226,8 @@ extern cpumask_t cpu_coregroup_map(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) +#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) +#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) /* indicates that pointers to the topology cpumask_t maps are valid */ #define arch_provides_topology_pointers yes diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c54921b2e..99192bb55a5 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -157,6 +157,7 @@ extern int __get_user_bad(void); int __ret_gu; \ unsigned long __val_gu; \ __chk_user_ptr(ptr); \ + might_fault(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_x(1, __ret_gu, __val_gu, ptr); \ @@ -241,6 +242,7 @@ extern void __put_user_8(void); int __ret_pu; \ __typeof__(*(ptr)) __pu_val; \ __chk_user_ptr(ptr); \ + might_fault(); \ __pu_val = x; \ switch (sizeof(*(ptr))) { \ case 1: \ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index d095a3aeea1..5e06259e90e 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { - might_sleep(); - return __copy_to_user_inatomic(to, from, n); + might_fault(); + return __copy_to_user_inatomic(to, from, n); } static __always_inline unsigned long @@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { - might_sleep(); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; @@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { - might_sleep(); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f8cfd00db45..84210c479fc 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -29,6 +29,8 @@ static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; + + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -71,6 +73,8 @@ static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; + + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -113,6 +117,8 @@ static __always_inline __must_check int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; + + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, (__force void *)src, size); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b62a7667828..1cad9318d21 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,7 @@ CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o +obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o @@ -65,6 +65,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4c51a2f8fd3..65d0b72777e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1360,6 +1360,17 @@ static void __init acpi_process_madt(void) disable_acpi(); } } + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " + "information\n"); + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) " + "configuration information\n"); #endif return; } diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b52..edda4c00e3d 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -141,7 +141,7 @@ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); +static void lapic_timer_broadcast(const cpumask_t *mask); static void apic_pm_activate(void); /* @@ -453,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, /* * Local APIC timer broadcast function */ -static void lapic_timer_broadcast(cpumask_t mask) +static void lapic_timer_broadcast(const cpumask_t *mask) { #ifdef CONFIG_SMP send_IPI_mask(mask, LOCAL_TIMER_VECTOR); @@ -469,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void) struct clock_event_device *levt = &__get_cpu_var(lapic_events); memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); + levt->cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(levt); } @@ -1903,8 +1903,8 @@ void __cpuinit generic_processor_info(int apicid, int version) } #endif - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); } #ifdef CONFIG_X86_64 @@ -2106,7 +2106,7 @@ __cpuinit int apic_is_clustered_box(void) bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { /* are we being called early in kernel startup? */ if (bios_cpu_apicid) { id = bios_cpu_apicid[i]; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5145a6e72bb..3a26525a3f3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -391,11 +391,7 @@ static int power_off; #else static int power_off = 1; #endif -#ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; -#else static int realmode_power_off; -#endif #ifdef CONFIG_APM_ALLOW_INTS static int allow_ints = 1; #else diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8e48c5d4467..88ea02dcb62 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> +#include <linux/ftrace.h> #include <linux/acpi.h> #include <acpi/processor.h> @@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; + struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } + trace_power_mark(&it, POWER_PSTATE, next_perf_state); + switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cce0b6118d5..816f27f289b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P4); if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_P3); +#endif if (cpu_has_bts) ptrace_bts_init_intel(c); -#endif - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3f46afbb1cf..fb7f946cb65 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) per_cpu(cpuid4_info, cpu) = NULL; } -static int __cpuinit detect_cache_attributes(unsigned int cpu) +static void get_cpu_leaves(void *_retval) { - struct _cpuid4_info *this_leaf; - unsigned long j; - int retval; - cpumask_t oldmask; - - if (num_cache_leaves == 0) - return -ENOENT; - - per_cpu(cpuid4_info, cpu) = kzalloc( - sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) - return -ENOMEM; - - oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (retval) - goto out; + int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { + struct _cpuid4_info *this_leaf; this_leaf = CPUID4_INFO_IDX(cpu, j); - retval = cpuid4_cache_lookup(j, this_leaf); - if (unlikely(retval < 0)) { + *retval = cpuid4_cache_lookup(j, this_leaf); + if (unlikely(*retval < 0)) { int i; for (i = 0; i < j; i++) @@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) } cache_shared_cpu_map_setup(cpu, j); } - set_cpus_allowed_ptr(current, &oldmask); +} + +static int __cpuinit detect_cache_attributes(unsigned int cpu) +{ + int retval; + + if (num_cache_leaves == 0) + return -ENOENT; + + per_cpu(cpuid4_info, cpu) = kzalloc( + sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); + if (per_cpu(cpuid4_info, cpu) == NULL) + return -ENOMEM; -out: + smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { kfree(per_cpu(cpuid4_info, cpu)); per_cpu(cpuid4_info, cpu) = NULL; @@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, cpumask_t *mask = &this_leaf->shared_cpu_map; n = type? - cpulist_scnprintf(buf, len-2, *mask): - cpumask_scnprintf(buf, len-2, *mask); + cpulist_scnprintf(buf, len-2, mask) : + cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; buf[n] = '\0'; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 5eb390a4b2e..a1de80f368f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ * CPU Initialization */ +struct thresh_restart { + struct threshold_block *b; + int reset; + u16 old_limit; +}; + /* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_block *b, - int reset, u16 old_limit) +static long threshold_restart_bank(void *_tr) { + struct thresh_restart *tr = _tr; u32 mci_misc_hi, mci_misc_lo; - rdmsr(b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - reset = 1; /* limit cannot be lower than err count */ + if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ - if (reset) { /* reset err count and overflow bit */ + if (tr->reset) { /* reset err count and overflow bit */ mci_misc_hi = (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - b->threshold_limit); - } else if (old_limit) { /* change limit w/o reset */ + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (old_limit - b->threshold_limit); + (tr->old_limit - tr->b->threshold_limit); mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - b->interrupt_enable ? + tr->b->interrupt_enable ? (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(b->address, mci_misc_lo, mci_misc_hi); + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + return 0; } /* cpu init entry point, called from mce.c with preempt off */ @@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int cpu = smp_processor_id(); u8 lvt_off; u32 low = 0, high = 0, address = 0; + struct thresh_restart tr; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) wrmsr(address, low, high); threshold_defaults.address = address; - threshold_restart_bank(&threshold_defaults, 0, 0); + tr.b = &threshold_defaults; + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); } } } @@ -251,20 +262,6 @@ struct threshold_attr { ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static void affinity_set(unsigned int cpu, cpumask_t *oldmask, - cpumask_t *newmask) -{ - *oldmask = current->cpus_allowed; - cpus_clear(*newmask); - cpu_set(cpu, *newmask); - set_cpus_allowed_ptr(current, newmask); -} - -static void affinity_restore(const cpumask_t *oldmask) -{ - set_cpus_allowed_ptr(current, oldmask); -} - #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ { \ @@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; b->interrupt_enable = !!new; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, 0); - affinity_restore(&oldmask); + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } @@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; - u16 old; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; @@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b, new = THRESHOLD_MAX; if (new < 1) new = 1; - old = b->threshold_limit; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; + tr.b = b; + tr.reset = 0; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, old); - affinity_restore(&oldmask); + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } -static ssize_t show_error_count(struct threshold_block *b, char *buf) +static long local_error_count(void *_b) { - u32 high, low; - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); + struct threshold_block *b = _b; + u32 low, high; + rdmsr(b->address, low, high); - affinity_restore(&oldmask); - return sprintf(buf, "%x\n", - (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); + return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); } static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 1, 0); - affinity_restore(&oldmask); + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return 1; } @@ -463,12 +462,19 @@ out_free: return err; } +static long local_allocate_threshold_blocks(void *_bank) +{ + unsigned int *bank = _bank; + + return allocate_threshold_blocks(smp_processor_id(), *bank, 0, + MSR_IA32_MC0_MISC + *bank * 4); +} + /* symlinks sibling shared banks to first core. first core owns dir/files. */ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - cpumask_t oldmask, newmask; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; - affinity_set(cpu, &oldmask, &newmask); - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(&oldmask); - + err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); if (err) goto out_free; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 26855381790..d84a852e4cd 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,34 +29,17 @@ #include <mach_ipi.h> -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) -static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct notifier_block *self, - unsigned long val, void *data) +static void kdump_nmi_callback(int cpu, struct die_args *args) { struct pt_regs *regs; #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; #endif - int cpu; - if (val != DIE_NMI_IPI) - return NOTIFY_OK; - - regs = ((struct die_args *)data)->regs; - cpu = raw_smp_processor_id(); - - /* Don't do anything if this handler is invoked on crashing cpu. - * Otherwise, system will completely hang. Crashing cpu can get - * an NMI if system was initially booted with nmi_watchdog parameter. - */ - if (cpu == crashing_cpu) - return NOTIFY_STOP; - local_irq_disable(); + regs = args->regs; #ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { @@ -65,54 +48,19 @@ static int crash_nmi_callback(struct notifier_block *self, } #endif crash_save_cpu(regs, cpu); - disable_local_APIC(); - atomic_dec(&waiting_for_crash_ipi); - /* Assume hlt works */ - halt(); - for (;;) - cpu_relax(); - - return 1; -} -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); + disable_local_APIC(); } -static struct notifier_block crash_nmi_nb = { - .notifier_call = crash_nmi_callback, -}; - -static void nmi_shootdown_cpus(void) +static void kdump_nmi_shootdown_cpus(void) { - unsigned long msecs; - - atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? */ - if (register_die_notifier(&crash_nmi_nb)) - return; /* return what? */ - /* Ensure the new callback function is set before sending - * out the NMI - */ - wmb(); + nmi_shootdown_cpus(kdump_nmi_callback); - smp_send_nmi_allbutself(); - - msecs = 1000; /* Wait at most a second for the other cpus to stop */ - while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { - mdelay(1); - msecs--; - } - - /* Leave the nmi callback set */ disable_local_APIC(); } + #else -static void nmi_shootdown_cpus(void) +static void kdump_nmi_shootdown_cpus(void) { /* There are no cpus to shootdown */ } @@ -131,9 +79,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) /* The kernel is broken so disable interrupts */ local_irq_disable(); - /* Make a note of crashing cpu. Will be used in NMI callback.*/ - crashing_cpu = safe_smp_processor_id(); - nmi_shootdown_cpus(); + kdump_nmi_shootdown_cpus(); lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index a2d1176c38e..19a8c2c0389 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -7,13 +7,12 @@ * * It manages: * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done) * - buffer access * * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks * * * Copyright (C) 2007-2008 Intel Corporation. @@ -28,6 +27,7 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/kernel.h> /* @@ -44,6 +44,33 @@ struct ds_configuration { }; static struct ds_configuration ds_cfg; +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. + */ +struct ds_tracer { + /* the DS context (partially) owned by this tracer */ + struct ds_context *context; + /* the buffer provided on ds_request() and its size in bytes */ + void *buffer; + size_t size; +}; + +struct bts_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* buffer overflow notification function */ + bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* buffer overflow notification function */ + pebs_ovfl_callback_t ovfl; +}; /* * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -107,34 +134,13 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, (*(unsigned long *)base) = value; } +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ -/* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. - */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); /* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. + * Locking is done only for allocating BTS or PEBS resources. */ -static inline int ds_validate_access(struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return -EPERM; - - if (context->owner[qual] == current) - return 0; - - return -EPERM; -} +static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); /* @@ -183,51 +189,13 @@ static inline int check_tracer(struct task_struct *task) * * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - * requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - * non-existing context indicates an error. It acquires and releases - * the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size. */ static DEFINE_PER_CPU(struct ds_context *, system_context); #define this_system_context per_cpu(system_context, smp_processor_id()) -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. - */ static inline struct ds_context *ds_get_context(struct task_struct *task) { - struct ds_context *context; - unsigned long irq; - - spin_lock_irqsave(&ds_lock, irq); - - context = (task ? task->thread.ds_ctx : this_system_context); - if (context) - context->count++; - - spin_unlock_irqrestore(&ds_lock, irq); - - return context; -} - -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) -{ struct ds_context **p_context = (task ? &task->thread.ds_ctx : &this_system_context); struct ds_context *context = *p_context; @@ -238,16 +206,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) if (!context) return NULL; - context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); - if (!context->ds) { - kfree(context); - return NULL; - } - spin_lock_irqsave(&ds_lock, irq); if (*p_context) { - kfree(context->ds); kfree(context); context = *p_context; @@ -272,10 +233,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) return context; } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */ static inline void ds_put_context(struct ds_context *context) { unsigned long irq; @@ -296,13 +253,6 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - put_tracer(context->task); - - /* free any leftover buffers from tracers that did not - * deallocate them properly. */ - kfree(context->buffer[ds_bts]); - kfree(context->buffer[ds_pebs]); - kfree(context->ds); kfree(context); out: spin_unlock_irqrestore(&ds_lock, irq); @@ -312,345 +262,342 @@ static inline void ds_put_context(struct ds_context *context) /* * Handle a buffer overflow * - * task: the task whose buffers are overflowing; - * NULL for a buffer overflow on the current cpu * context: the ds context * qual: the buffer type */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return; - - if (context->callback[qual]) - (*context->callback[qual])(task); - - /* todo: do some more overflow handling */ +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) +{ + switch (qual) { + case ds_bts: { + struct bts_tracer *tracer = + container_of(context->owner[qual], + struct bts_tracer, ds); + if (tracer->ovfl) + tracer->ovfl(tracer); + } + break; + case ds_pebs: { + struct pebs_tracer *tracer = + container_of(context->owner[qual], + struct pebs_tracer, ds); + if (tracer->ovfl) + tracer->ovfl(tracer); + } + break; + } } -/* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. - * - * Returns the buffer, if successful; - * NULL, if out of memory or rlimit exceeded. - * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved - */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) +static void ds_install_ds_config(struct ds_context *context, + enum ds_qualifier qual, + void *base, size_t size, size_t ith) { - unsigned long rlim, vm, pgsz; - void *buffer; - - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - return NULL; + unsigned long buffer, adj; - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) - return NULL; + /* adjust the buffer address and size to meet alignment + * constraints: + * - buffer is double-word aligned + * - size is multiple of record size + * + * We checked the size at the very beginning; we have enough + * space to do the adjustment. + */ + buffer = (unsigned long)base; - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - return NULL; + adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; + buffer += adj; + size -= adj; - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + size /= ds_cfg.sizeof_rec[qual]; + size *= ds_cfg.sizeof_rec[qual]; - if (pages) - *pages = pgsz; + ds_set(context->ds, qual, ds_buffer_base, buffer); + ds_set(context->ds, qual, ds_index, buffer); + ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); - return buffer; + /* The value for 'no threshold' is -1, which will set the + * threshold outside of the buffer, just like we want it. + */ + ds_set(context->ds, qual, + ds_interrupt_threshold, buffer + size - ith); } -static int ds_request(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl, enum ds_qualifier qual) +static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, + struct task_struct *task, + void *base, size_t size, size_t th) { struct ds_context *context; - unsigned long buffer, adj; - const unsigned long alignment = (1 << 3); unsigned long irq; - int error = 0; + int error; + error = -EOPNOTSUPP; if (!ds_cfg.sizeof_ds) - return -EOPNOTSUPP; + goto out; + + error = -EINVAL; + if (!base) + goto out; /* we require some space to do alignment adjustments below */ - if (size < (alignment + ds_cfg.sizeof_rec[qual])) - return -EINVAL; + error = -EINVAL; + if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + goto out; - /* buffer overflow notification is not yet implemented */ - if (ovfl) - return -EOPNOTSUPP; + if (th != (size_t)-1) { + th *= ds_cfg.sizeof_rec[qual]; + + error = -EINVAL; + if (size <= th) + goto out; + } + tracer->buffer = base; + tracer->size = size; - context = ds_alloc_context(task); + error = -ENOMEM; + context = ds_get_context(task); if (!context) - return -ENOMEM; + goto out; + tracer->context = context; + spin_lock_irqsave(&ds_lock, irq); error = -EPERM; if (!check_tracer(task)) goto out_unlock; - get_tracer(task); - error = -EALREADY; - if (context->owner[qual] == current) - goto out_put_tracer; error = -EPERM; - if (context->owner[qual] != NULL) + if (context->owner[qual]) goto out_put_tracer; - context->owner[qual] = current; + context->owner[qual] = tracer; spin_unlock_irqrestore(&ds_lock, irq); - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &context->pages[qual]); - if (!base) - goto out_release; - - context->buffer[qual] = base; - } - error = 0; + ds_install_ds_config(context, qual, base, size, th); - context->callback[qual] = ovfl; - - /* adjust the buffer address and size to meet alignment - * constraints: - * - buffer is double-word aligned - * - size is multiple of record size - * - * We checked the size at the very beginning; we have enough - * space to do the adjustment. - */ - buffer = (unsigned long)base; - - adj = ALIGN(buffer, alignment) - buffer; - buffer += adj; - size -= adj; - - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; - - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); - - if (ovfl) { - /* todo: select a suitable interrupt threshold */ - } else - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size + 1); - - /* we keep the context until ds_release */ - return error; - - out_release: - context->owner[qual] = NULL; - ds_put_context(context); - put_tracer(task); - return error; + return 0; out_put_tracer: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); put_tracer(task); - return error; - out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(context); + tracer->context = NULL; + out: return error; } -int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) +struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th) { - return ds_request(task, base, size, ovfl, ds_bts); -} + struct bts_tracer *tracer; + int error; -int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_pebs); + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; + + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; + + error = ds_request(&tracer->ds, ds_bts, task, base, size, th); + if (error < 0) + goto out_tracer; + + return tracer; + + out_tracer: + kfree(tracer); + out: + return ERR_PTR(error); } -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th) { - struct ds_context *context; + struct pebs_tracer *tracer; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) goto out; - kfree(context->buffer[qual]); - context->buffer[qual] = NULL; + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; - current->mm->total_vm -= context->pages[qual]; - current->mm->locked_vm -= context->pages[qual]; - context->pages[qual] = 0; - context->owner[qual] = NULL; + error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); + if (error < 0) + goto out_tracer; - /* - * we put the context twice: - * once for the ds_get_context - * once for the corresponding ds_request - */ - ds_put_context(context); + return tracer; + + out_tracer: + kfree(tracer); out: - ds_put_context(context); - return error; + return ERR_PTR(error); } -int ds_release_bts(struct task_struct *task) +static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) { - return ds_release(task, ds_bts); + BUG_ON(tracer->context->owner[qual] != tracer); + tracer->context->owner[qual] = NULL; + + put_tracer(tracer->context->task); + ds_put_context(tracer->context); } -int ds_release_pebs(struct task_struct *task) +int ds_release_bts(struct bts_tracer *tracer) { - return ds_release(task, ds_pebs); + if (!tracer) + return -EINVAL; + + ds_release(&tracer->ds, ds_bts); + kfree(tracer); + + return 0; } -static int ds_get_index(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) +int ds_release_pebs(struct pebs_tracer *tracer) { - struct ds_context *context; - unsigned long base, index; - int error; + if (!tracer) + return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + ds_release(&tracer->ds, ds_pebs); + kfree(tracer); + + return 0; +} + +static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) +{ + unsigned long base, index; base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); - error = ((index - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; + return (index - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_index(struct task_struct *task, size_t *pos) +int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos) { - return ds_get_index(task, pos, ds_bts); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_index(tracer->ds.context, ds_bts); + + return 0; } -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) { - return ds_get_index(task, pos, ds_pebs); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_index(tracer->ds.context, ds_pebs); + + return 0; } -static int ds_get_end(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) +static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) { - struct ds_context *context; - unsigned long base, end; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + unsigned long base, max; base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + max = ds_get(context->ds, qual, ds_absolute_maximum); - error = ((end - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; + return (max - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_end(struct task_struct *task, size_t *pos) +int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos) { - return ds_get_end(task, pos, ds_bts); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_end(tracer->ds.context, ds_bts); + + return 0; } -int ds_get_pebs_end(struct task_struct *task, size_t *pos) +int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos) { - return ds_get_end(task, pos, ds_pebs); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_end(tracer->ds.context, ds_pebs); + + return 0; } -static int ds_access(struct task_struct *task, size_t index, - const void **record, enum ds_qualifier qual) +static int ds_access(struct ds_context *context, enum ds_qualifier qual, + size_t index, const void **record) { - struct ds_context *context; unsigned long base, idx; - int error; if (!record) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - base = ds_get(context->ds, qual, ds_buffer_base); idx = base + (index * ds_cfg.sizeof_rec[qual]); - error = -EINVAL; if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - goto out; + return -EINVAL; *record = (const void *)idx; - error = ds_cfg.sizeof_rec[qual]; - out: - ds_put_context(context); - return error; + + return ds_cfg.sizeof_rec[qual]; } -int ds_access_bts(struct task_struct *task, size_t index, const void **record) +int ds_access_bts(struct bts_tracer *tracer, size_t index, + const void **record) { - return ds_access(task, index, record, ds_bts); + if (!tracer) + return -EINVAL; + + return ds_access(tracer->ds.context, ds_bts, index, record); } -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) +int ds_access_pebs(struct pebs_tracer *tracer, size_t index, + const void **record) { - return ds_access(task, index, record, ds_pebs); + if (!tracer) + return -EINVAL; + + return ds_access(tracer->ds.context, ds_pebs, index, record); } -static int ds_write(struct task_struct *task, const void *record, size_t size, - enum ds_qualifier qual, int force) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) { - struct ds_context *context; - int error; + int bytes_written = 0; if (!record) return -EINVAL; - error = -EPERM; - context = ds_get_context(task); - if (!context) - goto out; - - if (!force) { - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - } - - error = 0; while (size) { unsigned long base, index, end, write_end, int_th; unsigned long write_size, adj_write_size; @@ -678,14 +625,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size, write_end = end; if (write_end <= index) - goto out; + break; write_size = min((unsigned long) size, write_end - index); memcpy((void *)index, record, write_size); record = (const char *)record + write_size; - size -= write_size; - error += write_size; + size -= write_size; + bytes_written += write_size; adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; @@ -700,47 +647,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size, ds_set(context->ds, qual, ds_index, index); if (index >= int_th) - ds_overflow(task, context, qual); + ds_overflow(context, qual); } - out: - ds_put_context(context); - return error; + return bytes_written; } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) { - return ds_write(task, record, size, ds_bts, /* force = */ 0); -} + if (!tracer) + return -EINVAL; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 0); + return ds_write(tracer->ds.context, ds_bts, record, size); } -int ds_unchecked_write_bts(struct task_struct *task, - const void *record, size_t size) +int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) { - return ds_write(task, record, size, ds_bts, /* force = */ 1); -} + if (!tracer) + return -EINVAL; -int ds_unchecked_write_pebs(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 1); + return ds_write(tracer->ds.context, ds_pebs, record, size); } -static int ds_reset_or_clear(struct task_struct *task, - enum ds_qualifier qual, int clear) +static void ds_reset_or_clear(struct ds_context *context, + enum ds_qualifier qual, int clear) { - struct ds_context *context; unsigned long base, end; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; base = ds_get(context->ds, qual, ds_buffer_base); end = ds_get(context->ds, qual, ds_absolute_maximum); @@ -749,70 +681,69 @@ static int ds_reset_or_clear(struct task_struct *task, memset((void *)base, 0, end - base); ds_set(context->ds, qual, ds_index, base); - - error = 0; - out: - ds_put_context(context); - return error; } -int ds_reset_bts(struct task_struct *task) +int ds_reset_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + + return 0; } -int ds_reset_pebs(struct task_struct *task) +int ds_reset_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + + return 0; } -int ds_clear_bts(struct task_struct *task) +int ds_clear_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); + + return 0; } -int ds_clear_pebs(struct task_struct *task) +int ds_clear_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + + return 0; } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) { - struct ds_context *context; - int error; + if (!tracer) + return -EINVAL; if (!value) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); - - error = 0; - out: - ds_put_context(context); - return error; + return 0; } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) { - struct ds_context *context; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + if (!tracer) + return -EINVAL; - *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; - error = 0; - out: - ds_put_context(context); - return error; + return 0; } static const struct ds_configuration ds_cfg_var = { @@ -840,6 +771,10 @@ static inline void ds_configure(const struct ds_configuration *cfg) { ds_cfg = *cfg; + + printk(KERN_INFO "DS available\n"); + + BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -847,17 +782,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { + case 0 ... 0xC: + /* sorry, don't know about them */ + break; case 0xD: case 0xE: /* Pentium M */ ds_configure(&ds_cfg_var); break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ + default: /* Core2, Atom, ... */ ds_configure(&ds_cfg_64); break; - default: - /* sorry, don't know about them */ - break; } break; case 0xF: @@ -884,6 +818,8 @@ void ds_free(struct ds_context *context) * is dying. There should not be any user of that context left * to disturb us, anymore. */ unsigned long leftovers = context->count; - while (leftovers--) + while (leftovers--) { + put_tracer(context->task); ds_put_context(context); + } } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 00000000000..6b1f6f6f866 --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,351 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + +#include "dumpstack.h" + +int panic_on_unrecovered_nmi; +unsigned int code_bytes = 64; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ + struct task_struct *task = tinfo->task; + unsigned long ret_addr; + int index = task->curr_ret_stack; + + if (addr != (unsigned long)return_to_handler) + return; + + if (!task->ret_stack || index < *graph) + return; + + index -= *graph; + ret_addr = task->ret_stack[index].ret; + + ops->address(data, ret_addr, 1); + + (*graph)++; +} +#else +static inline void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ } +#endif + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + stack++; + } + return bp; +} + + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + + if (!signr) + return; + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ +#ifdef CONFIG_X86_32 + unsigned short ss; + unsigned long sp; +#endif + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); +#ifdef CONFIG_X86_32 + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); +#else + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); +#endif + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + if (!user_mode_vm(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + flags = oops_begin(); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + oops_end(flags, regs, 0); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h new file mode 100644 index 00000000000..da87590b869 --- /dev/null +++ b/arch/x86/kernel/dumpstack.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + +#ifndef DUMPSTACK_H +#define DUMPSTACK_H + +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; +extern int kstack_depth_to_print; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; +#endif diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b3614752197..d593cd1f58d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,69 +17,14 @@ #include <asm/stacktrace.h> -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} +#include "dumpstack.h" void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + int graph = 0; + if (!task) task = current; @@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL); + bp = print_context_stack(context, stack, bp, ops, + data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) @@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { @@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - unsigned long flags; - - oops_enter(); - - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } - die_nest_count++; - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (die_nest_count < 3) { - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - - oops_end(flags, regs, SIGSEGV); -} - -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - bust_spinlocks(0); - do_exit(SIGSEGV); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 96a5db7da8a..c302d070704 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -17,19 +17,7 @@ #include <asm/stacktrace.h> -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} +#include "dumpstack.h" static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) @@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; + int graph = 0; if (!task) task = current; @@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, break; bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); + data, estack_end, &graph); ops->stack(data, "<EOE>"); /* * We link to the next stack via the @@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); + ops, data, irqstack_end, &graph); /* * We link to the next stack (which would be * the process stack normally) the last @@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - void show_registers(struct pt_regs *regs) { int i; @@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - if (!regs) { - oops_exit(); - return; - } - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); - return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); -} - -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - flags = oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca..43ceb3f454b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1157,6 +1157,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1171,6 +1174,11 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -1180,8 +1188,18 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif .globl ftrace_stub ftrace_stub: ret @@ -1200,12 +1218,43 @@ trace: popl %edx popl %ecx popl %eax - jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif + .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a..54e0bbdccb9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -68,6 +68,8 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub /* taken from glibc */ subq $0x38, %rsp @@ -96,6 +98,12 @@ ftrace_call: movq (%rsp), %rax addq $0x38, %rsp +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + .globl ftrace_stub ftrace_stub: retq @@ -103,8 +111,20 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq @@ -140,6 +160,69 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 64(%rsp) + + call ftrace_return_to_handler + + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $72, %rsp + retq +#endif + + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 0aa2c443d60..53699c931ad 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -38,8 +38,11 @@ #include <asm/io.h> #include <asm/nmi.h> #include <asm/smp.h> +#include <asm/atomic.h> #include <asm/apicdef.h> #include <mach_mpparse.h> +#include <asm/genapic.h> +#include <asm/setup.h> /* * ES7000 chipsets @@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; +} + +static void noop_wait_for_deassert(atomic_t *deassert_not_used) +{ +} + +static int __init es7000_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + + /* MPENTIUMIII */ + if (boot_cpu_data.x86 == 6 && + (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { + es7000_update_genapic_to_cluster(); + genapic->wait_for_init_deassert = noop_wait_for_deassert; + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + } + + return 0; +} + void __init setup_unisys(void) { @@ -176,6 +216,8 @@ setup_unisys(void) else es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; + + x86_quirks->update_genapic = es7000_update_genapic; } /* @@ -317,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg) return status; } -int -es7000_start_cpu(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; - -} - void __init es7000_sw_apic(void) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 50ea0ac8c9b..1b43086b097 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -14,14 +14,17 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/percpu.h> +#include <linux/sched.h> #include <linux/init.h> #include <linux/list.h> #include <asm/ftrace.h> +#include <linux/ftrace.h> #include <asm/nops.h> +#include <asm/nmi.h> -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; +#ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -31,18 +34,12 @@ union ftrace_code_union { } __attribute__((packed)); }; - static int ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) -{ - return ftrace_nop; -} - -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } -int +/* + * Modifying code must take extra care. On an SMP machine, if + * the code being modified is also being executed on another CPU + * that CPU will have undefined results and possibly take a GPF. + * We use kstop_machine to stop other CPUS from exectuing code. + * But this does not stop NMIs from happening. We still need + * to protect against that. We separate out the modification of + * the code to take care of this. + * + * Two buffers are added: An IP buffer and a "code" buffer. + * + * 1) Put the instruction pointer into the IP buffer + * and the new code into the "code" buffer. + * 2) Set a flag that says we are modifying code + * 3) Wait for any running NMIs to finish. + * 4) Write the code + * 5) clear the flag. + * 6) Wait for any running NMIs to finish. + * + * If an NMI is executed, the first thing it does is to call + * "ftrace_nmi_enter". This will check if the flag is set to write + * and if it is, it will write what is in the IP and "code" buffers. + * + * The trick is, it does not matter if everyone is writing the same + * content to the code location. Also, if a CPU is executing code + * it is OK to write to that code location if the contents being written + * are the same as what exists. + */ + +static atomic_t in_nmi = ATOMIC_INIT(0); +static int mod_code_status; /* holds return value of text write */ +static int mod_code_write; /* set when NMI should do the write */ +static void *mod_code_ip; /* holds the IP to write to */ +static void *mod_code_newcode; /* holds the text to write to the IP */ + +static unsigned nmi_wait_count; +static atomic_t nmi_update_count = ATOMIC_INIT(0); + +int ftrace_arch_read_dyn_info(char *buf, int size) +{ + int r; + + r = snprintf(buf, size, "%u %u", + nmi_wait_count, + atomic_read(&nmi_update_count)); + return r; +} + +static void ftrace_mod_code(void) +{ + /* + * Yes, more than one CPU process can be writing to mod_code_status. + * (and the code itself) + * But if one were to fail, then they all should, and if one were + * to succeed, then they all should. + */ + mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, + MCOUNT_INSN_SIZE); +} + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); + /* Must have in_nmi seen before reading write flag */ + smp_mb(); + if (mod_code_write) { + ftrace_mod_code(); + atomic_inc(&nmi_update_count); + } +} + +void ftrace_nmi_exit(void) +{ + /* Finish all executions before clearing in_nmi */ + smp_wmb(); + atomic_dec(&in_nmi); +} + +static void wait_for_nmi(void) +{ + int waited = 0; + + while (atomic_read(&in_nmi)) { + waited = 1; + cpu_relax(); + } + + if (waited) + nmi_wait_count++; +} + +static int +do_ftrace_mod_code(unsigned long ip, void *new_code) +{ + mod_code_ip = (void *)ip; + mod_code_newcode = new_code; + + /* The buffers need to be visible before we let NMIs write them */ + smp_wmb(); + + mod_code_write = 1; + + /* Make sure write bit is visible before we wait on NMIs */ + smp_mb(); + + wait_for_nmi(); + + /* Make sure all running NMIs have finished before we write the code */ + smp_mb(); + + ftrace_mod_code(); + + /* Make sure the write happens before clearing the bit */ + smp_wmb(); + + mod_code_write = 0; + + /* make sure NMIs see the cleared bit */ + smp_mb(); + + wait_for_nmi(); + + return mod_code_status; +} + + + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +static unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop; +} + +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return -EINVAL; /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + if (do_ftrace_mod_code(ip, new_code)) return -EPERM; sync_core(); @@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return 0; } +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(rec->ip, old, new); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static int ftrace_mod_jmp(unsigned long ip, + int old_offset, int new_offset) +{ + unsigned char code[MCOUNT_INSN_SIZE]; + + if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) + return -EINVAL; + + *(int *)(&code[1]) = new_offset; + + if (do_ftrace_mod_code(ip, &code)) + return -EPERM; + + return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +#else /* CONFIG_DYNAMIC_FTRACE */ + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} + +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func, int *depth) +{ + int index; + + if (!current->ret_stack) + return -EBUSY; + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + index = ++current->curr_ret_stack; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; + *depth = index; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +{ + int index; + + index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = index; + barrier(); + current->curr_ret_stack--; + +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + pop_return_trace(&trace, &ret); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_graph_return(&trace); + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + + return ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + struct ftrace_graph_ent trace; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (unlikely(atomic_read(&in_nmi))) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: " _ASM_MOV " (%[parent_old]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(2b, 3b) + + : [parent_replaced] "=r" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); + return; + } + + if (unlikely(!__kernel_text_address(old))) { + ftrace_graph_stop(); + *parent = old; + WARN_ON(1); + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, + self_addr, &trace.depth) == -EBUSY) { + *parent = old; + return; + } + + trace.func = self_addr; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + current->curr_ret_stack--; + *parent = old; + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 6c9bfc9e1e9..2bced78b0b8 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -21,6 +21,7 @@ #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> +#include <asm/setup.h> extern struct genapic apic_flat; extern struct genapic apic_physflat; @@ -53,6 +54,9 @@ void __init setup_apic_routing(void) genapic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } + + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index c0262791bda..7fa5f49c2dd 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static cpumask_t flat_target_cpus(void) +static const struct cpumask *flat_target_cpus(void) { - return cpu_online_map; + return cpu_online_mask; } -static cpumask_t flat_vector_allocation_domain(int cpu) +static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; } /* @@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static void flat_send_IPI_mask(cpumask_t cpumask, int vector) +static inline void _flat_send_IPI_mask(unsigned long mask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; unsigned long flags; local_irq_save(flags); @@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } +static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + + _flat_send_IPI_mask(mask, vector); +} + +static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, + int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + int cpu = smp_processor_id(); + + if (cpu < BITS_PER_LONG) + clear_bit(cpu, &mask); + _flat_send_IPI_mask(mask, vector); +} + static void flat_send_IPI_allbutself(int vector) { + int cpu = smp_processor_id(); #ifdef CONFIG_HOTPLUG_CPU int hotplug = 1; #else int hotplug = 0; #endif if (hotplug || vector == NMI_VECTOR) { - cpumask_t allbutme = cpu_online_map; + if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { + unsigned long mask = cpumask_bits(cpu_online_mask)[0]; - cpu_clear(smp_processor_id(), allbutme); + if (cpu < BITS_PER_LONG) + clear_bit(cpu, &mask); - if (!cpus_empty(allbutme)) - flat_send_IPI_mask(allbutme, vector); + _flat_send_IPI_mask(mask, vector); + } } else if (num_online_cpus() > 1) { __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); } @@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector) static void flat_send_IPI_all(int vector) { if (vector == NMI_VECTOR) - flat_send_IPI_mask(cpu_online_map, vector); + flat_send_IPI_mask(cpu_online_mask, vector); else __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } @@ -135,9 +155,18 @@ static int flat_apic_id_registered(void) return physid_isset(read_xapic_id(), phys_cpu_present_map); } -static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ + return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; +} + +static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) { - return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; + unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; + unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS; + + return mask1 & mask2; } static unsigned int phys_pkg_id(int index_msb) @@ -157,8 +186,10 @@ struct genapic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_allbutself = flat_send_IPI_allbutself, .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static cpumask_t physflat_target_cpus(void) +static const struct cpumask *physflat_target_cpus(void) { - return cpu_online_map; + return cpu_online_mask; } -static cpumask_t physflat_vector_allocation_domain(int cpu) +static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) { - return cpumask_of_cpu(cpu); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } -static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) +static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) { send_IPI_mask_sequence(cpumask, vector); } -static void physflat_send_IPI_allbutself(int vector) +static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, + int vector) { - cpumask_t allbutme = cpu_online_map; + send_IPI_mask_allbutself(cpumask, vector); +} - cpu_clear(smp_processor_id(), allbutme); - physflat_send_IPI_mask(allbutme, vector); +static void physflat_send_IPI_allbutself(int vector) +{ + send_IPI_mask_allbutself(cpu_online_mask, vector); } static void physflat_send_IPI_all(int vector) { - physflat_send_IPI_mask(cpu_online_map, vector); + physflat_send_IPI_mask(cpu_online_mask, vector); } -static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -224,13 +259,29 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int +physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_any_and(cpumask, andmask); + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + struct genapic apic_physflat = { .name = "physical flat", .acpi_madt_oem_check = physflat_acpi_madt_oem_check, @@ -243,8 +294,10 @@ struct genapic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_mask = physflat_send_IPI_mask, + .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index f6a2c8eb48a..4716a0c9f93 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t x2apic_target_cpus(void) +static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } /* * for now each logical cpu is in its own vector allocation domain. */ -static cpumask_t x2apic_vector_allocation_domain(int cpu) +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, @@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, * at once. We have 16 cpu's in a cluster. This will minimize IPI register * writes. */ -static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned long flags; unsigned long query_cpu; local_irq_save(flags); - for_each_cpu_mask(query_cpu, mask) { - __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); - } + for_each_cpu(query_cpu, mask) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } -static void x2apic_send_IPI_allbutself(int vector) +static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) { - cpumask_t mask = cpu_online_map; + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - cpu_clear(smp_processor_id(), mask); + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - x2apic_send_IPI_mask(mask, vector); + local_irq_save(flags); + for_each_online_cpu(query_cpu) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_map, vector); + x2apic_send_IPI_mask(cpu_online_mask, vector); } static int x2apic_apic_id_registered(void) @@ -89,7 +109,7 @@ static int x2apic_apic_id_registered(void) return 1; } -static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -97,13 +117,28 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + cpu = cpumask_first(cpumask); + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_logical_apicid, cpu); else return BAD_APICID; } +static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_any_and(cpumask, andmask); + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -150,8 +185,10 @@ struct genapic apic_x2apic_cluster = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_allbutself = x2apic_send_IPI_allbutself, .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_self = x2apic_send_IPI_self, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index d042211768b..b255507884f 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t x2apic_target_cpus(void) +static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } -static cpumask_t x2apic_vector_allocation_domain(int cpu) +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, @@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, x2apic_icr_write(cfg, apicid); } -static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned long flags; unsigned long query_cpu; local_irq_save(flags); - for_each_cpu_mask(query_cpu, mask) { + for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } -static void x2apic_send_IPI_allbutself(int vector) +static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) { - cpumask_t mask = cpu_online_map; + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} - cpu_clear(smp_processor_id(), mask); +static void x2apic_send_IPI_allbutself(int vector) +{ + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - x2apic_send_IPI_mask(mask, vector); + local_irq_save(flags); + for_each_online_cpu(query_cpu) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_map, vector); + x2apic_send_IPI_mask(cpu_online_mask, vector); } static int x2apic_apic_id_registered(void) @@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void) return 1; } -static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -95,13 +116,28 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + cpu = cpumask_first(cpumask); + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_any_and(cpumask, andmask); + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -145,8 +181,10 @@ struct genapic apic_x2apic_phys = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_allbutself = x2apic_send_IPI_allbutself, .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_self = x2apic_send_IPI_self, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 2c7dbdb9827..3984682cd84 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -75,16 +75,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t uv_target_cpus(void) +static const struct cpumask *uv_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } -static cpumask_t uv_vector_allocation_domain(int cpu) +static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) @@ -123,28 +122,37 @@ static void uv_send_IPI_one(int cpu, int vector) uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } -static void uv_send_IPI_mask(cpumask_t mask, int vector) +static void uv_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned int cpu; - for_each_possible_cpu(cpu) - if (cpu_isset(cpu, mask)) + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); +} + +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); + + for_each_cpu(cpu, mask) + if (cpu != this_cpu) uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - uv_send_IPI_mask(mask, vector); + for_each_online_cpu(cpu) + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_all(int vector) { - uv_send_IPI_mask(cpu_online_map, vector); + uv_send_IPI_mask(cpu_online_mask, vector); } static int uv_apic_id_registered(void) @@ -156,7 +164,7 @@ static void uv_init_apic_ldr(void) { } -static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -164,13 +172,28 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_any_and(cpumask, andmask); + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -218,8 +241,10 @@ struct genapic apic_x2apic_uv_x = { .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 067d8de913f..e76d7e27297 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -246,7 +246,7 @@ static void hpet_legacy_clockevent_register(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; printk(KERN_DEBUG "hpet clockevent registered\n"); @@ -301,7 +301,7 @@ static void hpet_set_mode(enum clock_event_mode mode, struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); hpet_setup_msi_irq(hdev->irq); disable_irq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } break; @@ -449,7 +449,7 @@ static int hpet_setup_irq(struct hpet_dev *dev) return -1; disable_irq(dev->irq); - irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); enable_irq(dev->irq); printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", @@ -500,7 +500,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) /* 5 usec minimum reprogramming delta. */ evt->min_delta_ns = 5000; - evt->cpumask = cpumask_of_cpu(hdev->cpu); + evt->cpumask = cpumask_of(hdev->cpu); clockevents_register_device(evt); } diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c1b5e3ece1f..10f92fb532f 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -114,7 +114,7 @@ void __init setup_pit_timer(void) * Start pit with the boot cpu mask and make it global after the * IO_APIC has been initialized. */ - pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + pit_clockevent.cpumask = cpumask_of(smp_processor_id()); pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_clockevent.shift); pit_clockevent.max_delta_ns = diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9043251210f..58938cc4b7d 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -108,94 +108,273 @@ static int __init parse_noapic(char *str) early_param("noapic", parse_noapic); struct irq_pin_list; + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +{ + struct irq_pin_list *pin; + int node; + + node = cpu_to_node(cpu); + + pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node); + + return pin; +} + struct irq_cfg { - unsigned int irq; struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; + cpumask_var_t domain; + cpumask_var_t old_domain; unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + u8 move_desc_pending : 1; +#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg irq_cfgx[] = { +#else static struct irq_cfg irq_cfgx[NR_IRQS] = { - [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +#endif + [0] = { .vector = IRQ0_VECTOR, }, + [1] = { .vector = IRQ1_VECTOR, }, + [2] = { .vector = IRQ2_VECTOR, }, + [3] = { .vector = IRQ3_VECTOR, }, + [4] = { .vector = IRQ4_VECTOR, }, + [5] = { .vector = IRQ5_VECTOR, }, + [6] = { .vector = IRQ6_VECTOR, }, + [7] = { .vector = IRQ7_VECTOR, }, + [8] = { .vector = IRQ8_VECTOR, }, + [9] = { .vector = IRQ9_VECTOR, }, + [10] = { .vector = IRQ10_VECTOR, }, + [11] = { .vector = IRQ11_VECTOR, }, + [12] = { .vector = IRQ12_VECTOR, }, + [13] = { .vector = IRQ13_VECTOR, }, + [14] = { .vector = IRQ14_VECTOR, }, + [15] = { .vector = IRQ15_VECTOR, }, }; -#define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) +void __init arch_early_irq_init(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + int count; + int i; + + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); + + for (i = 0; i < count; i++) { + desc = irq_to_desc(i); + desc->chip_data = &cfg[i]; + alloc_bootmem_cpumask_var(&cfg[i].domain); + alloc_bootmem_cpumask_var(&cfg[i].old_domain); + if (i < NR_IRQS_LEGACY) + cpumask_setall(cfg[i].domain); + } +} +#ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return irq < nr_irqs ? irq_cfgx + irq : NULL; + struct irq_cfg *cfg = NULL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + if (desc) + cfg = desc->chip_data; + + return cfg; +} + +static struct irq_cfg *get_one_free_irq_cfg(int cpu) +{ + struct irq_cfg *cfg; + int node; + + node = cpu_to_node(cpu); + + cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + if (cfg) { + /* FIXME: needs alloc_cpumask_var_node() */ + if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) { + kfree(cfg); + cfg = NULL; + } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) { + free_cpumask_var(cfg->domain); + kfree(cfg); + cfg = NULL; + } else { + cpumask_clear(cfg->domain); + cpumask_clear(cfg->old_domain); + } + } + printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node); + + return cfg; } -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +void arch_init_chip_data(struct irq_desc *desc, int cpu) { - return irq_cfg(irq); + struct irq_cfg *cfg; + + cfg = desc->chip_data; + if (!cfg) { + desc->chip_data = get_one_free_irq_cfg(cpu); + if (!desc->chip_data) { + printk(KERN_ERR "can not alloc irq_cfg\n"); + BUG_ON(1); + } + } } -/* - * Rough estimation of how many shared IRQs there are, can be changed - * anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ +static void +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +{ + struct irq_pin_list *old_entry, *head, *tail, *entry; -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; + cfg->irq_2_pin = NULL; + old_entry = old_cfg->irq_2_pin; + if (!old_entry) + return; + + entry = get_one_free_irq_2_pin(cpu); + if (!entry) + return; + + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + head = entry; + tail = entry; + old_entry = old_entry->next; + while (old_entry) { + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + entry = head; + while (entry) { + head = entry->next; + kfree(entry); + entry = head; + } + /* still use the old one */ + return; + } + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + tail->next = entry; + tail = entry; + old_entry = old_entry->next; + } -static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; -static struct irq_pin_list *irq_2_pin_ptr; + tail->next = NULL; + cfg->irq_2_pin = head; +} -static void __init irq_2_pin_init(void) +static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) { - struct irq_pin_list *pin = irq_2_pin_head; - int i; + struct irq_pin_list *entry, *next; + + if (old_cfg->irq_2_pin == cfg->irq_2_pin) + return; - for (i = 1; i < PIN_MAP_SIZE; i++) - pin[i-1].next = &pin[i]; + entry = old_cfg->irq_2_pin; - irq_2_pin_ptr = &pin[0]; + while (entry) { + next = entry->next; + kfree(entry); + entry = next; + } + old_cfg->irq_2_pin = NULL; } -static struct irq_pin_list *get_one_free_irq_2_pin(void) +void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) { - struct irq_pin_list *pin = irq_2_pin_ptr; + struct irq_cfg *cfg; + struct irq_cfg *old_cfg; - if (!pin) - panic("can not get more irq_2_pin\n"); + cfg = get_one_free_irq_cfg(cpu); - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; + if (!cfg) + return; + + desc->chip_data = cfg; + + old_cfg = old_desc->chip_data; + + memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + + init_copy_irq_2_pin(old_cfg, cfg, cpu); +} + +static void free_irq_cfg(struct irq_cfg *old_cfg) +{ + kfree(old_cfg); } +void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) +{ + struct irq_cfg *old_cfg, *cfg; + + old_cfg = old_desc->chip_data; + cfg = desc->chip_data; + + if (old_cfg == cfg) + return; + + if (old_cfg) { + free_irq_2_pin(old_cfg, cfg); + free_irq_cfg(old_cfg); + old_desc->chip_data = NULL; + } +} + +static void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg = desc->chip_data; + + if (!cfg->move_in_progress) { + /* it means that domain is not changed */ + if (!cpumask_intersects(&desc->affinity, mask)) + cfg->move_desc_pending = 1; + } +} +#endif + +#else +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? irq_cfgx + irq : NULL; +} + +#endif + +#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC +static inline void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ +} +#endif + struct io_apic { unsigned int index; unsigned int unused[3]; @@ -237,11 +416,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(unsigned int irq) +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) { struct irq_pin_list *entry; unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); entry = cfg->irq_2_pin; @@ -323,13 +501,32 @@ static void ioapic_mask_entry(int apic, int pin) } #ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; + u8 vector = cfg->vector; - cfg = irq_cfg(irq); entry = cfg->irq_2_pin; for (;;) { unsigned int reg; @@ -359,36 +556,61 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) } } -static int assign_irq_vector(int irq, cpumask_t mask); +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +/* + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid + * of that, or returns BAD_APICID and leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_and(&desc->affinity, cfg->domain, mask); + set_extra_move_desc(desc, mask); + return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); +} + +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; + irq = desc->irq; + cfg = desc->chip_data; - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); + + set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -397,16 +619,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) +static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) { - struct irq_cfg *cfg; struct irq_pin_list *entry; - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(); + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", + apic, pin); + return; + } cfg->irq_2_pin = entry; entry->apic = apic; entry->pin = pin; @@ -421,7 +645,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(); + entry->next = get_one_free_irq_2_pin(cpu); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -430,11 +654,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq(unsigned int irq, +static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_cfg *cfg = irq_cfg(irq); struct irq_pin_list *entry = cfg->irq_2_pin; int replaced = 0; @@ -451,18 +674,16 @@ static void __init replace_pin_at_irq(unsigned int irq, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq(irq, newapic, newpin); + add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); } -static inline void io_apic_modify_irq(unsigned int irq, +static inline void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { int pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; - cfg = irq_cfg(irq); for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; pin = entry->pin; @@ -475,9 +696,9 @@ static inline void io_apic_modify_irq(unsigned int irq, } } -static void __unmask_IO_APIC_irq(unsigned int irq) +static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } #ifdef CONFIG_X86_64 @@ -492,47 +713,64 @@ void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } #else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); } -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, IO_APIC_REDIR_MASKED, NULL); } -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } #endif /* CONFIG_X86_32 */ -static void mask_IO_APIC_irq (unsigned int irq) +static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; + BUG_ON(!cfg); + spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); + __mask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_IO_APIC_irq (unsigned int irq) +static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } +static void mask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq_desc(desc); +} +static void unmask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unmask_IO_APIC_irq_desc(desc); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -809,7 +1047,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); */ static int EISA_ELCR(unsigned int irq) { - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1034,7 +1272,8 @@ void unlock_vector_lock(void) spin_unlock(&vector_lock); } -static int __assign_irq_vector(int irq, cpumask_t mask) +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@ -1049,39 +1288,39 @@ static int __assign_irq_vector(int irq, cpumask_t mask) */ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); - - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); + int cpu, err; + cpumask_var_t tmp_mask; if ((cfg->move_in_progress) || cfg->move_cleanup_count) return -EBUSY; + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; + old_vector = cfg->vector; if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); return 0; + } } - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { int new_cpu; int vector, offset; - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); + vector_allocation_domain(cpu, tmp_mask); vector = current_vector; offset = current_offset; next: vector += 8; if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ + /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; } @@ -1094,7 +1333,7 @@ next: if (vector == SYSCALL_VECTOR) goto next; #endif - for_each_cpu_mask_nr(new_cpu, new_mask) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ @@ -1102,49 +1341,47 @@ next: current_offset = offset; if (old_vector) { cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; + cpumask_copy(cfg->old_domain, cfg->domain); } - for_each_cpu_mask_nr(new_cpu, new_mask) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; - cfg->domain = domain; - return 0; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; } - return -ENOSPC; + free_cpumask_var(tmp_mask); + return err; } -static int assign_irq_vector(int irq, cpumask_t mask) +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); + err = __assign_irq_vector(irq, cfg, mask); spin_unlock_irqrestore(&vector_lock, flags); return err; } -static void __clear_irq_vector(int irq) +static void __clear_irq_vector(int irq, struct irq_cfg *cfg) { - struct irq_cfg *cfg; - cpumask_t mask; int cpu, vector; - cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cpus_clear(cfg->domain); + cpumask_clear(cfg->domain); if (likely(!cfg->move_in_progress)) return; - cpus_and(mask, cfg->old_domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -1162,10 +1399,14 @@ void __setup_vector_irq(int cpu) /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; + struct irq_desc *desc; /* Mark the inuse vectors */ - for_each_irq_cfg(irq, cfg) { - if (!cpu_isset(cpu, cfg->domain)) + for_each_irq_desc(irq, desc) { + if (!desc) + continue; + cfg = desc->chip_data; + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; @@ -1177,7 +1418,7 @@ void __setup_vector_irq(int cpu) continue; cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@ -1215,11 +1456,8 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) @@ -1311,23 +1549,22 @@ static int setup_ioapic_entry(int apic, int irq, return 0; } -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, int trigger, int polarity) { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - cpumask_t mask; + unsigned int dest; if (!IO_APIC_IRQ(irq)) return; - cfg = irq_cfg(irq); + cfg = desc->chip_data; - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) + if (assign_irq_vector(irq, cfg, TARGET_CPUS)) return; - cpus_and(mask, cfg->domain, mask); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@ -1337,16 +1574,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { + dest, trigger, polarity, cfg->vector)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); return; } - ioapic_register_intr(irq, trigger); - if (irq < 16) + ioapic_register_intr(irq, desc, trigger); + if (irq < NR_IRQS_LEGACY) disable_8259A_irq(irq); ioapic_write_entry(apic, pin, entry); @@ -1356,6 +1592,9 @@ static void __init setup_IO_APIC_irqs(void) { int apic, pin, idx, irq; int notcon = 0; + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1387,9 +1626,15 @@ static void __init setup_IO_APIC_irqs(void) if (multi_timer_check(apic, irq)) continue; #endif - add_pin_to_irq(irq, apic, pin); + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; + } + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, apic, pin); - setup_IO_APIC_irq(apic, pin, irq, + setup_IO_APIC_irq(apic, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } } @@ -1448,6 +1693,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + struct irq_desc *desc; unsigned int irq; if (apic_verbosity == APIC_QUIET) @@ -1537,8 +1783,13 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(irq, cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; + for_each_irq_desc(irq, desc) { + struct irq_pin_list *entry; + + if (!desc) + continue; + cfg = desc->chip_data; + entry = cfg->irq_2_pin; if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); @@ -2022,14 +2273,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; + struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; } - __unmask_IO_APIC_irq(irq); + cfg = irq_cfg(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2043,7 +2296,7 @@ static int ioapic_retrigger_irq(unsigned int irq) unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2092,35 +2345,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); * as simple as edge triggered migration and we can do the irq migration * with a simple atomic update to IO-APIC RTE. */ -static void migrate_ioapic_irq(int irq, cpumask_t mask) +static void +migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; struct irte irte; int modify_ioapic_rte; unsigned int dest; unsigned long flags; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpumask_intersects(mask, cpu_online_mask)) return; + irq = desc->irq; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + set_extra_move_desc(desc, mask); + + dest = cpu_mask_to_apicid_and(cfg->domain, mask); - desc = irq_to_desc(irq); modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); + __target_IO_APIC_irq(irq, dest, cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2132,24 +2385,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) */ modify_irte(irq, &irte); - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (cfg->move_in_progress) + send_cleanup_vector(cfg); - desc->affinity = mask; + cpumask_copy(&desc->affinity, mask); } -static int migrate_irq_remapped_level(int irq) +static int migrate_irq_remapped_level_desc(struct irq_desc *desc) { int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); - if (io_apic_level_ack_pending(irq)) { + if (io_apic_level_ack_pending(cfg)) { /* * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse @@ -2161,14 +2410,15 @@ static int migrate_irq_remapped_level(int irq) } /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); + migrate_ioapic_irq_desc(desc, &desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); unmask: - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); + return ret; } @@ -2178,6 +2428,9 @@ static void ir_irq_migration(struct work_struct *work) struct irq_desc *desc; for_each_irq_desc(irq, desc) { + if (!desc) + continue; + if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -2189,7 +2442,7 @@ static void ir_irq_migration(struct work_struct *work) continue; } - desc->chip->set_affinity(irq, desc->pending_mask); + desc->chip->set_affinity(irq, &desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -2198,18 +2451,24 @@ static void ir_irq_migration(struct work_struct *work) /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); + cpumask_copy(&desc->pending_mask, mask); + migrate_irq_remapped_level_desc(desc); return; } - migrate_ioapic_irq(irq, mask); + migrate_ioapic_irq_desc(desc, mask); +} +static void set_ir_ioapic_affinity_irq(unsigned int irq, + const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + set_ir_ioapic_affinity_irq_desc(desc, mask); } #endif @@ -2229,6 +2488,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; + if (irq == -1) + continue; + desc = irq_to_desc(irq); if (!desc) continue; @@ -2238,7 +2500,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) if (!cfg->move_cleanup_count) goto unlock; - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; __get_cpu_var(vector_irq)[vector] = -1; @@ -2250,28 +2512,44 @@ unlock: irq_exit(); } -static void irq_complete_move(unsigned int irq) +static void irq_complete_move(struct irq_desc **descp) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_desc *desc = *descp; + struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) + if (likely(!cfg->move_in_progress)) { +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + if (likely(!cfg->move_desc_pending)) + return; + + /* domain is not change, but affinity is changed */ + me = smp_processor_id(); + if (cpu_isset(me, desc->affinity)) { + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; + cfg->move_desc_pending = 0; + } +#endif return; + } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; +#endif - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); } #else -static inline void irq_complete_move(unsigned int irq) {} +static inline void irq_complete_move(struct irq_desc **descp) {} #endif + #ifdef CONFIG_INTR_REMAP static void ack_x2apic_level(unsigned int irq) { @@ -2282,11 +2560,14 @@ static void ack_x2apic_edge(unsigned int irq) { ack_x2APIC_irq(); } + #endif static void ack_apic_edge(unsigned int irq) { - irq_complete_move(irq); + struct irq_desc *desc = irq_to_desc(irq); + + irq_complete_move(&desc); move_native_irq(irq); ack_APIC_irq(); } @@ -2295,18 +2576,21 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { + struct irq_desc *desc = irq_to_desc(irq); + #ifdef CONFIG_X86_32 unsigned long v; int i; #endif + struct irq_cfg *cfg; int do_unmask_irq = 0; - irq_complete_move(irq); + irq_complete_move(&desc); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + if (unlikely(desc->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); } #endif @@ -2330,7 +2614,8 @@ static void ack_apic_level(unsigned int irq) * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro */ - i = irq_cfg(irq)->vector; + cfg = desc->chip_data; + i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); #endif @@ -2369,17 +2654,18 @@ static void ack_apic_level(unsigned int irq) * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - if (!io_apic_level_ack_pending(irq)) + cfg = desc->chip_data; + if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); } #ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } #endif @@ -2430,20 +2716,22 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(irq, cfg) { - if (IO_APIC_IRQ(irq) && !cfg->vector) { + for_each_irq_desc(irq, desc) { + if (!desc) + continue; + + cfg = desc->chip_data; + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < 16) + if (irq < NR_IRQS_LEGACY) make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); + else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; - } } } } @@ -2468,7 +2756,7 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void ack_lapic_irq(unsigned int irq) { ack_APIC_irq(); } @@ -2480,11 +2768,8 @@ static struct irq_chip lapic_chip __read_mostly = { .ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq) +static void lapic_register_intr(int irq, struct irq_desc *desc) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); @@ -2588,7 +2873,9 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_desc *desc = irq_to_desc(0); + struct irq_cfg *cfg = desc->chip_data; + int cpu = boot_cpu_id; int apic1, pin1, apic2, pin2; unsigned long flags; unsigned int ver; @@ -2603,7 +2890,7 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); + assign_irq_vector(0, cfg, TARGET_CPUS); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2654,10 +2941,10 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); + add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); @@ -2683,9 +2970,9 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2717,7 +3004,7 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0); + lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2902,22 +3189,26 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned int irq; unsigned int new; unsigned long flags; - struct irq_cfg *cfg_new; - - irq_want = nr_irqs - 1; + struct irq_cfg *cfg_new = NULL; + int cpu = boot_cpu_id; + struct irq_desc *desc_new = NULL; irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { + for (new = irq_want; new < NR_IRQS; new++) { if (platform_legacy_irq(new)) continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) + + desc_new = irq_to_desc_alloc_cpu(new, cpu); + if (!desc_new) { + printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) + } + cfg_new = desc_new->chip_data; + + if (cfg_new->vector != 0) + continue; + if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) irq = new; break; } @@ -2925,15 +3216,21 @@ unsigned int create_irq_nr(unsigned int irq_want) if (irq > 0) { dynamic_irq_init(irq); + /* restore it, in case dynamic_irq_init clear it */ + if (desc_new) + desc_new->chip_data = cfg_new; } return irq; } +static int nr_irqs_gsi = NR_IRQS_LEGACY; int create_irq(void) { + unsigned int irq_want; int irq; - irq = create_irq_nr(nr_irqs - 1); + irq_want = nr_irqs_gsi; + irq = create_irq_nr(irq_want); if (irq == 0) irq = -1; @@ -2944,14 +3241,22 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; + struct irq_cfg *cfg; + struct irq_desc *desc; + /* store it, in case dynamic_irq_cleanup clear it */ + desc = irq_to_desc(irq); + cfg = desc->chip_data; dynamic_irq_cleanup(irq); + /* connect back irq_cfg */ + if (desc) + desc->chip_data = cfg; #ifdef CONFIG_INTR_REMAP free_irte(irq); #endif spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } @@ -2964,16 +3269,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms struct irq_cfg *cfg; int err; unsigned dest; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (err) return err; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { @@ -3027,64 +3329,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; + cfg = desc->chip_data; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); + read_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; + write_msi_msg_desc(desc, &msg); } - #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void +ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp, cleanup_mask; struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3098,16 +3384,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) * at the new destination. So, time to cleanup the previous * vector allocation. */ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; + if (cfg->move_in_progress) + send_cleanup_vector(cfg); } + #endif #endif /* CONFIG_SMP */ @@ -3166,7 +3446,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) } #endif -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { int ret; struct msi_msg msg; @@ -3175,7 +3455,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) if (ret < 0) return ret; - set_irq_msi(irq, desc); + set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); #ifdef CONFIG_INTR_REMAP @@ -3195,26 +3475,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) return 0; } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) { unsigned int irq; int ret; unsigned int irq_want; - irq_want = build_irq_for_pci_dev(dev) + 0x100; - + irq_want = nr_irqs_gsi; irq = create_irq_nr(irq_want); if (irq == 0) return -1; @@ -3228,7 +3495,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) goto error; no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) { destroy_irq(irq); return ret; @@ -3246,7 +3513,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { unsigned int irq; int ret, sub_handle; - struct msi_desc *desc; + struct msi_desc *msidesc; unsigned int irq_want; #ifdef CONFIG_INTR_REMAP @@ -3254,10 +3521,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int index = 0; #endif - irq_want = build_irq_for_pci_dev(dev) + 0x100; + irq_want = nr_irqs_gsi; sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want); + irq_want++; if (irq == 0) return -1; #ifdef CONFIG_INTR_REMAP @@ -3289,7 +3557,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) } no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; sub_handle++; @@ -3308,24 +3576,18 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - if (assign_irq_vector(irq, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; dmar_msi_read(irq, &msg); @@ -3335,9 +3597,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip dmar_msi_type = { @@ -3369,24 +3630,18 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; - struct irq_desc *desc; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; hpet_msi_read(irq, &msg); @@ -3396,9 +3651,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip hpet_msi_type = { @@ -3451,28 +3705,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - if (assign_irq_vector(irq, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif static struct irq_chip ht_irq_chip = { @@ -3490,17 +3737,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; int err; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (!err) { struct ht_irq_msg msg; unsigned dest; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -3536,7 +3780,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset) { - const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + const struct cpumask *eligible_cpu = cpumask_of(cpu); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@ -3544,7 +3788,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; - err = assign_irq_vector(irq, *eligible_cpu); + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; @@ -3553,8 +3799,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, irq_name); spin_unlock_irqrestore(&vector_lock, flags); - cfg = irq_cfg(irq); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -3565,7 +3809,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(*eligible_cpu); + entry->dest = cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3606,9 +3850,16 @@ int __init io_apic_get_redir_entries (int ioapic) return reg_01.bits.entries; } -int __init probe_nr_irqs(void) +void __init probe_nr_irqs_gsi(void) { - return NR_IRQS; + int idx; + int nr = 0; + + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; } /* -------------------------------------------------------------------------- @@ -3707,19 +3958,31 @@ int __init io_apic_get_version(int ioapic) int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; + if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); return -EINVAL; } + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + } - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); return 0; } @@ -3757,7 +4020,7 @@ void __init setup_ioapic_dest(void) int pin, ioapic, irq, irq_entry; struct irq_desc *desc; struct irq_cfg *cfg; - cpumask_t mask; + const struct cpumask *mask; if (skip_ioapic_setup == 1) return; @@ -3773,9 +4036,10 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - cfg = irq_cfg(irq); + desc = irq_to_desc(irq); + cfg = desc->chip_data; if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, + setup_IO_APIC_irq(ioapic, pin, irq, desc, irq_trigger(irq_entry), irq_polarity(irq_entry)); continue; @@ -3785,19 +4049,18 @@ void __init setup_ioapic_dest(void) /* * Honour affinities which have been set in early boot */ - desc = irq_to_desc(irq); if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = &desc->affinity; else mask = TARGET_CPUS; #ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, mask); + set_ir_ioapic_affinity_irq_desc(desc, mask); else #endif - set_ioapic_affinity_irq(irq, mask); + set_ioapic_affinity_irq_desc(desc, mask); } } @@ -3846,7 +4109,6 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index f1c688e46f3..285bbf8831f 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * This is only used on smaller machines. */ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; local_irq_save(flags); - WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __send_IPI_dest_field(mask, vector); local_irq_restore(flags); } -void send_IPI_mask_sequence(cpumask_t mask, int vector) +void send_IPI_mask_sequence(const struct cpumask *mask, int vector) { unsigned long flags; unsigned int query_cpu; @@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for_each_possible_cpu(query_cpu) { - if (cpu_isset(query_cpu, mask)) { + for_each_cpu(query_cpu, mask) + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); + local_irq_restore(flags); +} + +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); - } - } local_irq_restore(flags); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f64..3f1d9d18df6 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v) } desc = irq_to_desc(i); + if (!desc) + return 0; + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a51382672de..9dc5588f336 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -233,25 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_HOTPLUG_CPU #include <mach_apic.h> -void fixup_irqs(cpumask_t map) +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; + const struct cpumask *affinity; + if (!desc) + continue; if (irq == 2) continue; - cpus_and(mask, desc->affinity, map); - if (any_online_cpu(mask) == NR_CPUS) { + affinity = &desc->affinity; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { printk("Breaking affinity for irq %i\n", irq); - mask = map; + affinity = cpu_all_mask; } if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, affinity); else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84eb77a..fca2991443f 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -83,40 +83,43 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) } #ifdef CONFIG_HOTPLUG_CPU -void fixup_irqs(cpumask_t map) +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + const struct cpumask *affinity; + if (!desc) + continue; if (irq == 2) continue; /* interrupt's are disabled at this point */ spin_lock(&desc->lock); + affinity = &desc->affinity; if (!irq_has_action(irq) || - cpus_equal(desc->affinity, map)) { + cpumask_equal(affinity, cpu_online_mask)) { spin_unlock(&desc->lock); continue; } - cpus_and(mask, desc->affinity, map); - if (cpus_empty(mask)) { + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - mask = map; + affinity = cpu_all_mask; } if (desc->chip->mask) desc->chip->mask(irq); if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, affinity); else if (!(warned++)) set_affinity = 0; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 845aa9803e8..6a92f47c52e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -68,8 +68,7 @@ void __init init_ISA_irqs (void) /* * 16 old-style INTA-cycle interrupts: */ - for (i = 0; i < 16; i++) { - /* first time call this irq_desc */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff023539128..40c1e62ec78 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -142,8 +142,7 @@ void __init init_ISA_irqs(void) init_bsp_APIC(); init_8259A(0); - for (i = 0; i < 16; i++) { - /* first time call this irq_desc */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 7a385746509..37f420018a4 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,6 +13,7 @@ #include <linux/numa.h> #include <linux/ftrace.h> #include <linux/suspend.h> +#include <linux/gfp.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -25,15 +26,6 @@ #include <asm/system.h> #include <asm/cacheflush.h> -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u32 kexec_pgd[1024] PAGE_ALIGNED; -#ifdef CONFIG_X86_PAE -static u32 kexec_pmd0[1024] PAGE_ALIGNED; -static u32 kexec_pmd1[1024] PAGE_ALIGNED; -#endif -static u32 kexec_pte0[1024] PAGE_ALIGNED; -static u32 kexec_pte1[1024] PAGE_ALIGNED; - static void set_idt(void *newidt, __u16 limit) { struct desc_ptr curidt; @@ -76,6 +68,76 @@ static void load_segments(void) #undef __STR } +static void machine_kexec_free_page_tables(struct kimage *image) +{ + free_page((unsigned long)image->arch.pgd); +#ifdef CONFIG_X86_PAE + free_page((unsigned long)image->arch.pmd0); + free_page((unsigned long)image->arch.pmd1); +#endif + free_page((unsigned long)image->arch.pte0); + free_page((unsigned long)image->arch.pte1); +} + +static int machine_kexec_alloc_page_tables(struct kimage *image) +{ + image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); +#ifdef CONFIG_X86_PAE + image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); +#endif + image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!image->arch.pgd || +#ifdef CONFIG_X86_PAE + !image->arch.pmd0 || !image->arch.pmd1 || +#endif + !image->arch.pte0 || !image->arch.pte1) { + machine_kexec_free_page_tables(image); + return -ENOMEM; + } + return 0; +} + +static void machine_kexec_page_table_set_one( + pgd_t *pgd, pmd_t *pmd, pte_t *pte, + unsigned long vaddr, unsigned long paddr) +{ + pud_t *pud; + + pgd += pgd_index(vaddr); +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); +#endif + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); + pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +static void machine_kexec_prepare_page_tables(struct kimage *image) +{ + void *control_page; + pmd_t *pmd = 0; + + control_page = page_address(image->control_code_page); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd0; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte0, + (unsigned long)control_page, __pa(control_page)); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd1; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte1, + __pa(control_page), __pa(control_page)); +} + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -87,12 +149,20 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Make control page executable. + * - Make control page executable. + * - Allocate page tables + * - Setup page tables */ int machine_kexec_prepare(struct kimage *image) { + int error; + if (nx_enabled) set_pages_x(image->control_code_page, 1); + error = machine_kexec_alloc_page_tables(image); + if (error) + return error; + machine_kexec_prepare_page_tables(image); return 0; } @@ -104,6 +174,7 @@ void machine_kexec_cleanup(struct kimage *image) { if (nx_enabled) set_pages_nx(image->control_code_page, 1); + machine_kexec_free_page_tables(image); } /* @@ -150,18 +221,7 @@ void machine_kexec(struct kimage *image) relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_PGD] = __pa(kexec_pgd); - page_list[VA_PGD] = (unsigned long)kexec_pgd; -#ifdef CONFIG_X86_PAE - page_list[PA_PMD_0] = __pa(kexec_pmd0); - page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PMD_1] = __pa(kexec_pmd1); - page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; -#endif - page_list[PA_PTE_0] = __pa(kexec_pte0); - page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PTE_1] = __pa(kexec_pte1); - page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_PGD] = __pa(image->arch.pgd); if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3b599518c32..c12314c9e86 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = { .set_mode = mfgpt_set_mode, .set_next_event = mfgpt_next_event, .rating = 250, - .cpumask = CPU_MASK_ALL, + .cpumask = cpu_all_mask, .shift = 32 }; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0f4c1fd5a1f..45e3b69808b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -586,26 +586,23 @@ static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } + if (!mpf) + return; + if (acpi_lapic && early) return; + /* - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + if (acpi_lapic && acpi_ioapic) return; - } else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); - if (!mpf) - return; + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) + return; + } printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 4caff39078e..0deea37a53c 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -31,7 +31,7 @@ #include <asm/numaq.h> #include <asm/topology.h> #include <asm/processor.h> -#include <asm/mpspec.h> +#include <asm/genapic.h> #include <asm/e820.h> #include <asm/setup.h> @@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void) return 1; } +static int __init numaq_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; + + return 0; +} + static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, .setup_ioapic_ids = numaq_setup_ioapic_ids, + .update_genapic = numaq_update_genapic, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d..95d811a9594 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,7 +7,9 @@ #include <linux/module.h> #include <linux/pm.h> #include <linux/clockchips.h> +#include <linux/ftrace.h> #include <asm/system.h> +#include <asm/apic.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -100,6 +102,9 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -112,6 +117,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -122,6 +128,21 @@ void default_idle(void) EXPORT_SYMBOL(default_idle); #endif +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + disable_local_APIC(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + static void do_nothing(void *unused) { } @@ -154,24 +175,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(ax, cx); } + trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { + struct power_trace it; if (!need_resched()) { + trace_power_start(&it, POWER_CSTATE, 1); __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(&it); } else local_irq_enable(); } @@ -183,9 +211,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); + trace_power_end(&it); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302fe6d4..24c2276aa45 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@ #include <linux/percpu.h> #include <linux/prctl.h> #include <linux/dmi.h> +#include <linux/ftrace.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c958120fb1b..fbb321d53d3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -39,6 +39,7 @@ #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/ftrace.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. */ -struct task_struct * +__notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a6d8c12e10..2c8ec1ba75e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, size_t bts_index, bts_end; int error; - error = ds_get_bts_end(child, &bts_end); + error = ds_get_bts_end(child->bts, &bts_end); if (error < 0) return error; if (bts_end <= index) return -EINVAL; - error = ds_get_bts_index(child, &bts_index); + error = ds_get_bts_index(child->bts, &bts_index); if (error < 0) return error; @@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (bts_end <= bts_index) bts_index -= bts_end; - error = ds_access_bts(child, bts_index, &bts_record); + error = ds_access_bts(child->bts, bts_index, &bts_record); if (error < 0) return error; @@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child, size_t end, i; int error; - error = ds_get_bts_index(child, &end); + error = ds_get_bts_index(child->bts, &end); if (error < 0) return error; if (size < (end * sizeof(struct bts_struct))) return -EIO; - error = ds_access_bts(child, 0, (const void **)&raw); + error = ds_access_bts(child->bts, 0, (const void **)&raw); if (error < 0) return error; @@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child, return -EFAULT; } - error = ds_clear_bts(child); + error = ds_clear_bts(child->bts); if (error < 0) return error; return end; } -static void ptrace_bts_ovfl(struct task_struct *child) -{ - send_sig(child->thread.bts_ovfl_signal, child, 0); -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -760,23 +755,45 @@ static int ptrace_bts_config(struct task_struct *child, goto errout; if (cfg.flags & PTRACE_BTS_O_ALLOC) { - ds_ovfl_callback_t ovfl = NULL; + bts_ovfl_callback_t ovfl = NULL; unsigned int sig = 0; - /* we ignore the error in case we were not tracing child */ - (void)ds_release_bts(child); + error = -EINVAL; + if (cfg.size < (10 * bts_cfg.sizeof_bts)) + goto errout; if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) goto errout; + error = -EOPNOTSUPP; + goto errout; + sig = cfg.signal; - ovfl = ptrace_bts_ovfl; } - error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); - if (error < 0) + if (child->bts) { + (void)ds_release_bts(child->bts); + kfree(child->bts_buffer); + + child->bts = NULL; + child->bts_buffer = NULL; + } + + error = -ENOMEM; + child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); + if (!child->bts_buffer) + goto errout; + + child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, + ovfl, /* th = */ (size_t)-1); + if (IS_ERR(child->bts)) { + error = PTR_ERR(child->bts); + kfree(child->bts_buffer); + child->bts = NULL; + child->bts_buffer = NULL; goto errout; + } child->thread.bts_ovfl_signal = sig; } @@ -823,15 +840,15 @@ static int ptrace_bts_status(struct task_struct *child, if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child, &end); + error = ds_get_bts_end(child->bts, &end); if (error < 0) return error; - error = ds_access_bts(child, /* index = */ 0, &base); + error = ds_access_bts(child->bts, /* index = */ 0, &base); if (error < 0) return error; - error = ds_access_bts(child, /* index = */ end, &max); + error = ds_access_bts(child->bts, /* index = */ end, &max); if (error < 0) return error; @@ -884,10 +901,7 @@ static int ptrace_bts_write_record(struct task_struct *child, return -EINVAL; } - /* The writing task will be the switched-to task on a context - * switch. It needs to write into the switched-from task's BTS - * buffer. */ - return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); + return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); } void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -929,17 +943,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { + case 0 ... 0xC: + /* sorry, don't know about them */ + break; case 0xD: case 0xE: /* Pentium M */ bts_configure(&bts_cfg_pentium_m); break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ + default: /* Core2, Atom, ... */ bts_configure(&bts_cfg_core2); break; - default: - /* sorry, don't know about them */ - break; } break; case 0xF: @@ -973,13 +986,17 @@ void ptrace_disable(struct task_struct *child) clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif #ifdef CONFIG_X86_PTRACE_BTS - (void)ds_release_bts(child); + if (child->bts) { + (void)ds_release_bts(child->bts); + kfree(child->bts_buffer); + child->bts_buffer = NULL; - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + } #endif /* CONFIG_X86_PTRACE_BTS */ } @@ -1111,9 +1128,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) (child, data, (struct ptrace_bts_config __user *)addr); break; - case PTRACE_BTS_SIZE: - ret = ds_get_bts_index(child, /* pos = */ NULL); + case PTRACE_BTS_SIZE: { + size_t size; + + ret = ds_get_bts_index(child->bts, &size); + if (ret == 0) { + BUG_ON(size != (int) size); + ret = (int) size; + } break; + } case PTRACE_BTS_GET: ret = ptrace_bts_read_record @@ -1121,7 +1145,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child); + ret = ds_clear_bts(child->bts); break; case PTRACE_BTS_DRAIN: diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index cc5a2545dd4..ba7b9a0e606 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -21,6 +21,9 @@ # include <asm/iommu.h> #endif +#include <mach_ipi.h> + + /* * Power off function, if any */ @@ -36,7 +39,10 @@ int reboot_force; static int reboot_cpu = -1; #endif -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] warm Don't set the cold reboot flag cold Set the cold reboot flag bios Reboot by jumping through the BIOS (only for X86_32) @@ -45,6 +51,7 @@ static int reboot_cpu = -1; kbd Use the keyboard controller. cold reset (default) acpi Use the RESET_REG in the FADT efi Use efi reset_system runtime service + pci Use the so-called "PCI reset register", CF9 force Avoid anything that could hang. */ static int __init reboot_setup(char *str) @@ -79,6 +86,7 @@ static int __init reboot_setup(char *str) case 'k': case 't': case 'e': + case 'p': reboot_type = *str; break; @@ -404,12 +412,27 @@ static void native_machine_emergency_restart(void) reboot_type = BOOT_KBD; break; - case BOOT_EFI: if (efi_enabled) - efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, + efi.reset_system(reboot_mode ? + EFI_RESET_WARM : + EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); + reboot_type = BOOT_KBD; + break; + case BOOT_CF9: + port_cf9_safe = true; + /* fall through */ + + case BOOT_CF9_COND: + if (port_cf9_safe) { + u8 cf9 = inb(0xcf9) & ~6; + outb(cf9|2, 0xcf9); /* Request hard reset */ + udelay(50); + outb(cf9|6, 0xcf9); /* Actually do the reset */ + udelay(50); + } reboot_type = BOOT_KBD; break; } @@ -470,6 +493,11 @@ static void native_machine_restart(char *__unused) static void native_machine_halt(void) { + /* stop other cpus and apics */ + machine_shutdown(); + + /* stop this cpu */ + stop_this_cpu(NULL); } static void native_machine_power_off(void) @@ -523,3 +551,92 @@ void machine_crash_shutdown(struct pt_regs *regs) machine_ops.crash_shutdown(regs); } #endif + + +#if defined(CONFIG_SMP) + +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; +static nmi_shootdown_cb shootdown_callback; + +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) +{ + int cpu; + + if (val != DIE_NMI_IPI) + return NOTIFY_OK; + + cpu = raw_smp_processor_id(); + + /* Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return NOTIFY_STOP; + local_irq_disable(); + + shootdown_callback(cpu, (struct die_args *)data); + + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + halt(); + for (;;) + cpu_relax(); + + return 1; +} + +static void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(NMI_VECTOR); +} + +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + +/* Halt all other CPUs, calling the specified function on each of them + * + * This function can be used to halt all other CPUs on crash + * or emergency reboot time. The function passed as parameter + * will be called inside a NMI handler on all CPUs. + */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + unsigned long msecs; + local_irq_disable(); + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = safe_smp_processor_id(); + + shootdown_callback = callback; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ + /* Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ +} +#else /* !CONFIG_SMP */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + /* No other CPUs to shoot down */ +} +#endif diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 6f50664b2ba..a160f311972 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -10,15 +10,12 @@ #include <asm/page.h> #include <asm/kexec.h> #include <asm/processor-flags.h> -#include <asm/pgtable.h> /* * Must be relocatable PIC code callable as a C function */ #define PTR(x) (x << 2) -#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define PAE_PGD_ATTR (_PAGE_PRESENT) /* control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for @@ -39,7 +36,6 @@ #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) .text - .align PAGE_SIZE .globl relocate_kernel relocate_kernel: /* Save the CPU context, used for jumping back */ @@ -60,117 +56,6 @@ relocate_kernel: movl %cr4, %eax movl %eax, CR4(%edi) -#ifdef CONFIG_X86_PAE - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_0)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_1)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#else - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#endif - -relocate_new_kernel: /* read the arguments and say goodbye to the stack */ movl 20+4(%esp), %ebx /* page_list */ movl 20+8(%esp), %ebp /* list of pages */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bdec76e5559..32fe42f26e7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -583,7 +583,20 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; +static int __init default_update_genapic(void) +{ +#ifdef CONFIG_X86_SMP +# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) + genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; +# endif +#endif + + return 0; +} + +static struct x86_quirks default_x86_quirks __initdata = { + .update_genapic = default_update_genapic, +}; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; @@ -1080,7 +1093,7 @@ void __init setup_arch(char **cmdline_p) ioapic_init_mappings(); /* need to wait for io_apic is mapped */ - nr_irqs = probe_nr_irqs(); + probe_nr_irqs_gsi(); kvm_guest_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ae0c0d3bb77..0b63b08e753 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -152,6 +152,11 @@ void __init setup_per_cpu_areas(void) old_size = PERCPU_ENOUGH_ROOM; align = max_t(unsigned long, PAGE_SIZE, align); size = roundup(old_size, align); + + printk(KERN_INFO + "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); @@ -168,24 +173,24 @@ void __init setup_per_cpu_areas(void) "cpu %d has no node %d or node-local memory\n", cpu, node); if (ptr) - printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", + printk(KERN_DEBUG + "per cpu data for cpu%d at %016lx\n", cpu, __pa(ptr)); } else { ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, __pa(MAX_DMA_ADDRESS)); if (ptr) - printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); + printk(KERN_DEBUG + "per cpu data for cpu%d on node%d " + "at %016lx\n", + cpu, node, __pa(ptr)); } #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", - NR_CPUS, nr_cpu_ids, nr_node_ids); - /* Setup percpu data maps */ setup_per_cpu_maps(); @@ -282,7 +287,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) else cpu_clear(cpu, *mask); - cpulist_scnprintf(buf, sizeof(buf), *mask); + cpulist_scnprintf(buf, sizeof(buf), mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18f9b19f5f8..49ed667b06f 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -118,41 +118,28 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); } -void native_send_call_func_ipi(cpumask_t mask) +void native_send_call_func_ipi(const struct cpumask *mask) { cpumask_t allbutself; allbutself = cpu_online_map; cpu_clear(smp_processor_id(), allbutself); - if (cpus_equal(mask, allbutself) && + if (cpus_equal(*mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_local_APIC(); - if (hlt_works(smp_processor_id())) - for (;;) halt(); - for (;;); -} - /* * this function calls the 'stop' function on all other CPUs in the system. */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f71f96fc9e6..be946678804 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include <asm/mtrr.h> #include <asm/vmi.h> #include <asm/genapic.h> +#include <asm/setup.h> #include <linux/mc146818rtc.h> #include <mach_apic.h> @@ -101,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; -/* bitmap of online cpus */ -cpumask_t cpu_online_map __read_mostly; -EXPORT_SYMBOL(cpu_online_map); - cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); @@ -534,7 +529,7 @@ static void impress_friends(void) pr_debug("Before bogocount - setting activated=1.\n"); } -static inline void __inquire_remote_apic(int apicid) +void __inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; @@ -573,14 +568,13 @@ static inline void __inquire_remote_apic(int apicid) } } -#ifdef WAKE_SECONDARY_VIA_NMI /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -static int __devinit -wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt; @@ -597,7 +591,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -612,11 +606,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_NMI */ -#ifdef WAKE_SECONDARY_VIA_INIT -static int __devinit -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt, num_starts, j; @@ -735,7 +727,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_INIT */ struct create_idle { struct work_struct work; @@ -1353,7 +1344,7 @@ void cpu_disable_common(void) lock_vector_lock(); remove_cpu_from_maps(cpu); unlock_vector_lock(); - fixup_irqs(cpu_online_map); + fixup_irqs(); } int native_cpu_disable(void) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index a03e7f6d90c..10786af9554 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/stacktrace.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/stacktrace.h> static void save_stack_warning(void *data, char *msg) @@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame { + const void __user *next_fp; + unsigned long ret_addr; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = + frame.ret_addr; + } + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } +} + +void save_stack_trace_user(struct stack_trace *trace) +{ + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm) { + __save_stack_trace_user(trace); + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b..174ea90d1cb 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -164,7 +164,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 8f919ca6949..de6f1bda0c5 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); while (!cpus_empty(f->flush_cpumask)) cpu_relax(); diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 254ee07f863..c4c1f9e0940 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void) /* Upper bound is clockevent's use of ulong for cycle deltas. */ evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); evt->min_delta_ns = clockevent_delta2ns(1, evt); - evt->cpumask = cpumask_of_cpu(cpu); + evt->cpumask = cpumask_of(cpu); printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", evt->name, evt->mult, evt->shift); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0b8b6690a86..6f3d3d4cd97 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,6 +17,9 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING + #include <linux/time.h> #include <linux/init.h> #include <linux/kernel.h> diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a5d8e1ace1c..104c8220a38 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -737,7 +737,7 @@ static void lguest_time_init(void) /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of_cpu(0); + lguest_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&lguest_clockevent); /* Finally, we unblock the timer interrupt. */ diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9e68075544f..4a20b2f9a38 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon #define __do_strncpy_from_user(dst, src, count, res) \ do { \ int __d0, __d1, __d2; \ - might_sleep(); \ + might_fault(); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user); #define __do_clear_user(addr,size) \ do { \ int __d0; \ - might_sleep(); \ + might_fault(); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -155,7 +155,7 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { - might_sleep(); + might_fault(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; @@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n) unsigned long mask = -__addr_ok(s); unsigned long res, tmp; - might_sleep(); + might_fault(); __asm__ __volatile__( " testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index f4df6e7c718..64d6c84e635 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -15,7 +15,7 @@ #define __do_strncpy_from_user(dst,src,count,res) \ do { \ long __d0, __d1, __d2; \ - might_sleep(); \ + might_fault(); \ __asm__ __volatile__( \ " testq %1,%1\n" \ " jz 2f\n" \ @@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user); unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; - might_sleep(); + might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ asm volatile( diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 3c3b471ea49..bc4c7840b2a 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -17,6 +17,7 @@ #include <asm/bigsmp/apic.h> #include <asm/bigsmp/ipi.h> #include <asm/mach-default/mach_mpparse.h> +#include <asm/mach-default/mach_wakecpu.h> static int dmi_bigsmp; /* can be set by dmi scanners */ @@ -41,9 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } }; -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { - return cpumask_of_cpu(cpu); + cpus_clear(*retmask); + cpu_set(cpu, *retmask); } static int probe_bigsmp(void) diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c index 9e835a11a13..e63a4a76d8c 100644 --- a/arch/x86/mach-generic/default.c +++ b/arch/x86/mach-generic/default.c @@ -16,6 +16,7 @@ #include <asm/mach-default/mach_apic.h> #include <asm/mach-default/mach_ipi.h> #include <asm/mach-default/mach_mpparse.h> +#include <asm/mach-default/mach_wakecpu.h> /* should be called last. */ static int probe_default(void) diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 28459cab3dd..4ba5ccaa158 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -16,7 +16,19 @@ #include <asm/es7000/apic.h> #include <asm/es7000/ipi.h> #include <asm/es7000/mpparse.h> -#include <asm/es7000/wakecpu.h> +#include <asm/mach-default/mach_wakecpu.h> + +void __init es7000_update_genapic_to_cluster(void) +{ + genapic->target_cpus = target_cpus_cluster; + genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER; + genapic->int_dest_mode = INT_DEST_MODE_CLUSTER; + genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER; + + genapic->init_apic_ldr = init_apic_ldr_cluster; + + genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster; +} static int probe_es7000(void) { @@ -75,7 +87,7 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -85,8 +97,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 71a309b122e..511d7941364 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -38,7 +38,7 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -48,8 +48,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index 5a7e4619e1c..c346d9d0226 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -15,6 +15,7 @@ #include <asm/mpspec.h> #include <asm/apicdef.h> #include <asm/genapic.h> +#include <asm/setup.h> extern struct genapic apic_numaq; extern struct genapic apic_summit; @@ -57,6 +58,9 @@ static int __init parse_apic(char *arg) } } + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); + /* Parsed again by __setup for debug/verbose */ return 0; } @@ -72,12 +76,15 @@ void __init generic_bigsmp_probe(void) * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ - if (!cmdline_apic && genapic == &apic_default) + if (!cmdline_apic && genapic == &apic_default) { if (apic_bigsmp.probe()) { genapic = &apic_bigsmp; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Overriding APIC driver with %s\n", genapic->name); } + } #endif } @@ -94,6 +101,9 @@ void __init generic_apic_probe(void) /* Not visible without early console */ if (!apic_probe[i]) panic("Didn't find an APIC driver"); + + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); } printk(KERN_INFO "Using APIC driver %s\n", genapic->name); } @@ -108,6 +118,8 @@ int __init mps_oem_check(struct mp_config_table *mpc, char *oem, if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) { if (!cmdline_apic) { genapic = apic_probe[i]; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Switched to APIC driver `%s'.\n", genapic->name); } @@ -124,6 +136,8 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { if (!cmdline_apic) { genapic = apic_probe[i]; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Switched to APIC driver `%s'.\n", genapic->name); } diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6272b5e69da..2821ffc188b 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -16,6 +16,7 @@ #include <asm/summit/apic.h> #include <asm/summit/ipi.h> #include <asm/summit/mpparse.h> +#include <asm/mach-default/mach_wakecpu.h> static int probe_summit(void) { @@ -23,7 +24,7 @@ static int probe_summit(void) return 0; } -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -33,8 +34,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic apic_summit = APIC_INIT("summit", probe_summit); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 52145007bd7..a5bc05492b1 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -63,11 +63,6 @@ static int voyager_extended_cpus = 1; /* Used for the invalidate map that's also checked in the spinlock */ static volatile unsigned long smp_invalidate_needed; -/* Bitmask of currently online CPUs - used by setup.c for - /proc/cpuinfo, visible externally but still physical */ -cpumask_t cpu_online_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_online_map); - /* Bitmask of CPUs present in the system - exported by i386_syms.c, used * by scheduler but indexed physically */ cpumask_t phys_cpu_present_map = CPU_MASK_NONE; @@ -218,8 +213,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; /* This is for the new dynamic CPU boot code */ cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; -cpumask_t cpu_possible_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_possible_map); /* The per processor IRQ masks (these are usually kept in sync) */ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; @@ -679,7 +672,7 @@ void __init smp_boot_cpus(void) /* loop over all the extended VIC CPUs and boot them. The * Quad CPUs must be bootstrapped by their extended VIC cpu */ - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) continue; do_boot_cpu(i); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fea4565ff57..d8cc96a2738 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-y := pf_in.o mmio-mod.o +mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa_$(BITS).o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730fa24..21e996a70d6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -53,7 +53,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE_HOOKS +#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; @@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { unsigned long flags = oops_begin(); + int sig = SIGKILL; struct task_struct *tsk; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", @@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) - regs = NULL; - oops_end(flags, regs, SIGKILL); + sig = 0; + oops_end(flags, regs, sig); } #endif @@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) int fault; #ifdef CONFIG_X86_64 unsigned long flags; + int sig; #endif tsk = current; @@ -849,11 +851,12 @@ no_context: bust_spinlocks(0); do_exit(SIGKILL); #else + sig = SIGKILL; if (__die("Oops", regs, error_code)) - regs = NULL; + sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, SIGKILL); + oops_end(flags, regs, sig); #endif /* diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index cebcbf152d4..71a14f89f89 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -278,7 +278,7 @@ void __init numa_init_array(void) int rr, i; rr = first_node(node_online_map); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); @@ -549,7 +549,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) memnodemap[0] = 0; node_set_online(0); node_set(0, node_possible_map); - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < nr_cpu_ids; i++) numa_set_node(i, 0); e820_register_active_regions(0, start_pfn, last_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 51c0a2fc14f..09737c8af07 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -382,7 +382,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (!node_online(i)) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { int node = early_cpu_to_node(i); if (node == NUMA_NO_NODE) diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 9915293500f..9a5af6c8fbe 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, #undef PCI_CONF2_ADDRESS -static struct pci_raw_ops pci_direct_conf2 = { +struct pci_raw_ops pci_direct_conf2 = { .read = pci_conf2_read, .write = pci_conf2_write, }; @@ -289,6 +289,7 @@ int __init pci_direct_probe(void) if (pci_check_type1()) { raw_pci_ops = &pci_direct_conf1; + port_cf9_safe = true; return 1; } release_resource(region); @@ -305,6 +306,7 @@ int __init pci_direct_probe(void) if (pci_check_type2()) { raw_pci_ops = &pci_direct_conf2; + port_cf9_safe = true; return 2; } diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index 15b9cf6be72..1959018aac0 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -96,6 +96,7 @@ extern struct pci_raw_ops *raw_pci_ops; extern struct pci_raw_ops *raw_pci_ext_ops; extern struct pci_raw_ops pci_direct_conf1; +extern bool port_cf9_safe; /* arch_initcall level */ extern int pci_direct_probe(void); diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 1ef0f90813d..d9d35824c56 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -9,6 +9,9 @@ * Also alternative() doesn't work. */ +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING + #include <linux/kernel.h> #include <linux/posix-timers.h> #include <linux/time.h> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 636ef4caa52..e59e53b11e2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1079,7 +1079,7 @@ static void drop_other_mm_ref(void *info) static void xen_drop_mm_ref(struct mm_struct *mm) { - cpumask_t mask; + cpumask_var_t mask; unsigned cpu; if (current->active_mm == mm) { @@ -1091,7 +1091,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm) } /* Get the "official" set of cpus referring to our pagetable. */ - mask = mm->cpu_vm_mask; + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, &mm->cpu_vm_mask); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed @@ -1100,11 +1109,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) if needed. */ for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, mask); } - if (!cpus_empty(mask)) - smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); } #else static void xen_drop_mm_ref(struct mm_struct *mm) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index acd9b6705e0..c44e2069c7c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -33,7 +33,7 @@ #include "xen-ops.h" #include "mmu.h" -cpumask_t xen_cpu_initialized_map; +cpumask_var_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); static DEFINE_PER_CPU(int, callfunc_irq); @@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void) { int i, rc; - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { num_processors++; @@ -192,11 +192,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) if (xen_smp_intr_init(0)) BUG(); - xen_cpu_initialized_map = cpumask_of_cpu(0); + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) + panic("could not allocate xen_cpu_initialized_map\n"); + + cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); /* Restrict the possible_map according to max_cpus. */ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--) + for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) continue; cpu_clear(cpu, cpu_possible_map); } @@ -221,7 +224,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) struct vcpu_guest_context *ctxt; struct desc_struct *gdt; - if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) + if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); @@ -408,24 +411,23 @@ static void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } -static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) +static void xen_send_IPI_mask(const struct cpumask *mask, + enum ipi_vector vector) { unsigned cpu; - cpus_and(mask, mask, cpu_online_map); - - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, mask, cpu_online_mask) xen_send_IPI_one(cpu, vector); } -static void xen_smp_send_call_function_ipi(cpumask_t mask) +static void xen_smp_send_call_function_ipi(const struct cpumask *mask) { int cpu; xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); /* Make sure other vcpus get a chance to run if they need to. */ - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { if (xen_vcpu_stolen(cpu)) { HYPERVISOR_sched_op(SCHEDOP_yield, 0); break; @@ -435,7 +437,8 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask) static void xen_smp_send_call_function_single_ipi(int cpu) { - xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); + xen_send_IPI_mask(cpumask_of(cpu), + XEN_CALL_FUNCTION_SINGLE_VECTOR); } static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 2a234db5949..212ffe012b7 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -35,7 +35,8 @@ void xen_post_suspend(int suspend_cancelled) pfn_to_mfn(xen_start_info->console.domU.mfn); } else { #ifdef CONFIG_SMP - xen_cpu_initialized_map = cpu_online_map; + BUG_ON(xen_cpu_initialized_map == NULL); + cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); #endif xen_vcpu_restore(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c9f7cda48ed..65d75a6be0b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -437,7 +437,7 @@ void xen_setup_timer(int cpu) evt = &per_cpu(xen_clock_events, cpu); memcpy(evt, xen_clockevent, sizeof(*evt)); - evt->cpumask = cpumask_of_cpu(cpu); + evt->cpumask = cpumask_of(cpu); evt->irq = irq; setup_runstate_info(cpu); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9e1afae8461..c1f8faf0a2c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -58,7 +58,7 @@ void __init xen_init_spinlocks(void); __cpuinit void xen_init_lock_cpu(int cpu); void xen_uninit_lock_cpu(int cpu); -extern cpumask_t xen_cpu_initialized_map; +extern cpumask_var_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} #endif diff --git a/block/Kconfig b/block/Kconfig index 1ab7c15c8d7..290b219fad9 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -47,6 +47,7 @@ config BLK_DEV_IO_TRACE depends on SYSFS select RELAY select DEBUG_FS + select TRACEPOINTS help Say Y here if you want to be able to trace the block layer actions on a given queue. Tracing allows you to see any traffic happening diff --git a/block/blk-core.c b/block/blk-core.c index c36aa98fafa..561e8a1b43a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -28,9 +28,23 @@ #include <linux/task_io_accounting_ops.h> #include <linux/blktrace_api.h> #include <linux/fault-inject.h> +#include <trace/block.h> #include "blk.h" +DEFINE_TRACE(block_plug); +DEFINE_TRACE(block_unplug_io); +DEFINE_TRACE(block_unplug_timer); +DEFINE_TRACE(block_getrq); +DEFINE_TRACE(block_sleeprq); +DEFINE_TRACE(block_rq_requeue); +DEFINE_TRACE(block_bio_backmerge); +DEFINE_TRACE(block_bio_frontmerge); +DEFINE_TRACE(block_bio_queue); +DEFINE_TRACE(block_rq_complete); +DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ +EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); + static int __make_request(struct request_queue *q, struct bio *bio); /* @@ -205,7 +219,7 @@ void blk_plug_device(struct request_queue *q) if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); - blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); + trace_block_plug(q); } } EXPORT_SYMBOL(blk_plug_device); @@ -292,9 +306,7 @@ void blk_unplug_work(struct work_struct *work) struct request_queue *q = container_of(work, struct request_queue, unplug_work); - blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, - q->rq.count[READ] + q->rq.count[WRITE]); - + trace_block_unplug_io(q); q->unplug_fn(q); } @@ -302,9 +314,7 @@ void blk_unplug_timeout(unsigned long data) { struct request_queue *q = (struct request_queue *)data; - blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, - q->rq.count[READ] + q->rq.count[WRITE]); - + trace_block_unplug_timer(q); kblockd_schedule_work(q, &q->unplug_work); } @@ -314,9 +324,7 @@ void blk_unplug(struct request_queue *q) * devices don't necessarily have an ->unplug_fn defined */ if (q->unplug_fn) { - blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, - q->rq.count[READ] + q->rq.count[WRITE]); - + trace_block_unplug_io(q); q->unplug_fn(q); } } @@ -822,7 +830,7 @@ rq_starved: if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; - blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); + trace_block_getrq(q, bio, rw); out: return rq; } @@ -848,7 +856,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); + trace_block_sleeprq(q, bio, rw); __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); @@ -928,7 +936,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) { blk_delete_timer(rq); blk_clear_rq_complete(rq); - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + trace_block_rq_requeue(q, rq); if (blk_rq_tagged(rq)) blk_queue_end_tag(q, rq); @@ -1167,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!ll_back_merge_fn(q, req, bio)) break; - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + trace_block_bio_backmerge(q, bio); req->biotail->bi_next = bio; req->biotail = bio; @@ -1186,7 +1194,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!ll_front_merge_fn(q, req, bio)) break; - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + trace_block_bio_frontmerge(q, bio); bio->bi_next = req->bio; req->bio = bio; @@ -1269,7 +1277,7 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; - blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio, + trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, bdev->bd_dev, bio->bi_sector, bio->bi_sector - p->start_sect); } @@ -1441,10 +1449,10 @@ end_io: goto end_io; if (old_sector != -1) - blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, + trace_block_remap(q, bio, old_dev, bio->bi_sector, old_sector); - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + trace_block_bio_queue(q, bio); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; @@ -1678,7 +1686,7 @@ static int __end_that_request_first(struct request *req, int error, int total_bytes, bio_nbytes, next_idx = 0; struct bio *bio; - blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); + trace_block_rq_complete(req->q, req); /* * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual diff --git a/block/blktrace.c b/block/blktrace.c index 85049a7e7a1..b0a2cae886d 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -23,10 +23,18 @@ #include <linux/mutex.h> #include <linux/debugfs.h> #include <linux/time.h> +#include <trace/block.h> #include <asm/uaccess.h> static unsigned int blktrace_seq __read_mostly = 1; +/* Global reference count of probes */ +static DEFINE_MUTEX(blk_probe_mutex); +static atomic_t blk_probes_ref = ATOMIC_INIT(0); + +static int blk_register_tracepoints(void); +static void blk_unregister_tracepoints(void); + /* * Send out a notify message. */ @@ -119,7 +127,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK * The worker for the various blk_add_trace*() types. Fills out a * blk_io_trace structure and places it in a per-cpu subbuffer. */ -void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, +static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int rw, u32 what, int error, int pdu_len, void *pdu_data) { struct task_struct *tsk = current; @@ -177,8 +185,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(__blk_add_trace); - static struct dentry *blk_tree_root; static DEFINE_MUTEX(blk_tree_mutex); static unsigned int root_users; @@ -237,6 +243,10 @@ static void blk_trace_cleanup(struct blk_trace *bt) free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); } int blk_trace_remove(struct request_queue *q) @@ -428,6 +438,14 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) { + ret = blk_register_tracepoints(); + if (ret) + goto probe_err; + } + mutex_unlock(&blk_probe_mutex); + ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); if (old_bt) { @@ -436,6 +454,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; +probe_err: + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); err: if (dir) blk_remove_tree(dir); @@ -562,3 +583,308 @@ void blk_trace_shutdown(struct request_queue *q) blk_trace_remove(q); } } + +/* + * blktrace probes + */ + +/** + * blk_add_trace_rq - Add a trace for a request oriented action + * @q: queue the io is for + * @rq: the source request + * @what: the action + * + * Description: + * Records an action against a request. Will log the bio offset + size. + * + **/ +static void blk_add_trace_rq(struct request_queue *q, struct request *rq, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + int rw = rq->cmd_flags & 0x03; + + if (likely(!bt)) + return; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) { + what |= BLK_TC_ACT(BLK_TC_PC); + __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, + sizeof(rq->cmd), rq->cmd); + } else { + what |= BLK_TC_ACT(BLK_TC_FS); + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + rw, what, rq->errors, 0, NULL); + } +} + +static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ABORT); +} + +static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_INSERT); +} + +static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); +} + +static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); +} + +static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * + * Description: + * Records an action against a bio. Will log the bio offset + size. + * + **/ +static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, + !bio_flagged(bio, BIO_UPTODATE), 0, NULL); +} + +static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); +} + +static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); +} + +static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); +} + +static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); +} + +static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); +} + +static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + } +} + + +static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL); + } +} + +static void blk_add_trace_plug(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +} + +static void blk_add_trace_unplug_io(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_unplug_timer(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_split(struct request_queue *q, struct bio *bio, + unsigned int pdu) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + sizeof(rpdu), &rpdu); + } +} + +/** + * blk_add_trace_remap - Add a trace for a remap operation + * @q: queue the io is for + * @bio: the source bio + * @dev: target device + * @from: source sector + * @to: target sector + * + * Description: + * Device mapper or raid target sometimes need to split a bio because + * it spans a stripe (or similar). Add a trace for that action. + * + **/ +static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from, sector_t to) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device = cpu_to_be32(dev); + r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector = cpu_to_be64(to); + + __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +} + +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @q: queue the io is for + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. + * + **/ +void blk_add_driver_data(struct request_queue *q, + struct request *rq, + void *data, size_t len) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (blk_pc_request(rq)) + __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, + rq->errors, len, data); + else + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + 0, BLK_TA_DRV_DATA, rq->errors, len, data); +} +EXPORT_SYMBOL_GPL(blk_add_driver_data); + +static int blk_register_tracepoints(void) +{ + int ret; + + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + WARN_ON(ret); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + WARN_ON(ret); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + WARN_ON(ret); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + WARN_ON(ret); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + WARN_ON(ret); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + WARN_ON(ret); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + WARN_ON(ret); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + WARN_ON(ret); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + WARN_ON(ret); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + WARN_ON(ret); + ret = register_trace_block_getrq(blk_add_trace_getrq); + WARN_ON(ret); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + WARN_ON(ret); + ret = register_trace_block_plug(blk_add_trace_plug); + WARN_ON(ret); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + WARN_ON(ret); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); + WARN_ON(ret); + ret = register_trace_block_split(blk_add_trace_split); + WARN_ON(ret); + ret = register_trace_block_remap(blk_add_trace_remap); + WARN_ON(ret); + return 0; +} + +static void blk_unregister_tracepoints(void) +{ + unregister_trace_block_remap(blk_add_trace_remap); + unregister_trace_block_split(blk_add_trace_split); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); + unregister_trace_block_plug(blk_add_trace_plug); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq); + unregister_trace_block_getrq(blk_add_trace_getrq); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + + tracepoint_synchronize_unregister(); +} diff --git a/block/elevator.c b/block/elevator.c index a6951f76ba0..86836dd179c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -33,6 +33,7 @@ #include <linux/compiler.h> #include <linux/delay.h> #include <linux/blktrace_api.h> +#include <trace/block.h> #include <linux/hash.h> #include <linux/uaccess.h> @@ -41,6 +42,8 @@ static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); +DEFINE_TRACE(block_rq_abort); + /* * Merge hash stuff. */ @@ -52,6 +55,9 @@ static const int elv_hash_shift = 6; #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +DEFINE_TRACE(block_rq_insert); +DEFINE_TRACE(block_rq_issue); + /* * Query io scheduler to see if the current process issuing bio may be * merged with rq. @@ -586,7 +592,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) unsigned ordseq; int unplug_it = 1; - blk_add_trace_rq(q, rq, BLK_TA_INSERT); + trace_block_rq_insert(q, rq); rq->q = q; @@ -772,7 +778,7 @@ struct request *elv_next_request(struct request_queue *q) * not be passed by new incoming requests */ rq->cmd_flags |= REQ_STARTED; - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); + trace_block_rq_issue(q, rq); } if (!q->boundary_rq || q->boundary_rq == rq) { @@ -914,7 +920,7 @@ void elv_abort_queue(struct request_queue *q) while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); rq->cmd_flags |= REQ_QUIET; - blk_add_trace_rq(q, rq, BLK_TA_ABORT); + trace_block_rq_abort(q, rq); __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); } } diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 64f5d54f7ed..4259072f5bd 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -109,7 +109,7 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL); */ static ssize_t print_cpus_map(char *buf, cpumask_t *map) { - int n = cpulist_scnprintf(buf, PAGE_SIZE-2, *map); + int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map); buf[n++] = '\n'; buf[n] = '\0'; diff --git a/drivers/base/node.c b/drivers/base/node.c index f5207090885..91636cd8b6c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -30,8 +30,8 @@ static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf) BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1)); len = type? - cpulist_scnprintf(buf, PAGE_SIZE-2, *mask): - cpumask_scnprintf(buf, PAGE_SIZE-2, *mask); + cpulist_scnprintf(buf, PAGE_SIZE-2, mask) : + cpumask_scnprintf(buf, PAGE_SIZE-2, mask); buf[len++] = '\n'; buf[len] = '\0'; return len; diff --git a/drivers/base/topology.c b/drivers/base/topology.c index 199cd97e32e..a8bc1cbcfa7 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -49,8 +49,8 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) if (len > 1) { n = type? - cpulist_scnprintf(buf, len-2, *mask): - cpumask_scnprintf(buf, len-2, *mask); + cpulist_scnprintf(buf, len-2, mask) : + cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; buf[n] = '\0'; } diff --git a/drivers/char/random.c b/drivers/char/random.c index 675076f5fca..d26891bfcd4 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -558,23 +558,9 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -static struct timer_rand_state *irq_timer_state[NR_IRQS]; - -static struct timer_rand_state *get_timer_rand_state(unsigned int irq) -{ - if (irq >= nr_irqs) - return NULL; - - return irq_timer_state[irq]; -} - -static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) -{ - if (irq >= nr_irqs) - return; - - irq_timer_state[irq] = state; -} +#ifndef CONFIG_SPARSE_IRQ +struct timer_rand_state *irq_timer_state[NR_IRQS]; +#endif static struct timer_rand_state input_timer_state; @@ -933,8 +919,10 @@ void rand_initialize_irq(int irq) { struct timer_rand_state *state; +#ifndef CONFIG_SPARSE_IRQ if (irq >= nr_irqs) return; +#endif state = get_timer_rand_state(irq); diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index ce0d9da52a8..94966edfb44 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -274,6 +274,22 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; +#ifdef CONFIG_TRACING +#include <linux/ftrace.h> + +static void sysrq_ftrace_dump(int key, struct tty_struct *tty) +{ + ftrace_dump(); +} +static struct sysrq_key_op sysrq_ftrace_dump_op = { + .handler = sysrq_ftrace_dump, + .help_msg = "dumpZ-ftrace-buffer", + .action_msg = "Dump ftrace buffer", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; +#else +#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0) +#endif static void sysrq_handle_showmem(int key, struct tty_struct *tty) { @@ -406,7 +422,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = { NULL, /* x */ /* y: May be registered on sparc64 for global register dump */ NULL, /* y */ - NULL /* z */ + &sysrq_ftrace_dump_op, /* z */ }; /* key2index calculation, -1 on invalid index */ diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index f450588e585..254f1064d97 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -154,7 +154,6 @@ static struct tc_clkevt_device clkevt = { .shift = 32, /* Should be lower than at91rm9200's system timer */ .rating = 125, - .cpumask = CPU_MASK_CPU0, .set_next_event = tc_next_event, .set_mode = tc_mode, }, @@ -195,6 +194,7 @@ static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) clkevt.clkevt.max_delta_ns = clockevent_delta2ns(0xffff, &clkevt.clkevt); clkevt.clkevt.min_delta_ns = clockevent_delta2ns(1, &clkevt.clkevt) + 1; + clkevt.clkevt.cpumask = cpumask_of(0); setup_irq(irq, &tc_irqaction); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c99e4728ff4..343094c3fee 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -21,6 +21,7 @@ #include <linux/idr.h> #include <linux/hdreg.h> #include <linux/blktrace_api.h> +#include <trace/block.h> #define DM_MSG_PREFIX "core" @@ -51,6 +52,8 @@ struct dm_target_io { union map_info info; }; +DEFINE_TRACE(block_bio_complete); + union map_info *dm_get_mapinfo(struct bio *bio) { if (bio && bio->bi_private) @@ -504,8 +507,7 @@ static void dec_pending(struct dm_io *io, int error) end_io_acct(io); if (io->error != DM_ENDIO_REQUEUE) { - blk_add_trace_bio(io->md->queue, io->bio, - BLK_TA_COMPLETE); + trace_block_bio_complete(io->md->queue, io->bio); bio_endio(io->bio, io->error); } @@ -598,7 +600,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ - blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, + trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, tio->io->bio->bi_bdev->bd_dev, clone->bi_sector, sector); diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 7beffcab274..9dedbbd218c 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -704,16 +704,17 @@ static unsigned int iosapic_startup_irq(unsigned int irq) } #ifdef CONFIG_SMP -static void iosapic_set_affinity_irq(unsigned int irq, cpumask_t dest) +static void iosapic_set_affinity_irq(unsigned int irq, + const struct cpumask *dest) { struct vector_info *vi = iosapic_get_vector(irq); u32 d0, d1, dummy_d0; unsigned long flags; - if (cpu_check_affinity(irq, &dest)) + if (cpu_check_affinity(irq, dest)) return; - vi->txn_addr = txn_affinity_addr(irq, first_cpu(dest)); + vi->txn_addr = txn_affinity_addr(irq, cpumask_first(dest)); spin_lock_irqsave(&iosapic_lock, flags); /* d1 contains the destination CPU, so only want to set that diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 2de5a3238c9..f78371b2252 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -5,6 +5,7 @@ #include <linux/pci.h> #include <linux/irq.h> #include <asm/io_apic.h> +#include <asm/smp.h> #include <linux/intel-iommu.h> #include "intr_remapping.h" @@ -19,17 +20,75 @@ struct irq_2_iommu { u8 irte_mask; }; -static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; +#ifdef CONFIG_SPARSE_IRQ +static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu) +{ + struct irq_2_iommu *iommu; + int node; + + node = cpu_to_node(cpu); + + iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node); + printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node); + + return iommu; +} static struct irq_2_iommu *irq_2_iommu(unsigned int irq) { - return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if (WARN_ON_ONCE(!desc)) + return NULL; + + return desc->irq_2_iommu; +} + +static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc; + struct irq_2_iommu *irq_iommu; + + /* + * alloc irq desc if not allocated already. + */ + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + return NULL; + } + + irq_iommu = desc->irq_2_iommu; + + if (!irq_iommu) + desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu); + + return desc->irq_2_iommu; } static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { + return irq_2_iommu_alloc_cpu(irq, boot_cpu_id); +} + +#else /* !CONFIG_SPARSE_IRQ */ + +static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; + +static struct irq_2_iommu *irq_2_iommu(unsigned int irq) +{ + if (irq < nr_irqs) + return &irq_2_iommuX[irq]; + + return NULL; +} +static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) +{ return irq_2_iommu(irq); } +#endif static DEFINE_SPINLOCK(irq_2_ir_lock); @@ -86,9 +145,11 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) if (!count) return -1; +#ifndef CONFIG_SPARSE_IRQ /* protect irq_2_iommu_alloc later */ if (irq >= nr_irqs) return -1; +#endif /* * start the IRTE search from index 0. @@ -130,6 +191,12 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) table->base[i].present = 1; irq_iommu = irq_2_iommu_alloc(irq); + if (!irq_iommu) { + spin_unlock(&irq_2_ir_lock); + printk(KERN_ERR "can't allocate irq_2_iommu\n"); + return -1; + } + irq_iommu->iommu = iommu; irq_iommu->irte_index = index; irq_iommu->sub_handle = 0; @@ -177,6 +244,12 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) irq_iommu = irq_2_iommu_alloc(irq); + if (!irq_iommu) { + spin_unlock(&irq_2_ir_lock); + printk(KERN_ERR "can't allocate irq_2_iommu\n"); + return -1; + } + irq_iommu->iommu = iommu; irq_iommu->irte_index = index; irq_iommu->sub_handle = subhandle; diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 74801f7df9c..11a51f8ed3b 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -103,11 +103,11 @@ static void msix_set_enable(struct pci_dev *dev, int enable) } } -static void msix_flush_writes(unsigned int irq) +static void msix_flush_writes(struct irq_desc *desc) { struct msi_desc *entry; - entry = get_irq_msi(irq); + entry = get_irq_desc_msi(desc); BUG_ON(!entry || !entry->dev); switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: @@ -135,11 +135,11 @@ static void msix_flush_writes(unsigned int irq) * Returns 1 if it succeeded in masking the interrupt and 0 if the device * doesn't support MSI masking. */ -static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag) +static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag) { struct msi_desc *entry; - entry = get_irq_msi(irq); + entry = get_irq_desc_msi(desc); BUG_ON(!entry || !entry->dev); switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: @@ -172,9 +172,9 @@ static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag) return 1; } -void read_msi_msg(unsigned int irq, struct msi_msg *msg) +void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) { - struct msi_desc *entry = get_irq_msi(irq); + struct msi_desc *entry = get_irq_desc_msi(desc); switch(entry->msi_attrib.type) { case PCI_CAP_ID_MSI: { @@ -211,9 +211,16 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg) } } -void write_msi_msg(unsigned int irq, struct msi_msg *msg) +void read_msi_msg(unsigned int irq, struct msi_msg *msg) { - struct msi_desc *entry = get_irq_msi(irq); + struct irq_desc *desc = irq_to_desc(irq); + + read_msi_msg_desc(desc, msg); +} + +void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) +{ + struct msi_desc *entry = get_irq_desc_msi(desc); switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: { @@ -252,21 +259,31 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg) entry->msg = *msg; } +void write_msi_msg(unsigned int irq, struct msi_msg *msg) +{ + struct irq_desc *desc = irq_to_desc(irq); + + write_msi_msg_desc(desc, msg); +} + void mask_msi_irq(unsigned int irq) { - msi_set_mask_bits(irq, 1, 1); - msix_flush_writes(irq); + struct irq_desc *desc = irq_to_desc(irq); + + msi_set_mask_bits(desc, 1, 1); + msix_flush_writes(desc); } void unmask_msi_irq(unsigned int irq) { - msi_set_mask_bits(irq, 1, 0); - msix_flush_writes(irq); + struct irq_desc *desc = irq_to_desc(irq); + + msi_set_mask_bits(desc, 1, 0); + msix_flush_writes(desc); } static int msi_free_irqs(struct pci_dev* dev); - static struct msi_desc* alloc_msi_entry(void) { struct msi_desc *entry; @@ -303,9 +320,11 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_intx_for_msi(dev, 0); msi_set_enable(dev, 0); write_msi_msg(dev->irq, &entry->msg); - if (entry->msi_attrib.maskbit) - msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask, + if (entry->msi_attrib.maskbit) { + struct irq_desc *desc = irq_to_desc(dev->irq); + msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask, entry->msi_attrib.masked); + } pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); control &= ~PCI_MSI_FLAGS_QSIZE; @@ -327,8 +346,9 @@ static void __pci_restore_msix_state(struct pci_dev *dev) msix_set_enable(dev, 0); list_for_each_entry(entry, &dev->msi_list, list) { + struct irq_desc *desc = irq_to_desc(entry->irq); write_msi_msg(entry->irq, &entry->msg); - msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked); + msi_set_mask_bits(desc, 1, entry->msi_attrib.masked); } BUG_ON(list_empty(&dev->msi_list)); @@ -596,7 +616,8 @@ void pci_msi_shutdown(struct pci_dev* dev) /* Return the the pci reset with msi irqs unmasked */ if (entry->msi_attrib.maskbit) { u32 mask = entry->msi_attrib.maskbits_mask; - msi_set_mask_bits(dev->irq, mask, ~mask); + struct irq_desc *desc = irq_to_desc(dev->irq); + msi_set_mask_bits(desc, mask, ~mask); } if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) return; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 5d72866897a..c88485860a0 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -74,7 +74,7 @@ static ssize_t local_cpus_show(struct device *dev, int len; mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); - len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask); + len = cpumask_scnprintf(buf, PAGE_SIZE-2, &mask); buf[len++] = '\n'; buf[len] = '\0'; return len; @@ -88,7 +88,7 @@ static ssize_t local_cpulist_show(struct device *dev, int len; mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); - len = cpulist_scnprintf(buf, PAGE_SIZE-2, mask); + len = cpulist_scnprintf(buf, PAGE_SIZE-2, &mask); buf[len++] = '\n'; buf[len] = '\0'; return len; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 003a9b3c293..5b3f5937ecf 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -55,8 +55,8 @@ static ssize_t pci_bus_show_cpuaffinity(struct device *dev, cpumask = pcibus_to_cpumask(to_pci_bus(dev)); ret = type? - cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask): - cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask); + cpulist_scnprintf(buf, PAGE_SIZE-2, &cpumask) : + cpumask_scnprintf(buf, PAGE_SIZE-2, &cpumask); buf[ret++] = '\n'; buf[ret] = '\0'; return ret; diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 1e3b934a4cf..6c8193046e0 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -141,8 +141,12 @@ static void init_evtchn_cpu_bindings(void) int i; /* By default all event channels notify CPU#0. */ - for_each_irq_desc(i, desc) + for_each_irq_desc(i, desc) { + if (!desc) + continue; + desc->affinity = cpumask_of_cpu(0); + } #endif memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); @@ -231,7 +235,7 @@ static int find_unbound_irq(void) int irq; /* Only allocate from dynirq range */ - for_each_irq_nr(irq) + for (irq = 0; irq < nr_irqs; irq++) if (irq_bindcount[irq] == 0) break; @@ -579,7 +583,7 @@ void rebind_evtchn_irq(int evtchn, int irq) spin_unlock(&irq_mapping_update_lock); /* new event channels are always bound to cpu 0 */ - irq_set_affinity(irq, cpumask_of_cpu(0)); + irq_set_affinity(irq, cpumask_of(0)); /* Unmask the event channel. */ enable_irq(irq); @@ -608,9 +612,9 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) } -static void set_affinity_irq(unsigned irq, cpumask_t dest) +static void set_affinity_irq(unsigned irq, const struct cpumask *dest) { - unsigned tcpu = first_cpu(dest); + unsigned tcpu = cpumask_first(dest); rebind_irq_to_cpu(irq, tcpu); } @@ -792,7 +796,7 @@ void xen_irq_resume(void) mask_evtchn(evtchn); /* No IRQ <-> event-channel mappings. */ - for_each_irq_nr(irq) + for (irq = 0; irq < nr_irqs; irq++) irq_info[irq].evtchn = 0; /* zap event-channel binding */ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) @@ -824,7 +828,7 @@ void __init xen_init_IRQ(void) mask_evtchn(i); /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ - for_each_irq_nr(i) + for (i = 0; i < nr_irqs; i++) irq_bindcount[i] = 0; irq_ctx_init(smp_processor_id()); @@ -26,8 +26,11 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/blktrace_api.h> +#include <trace/block.h> #include <scsi/sg.h> /* for struct sg_iovec */ +DEFINE_TRACE(block_split); + static struct kmem_cache *bio_slab __read_mostly; static mempool_t *bio_split_pool __read_mostly; @@ -1263,7 +1266,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) if (!bp) return bp; - blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi, + trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_sector + first_sectors); BUG_ON(bi->bi_vcnt != 1); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 81904f07679..3bb1cf1e742 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -44,10 +44,13 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - - for_each_irq_nr(j) + for_each_irq_nr(j) { +#ifdef CONFIG_SPARSE_IRQ + if (!irq_to_desc(j)) + continue; +#endif sum += kstat_irqs_cpu(j, i); - + } sum += arch_irq_stat_cpu(i); } sum += arch_irq_stat(); @@ -92,7 +95,12 @@ static int show_stat(struct seq_file *p, void *v) /* sum again ? it could be updated? */ for_each_irq_nr(j) { per_irq_sum = 0; - +#ifdef CONFIG_SPARSE_IRQ + if (!irq_to_desc(j)) { + seq_printf(p, " %u", per_irq_sum); + continue; + } +#endif for_each_possible_cpu(i) per_irq_sum += kstat_irqs_cpu(j, i); diff --git a/fs/seq_file.c b/fs/seq_file.c index eba2eabcd2b..16c211558c2 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...) } EXPORT_SYMBOL(seq_printf); -static char *mangle_path(char *s, char *p, char *esc) +/** + * mangle_path - mangle and copy path to buffer beginning + * @s: buffer start + * @p: beginning of path in above buffer + * @esc: set of characters that need escaping + * + * Copy the path from @p to @s, replacing each occurrence of character from + * @esc with usual octal escape. + * Returns pointer past last written character in @s, or NULL in case of + * failure. + */ +char *mangle_path(char *s, char *p, char *esc) { while (s <= p) { char c = *p++; @@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc) } return NULL; } +EXPORT_SYMBOL(mangle_path); /* * return the absolute path of 'dentry' residing in mount 'mnt'. diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 54bbf6e04ee..0e9e2bc0ee9 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -40,6 +40,9 @@ #ifndef node_to_cpumask #define node_to_cpumask(node) ((void)node, cpu_online_map) #endif +#ifndef cpumask_of_node +#define cpumask_of_node(node) ((void)node, cpu_online_mask) +#endif #ifndef node_to_first_cpu #define node_to_first_cpu(node) ((void)(node),0) #endif @@ -54,9 +57,18 @@ ) #endif +#ifndef cpumask_of_pcibus +#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \ + cpu_all_mask : \ + cpumask_of_node(pcibus_to_node(bus))) +#endif + #endif /* CONFIG_NUMA */ -/* returns pointer to cpumask for specified node */ +/* + * returns pointer to cpumask for specified node + * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" + */ #ifndef node_to_cpumask_ptr #define node_to_cpumask_ptr(v, node) \ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 80744606bad..eba835a2c2c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -45,6 +45,22 @@ #define MCOUNT_REC() #endif +#ifdef CONFIG_TRACE_BRANCH_PROFILING +#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \ + *(_ftrace_annotated_branch) \ + VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .; +#else +#define LIKELY_PROFILE() +#endif + +#ifdef CONFIG_PROFILE_ALL_BRANCHES +#define BRANCH_PROFILE() VMLINUX_SYMBOL(__start_branch_profile) = .; \ + *(_ftrace_branch) \ + VMLINUX_SYMBOL(__stop_branch_profile) = .; +#else +#define BRANCH_PROFILE() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -60,9 +76,12 @@ VMLINUX_SYMBOL(__start___markers) = .; \ *(__markers) \ VMLINUX_SYMBOL(__stop___markers) = .; \ + . = ALIGN(32); \ VMLINUX_SYMBOL(__start___tracepoints) = .; \ *(__tracepoints) \ - VMLINUX_SYMBOL(__stop___tracepoints) = .; + VMLINUX_SYMBOL(__stop___tracepoints) = .; \ + LIKELY_PROFILE() \ + BRANCH_PROFILE() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/include/asm-m32r/smp.h b/include/asm-m32r/smp.h index c5dd6691669..b96a6d2ffbc 100644 --- a/include/asm-m32r/smp.h +++ b/include/asm-m32r/smp.h @@ -63,8 +63,6 @@ extern volatile int cpu_2_physid[NR_CPUS]; #define raw_smp_processor_id() (current_thread_info()->cpu) extern cpumask_t cpu_callout_map; -extern cpumask_t cpu_possible_map; -extern cpumask_t cpu_present_map; static __inline__ int hard_smp_processor_id(void) { diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h index 70a57c8c002..c980f5ba8de 100644 --- a/include/asm-m32r/system.h +++ b/include/asm-m32r/system.h @@ -23,7 +23,7 @@ */ #if defined(CONFIG_FRAME_POINTER) || \ - !defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER) + !defined(CONFIG_SCHED_OMIT_FRAME_POINTER) #define M32R_PUSH_FP " push fp\n" #define M32R_POP_FP " pop fp\n" #else diff --git a/include/linux/Kbuild b/include/linux/Kbuild index e531783e5d7..95ac82340c3 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -313,6 +313,7 @@ unifdef-y += ptrace.h unifdef-y += qnx4_fs.h unifdef-y += quota.h unifdef-y += random.h +unifdef-y += irqnr.h unifdef-y += reboot.h unifdef-y += reiserfs_fs.h unifdef-y += reiserfs_xattr.h diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index bdf505d33e7..1dba3493d52 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -160,7 +160,6 @@ struct blk_trace { extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_user_trace_setup *buts); extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); @@ -186,168 +185,8 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); } while (0) #define BLK_TN_MAX_MSG 128 -/** - * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for - * @rq: the source request - * @what: the action - * - * Description: - * Records an action against a request. Will log the bio offset + size. - * - **/ -static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; - - if (likely(!bt)) - return; - - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); - - if (blk_pc_request(rq)) { - what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd); - } else { - what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL); - } -} - -/** - * blk_add_trace_bio - Add a trace for a bio oriented action - * @q: queue the io is for - * @bio: the source bio - * @what: the action - * - * Description: - * Records an action against a bio. Will log the bio offset + size. - * - **/ -static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); -} - -/** - * blk_add_trace_generic - Add a trace for a generic action - * @q: queue the io is for - * @bio: the source bio - * @rw: the data direction - * @what: the action - * - * Description: - * Records a simple trace - * - **/ -static inline void blk_add_trace_generic(struct request_queue *q, - struct bio *bio, int rw, u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (bio) - blk_add_trace_bio(q, bio, what); - else - __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL); -} - -/** - * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload - * @q: queue the io is for - * @what: the action - * @bio: the source bio - * @pdu: the integer payload - * - * Description: - * Adds a trace with some integer payload. This might be an unplug - * option given as the action, with the depth at unplug time given - * as the payload - * - **/ -static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what, - struct bio *bio, unsigned int pdu) -{ - struct blk_trace *bt = q->blk_trace; - __be64 rpdu = cpu_to_be64(pdu); - - if (likely(!bt)) - return; - - if (bio) - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); - else - __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); -} - -/** - * blk_add_trace_remap - Add a trace for a remap operation - * @q: queue the io is for - * @bio: the source bio - * @dev: target device - * @from: source sector - * @to: target sector - * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * - **/ -static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) -{ - struct blk_trace *bt = q->blk_trace; - struct blk_io_trace_remap r; - - if (likely(!bt)) - return; - - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); - - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); -} - -/** - * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for - * @rq: io request - * @data: driver-specific data - * @len: length of driver-specific data - * - * Description: - * Some drivers might want to write driver-specific data per request. - * - **/ -static inline void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); - else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); -} - +extern void blk_add_driver_data(struct request_queue *q, struct request *rq, + void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); @@ -356,13 +195,8 @@ extern int blk_trace_remove(struct request_queue *q); #else /* !CONFIG_BLK_DEV_IO_TRACE */ #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) #define blk_trace_shutdown(q) do { } while (0) -#define blk_add_trace_rq(q, rq, what) do { } while (0) -#define blk_add_trace_bio(q, rq, what) do { } while (0) -#define blk_add_trace_generic(q, rq, rw, what) do { } while (0) -#define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0) -#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0) -#define blk_add_driver_data(q, rq, data, len) do {} while (0) #define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY) +#define blk_add_driver_data(q, rq, data, len) do {} while (0) #define blk_trace_setup(q, name, dev, arg) (-ENOTTY) #define blk_trace_startstop(q, start) (-ENOTTY) #define blk_trace_remove(q) (-ENOTTY) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index ed3a5d473e5..cea153697ec 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -82,13 +82,13 @@ struct clock_event_device { int shift; int rating; int irq; - cpumask_t cpumask; + const struct cpumask *cpumask; int (*set_next_event)(unsigned long evt, struct clock_event_device *); void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); void (*event_handler)(struct clock_event_device *); - void (*broadcast)(cpumask_t mask); + void (*broadcast)(const struct cpumask *mask); struct list_head list; enum clock_event_mode mode; ktime_t next_event; diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 98115d9d04d..ea7c6be354b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -59,8 +59,88 @@ extern void __chk_io_ptr(const volatile void __iomem *); * specific implementations come from the above header files */ -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) +struct ftrace_branch_data { + const char *func; + const char *file; + unsigned line; + union { + struct { + unsigned long correct; + unsigned long incorrect; + }; + struct { + unsigned long miss; + unsigned long hit; + }; + }; +}; + +/* + * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code + * to disable branch tracing on a per file basis. + */ +#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING) +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); + +#define likely_notrace(x) __builtin_expect(!!(x), 1) +#define unlikely_notrace(x) __builtin_expect(!!(x), 0) + +#define __branch_check__(x, expect) ({ \ + int ______r; \ + static struct ftrace_branch_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_annotated_branch"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______r = likely_notrace(x); \ + ftrace_likely_update(&______f, ______r, expect); \ + ______r; \ + }) + +/* + * Using __builtin_constant_p(x) to ignore cases where the return + * value is always the same. This idea is taken from a similar patch + * written by Daniel Walker. + */ +# ifndef likely +# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1)) +# endif +# ifndef unlikely +# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) +# endif + +#ifdef CONFIG_PROFILE_ALL_BRANCHES +/* + * "Define 'is'", Bill Clinton + * "Define 'if'", Steven Rostedt + */ +#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) : \ + ({ \ + int ______r; \ + static struct ftrace_branch_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_branch"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______r = !!(cond); \ + if (______r) \ + ______f.hit++; \ + else \ + ______f.miss++; \ + ______r; \ + })) +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + +#else +# define likely(x) __builtin_expect(!!(x), 1) +# define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* Optimization barrier */ #ifndef barrier diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 21e1dd43e52..d4bf52603e6 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -339,36 +339,6 @@ extern cpumask_t cpu_mask_all; #endif #define CPUMASK_PTR(v, m) cpumask_t *v = &(m->v) -#define cpumask_scnprintf(buf, len, src) \ - __cpumask_scnprintf((buf), (len), &(src), NR_CPUS) -static inline int __cpumask_scnprintf(char *buf, int len, - const cpumask_t *srcp, int nbits) -{ - return bitmap_scnprintf(buf, len, srcp->bits, nbits); -} - -#define cpumask_parse_user(ubuf, ulen, dst) \ - __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS) -static inline int __cpumask_parse_user(const char __user *buf, int len, - cpumask_t *dstp, int nbits) -{ - return bitmap_parse_user(buf, len, dstp->bits, nbits); -} - -#define cpulist_scnprintf(buf, len, src) \ - __cpulist_scnprintf((buf), (len), &(src), NR_CPUS) -static inline int __cpulist_scnprintf(char *buf, int len, - const cpumask_t *srcp, int nbits) -{ - return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); -} - -#define cpulist_parse(buf, dst) __cpulist_parse((buf), &(dst), NR_CPUS) -static inline int __cpulist_parse(const char *buf, cpumask_t *dstp, int nbits) -{ - return bitmap_parselist(buf, dstp->bits, nbits); -} - #define cpu_remap(oldbit, old, new) \ __cpu_remap((oldbit), &(old), &(new), NR_CPUS) static inline int __cpu_remap(int oldbit, @@ -540,9 +510,6 @@ extern cpumask_t cpu_active_map; [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ } -/* This produces more efficient code. */ -#define nr_cpumask_bits NR_CPUS - #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ @@ -550,9 +517,15 @@ extern cpumask_t cpu_active_map; [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ } +#endif /* NR_CPUS > BITS_PER_LONG */ +#ifdef CONFIG_CPUMASK_OFFSTACK +/* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, + * not all bits may be allocated. */ #define nr_cpumask_bits nr_cpu_ids -#endif /* NR_CPUS > BITS_PER_LONG */ +#else +#define nr_cpumask_bits NR_CPUS +#endif /* verify cpu argument to cpumask_* operators */ static inline unsigned int cpumask_check(unsigned int cpu) @@ -946,6 +919,63 @@ static inline void cpumask_copy(struct cpumask *dstp, #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** + * cpumask_scnprintf - print a cpumask into a string as comma-separated hex + * @buf: the buffer to sprintf into + * @len: the length of the buffer + * @srcp: the cpumask to print + * + * If len is zero, returns zero. Otherwise returns the length of the + * (nul-terminated) @buf string. + */ +static inline int cpumask_scnprintf(char *buf, int len, + const struct cpumask *srcp) +{ + return bitmap_scnprintf(buf, len, srcp->bits, nr_cpumask_bits); +} + +/** + * cpumask_parse_user - extract a cpumask from a user string + * @buf: the buffer to extract from + * @len: the length of the buffer + * @dstp: the cpumask to set. + * + * Returns -errno, or 0 for success. + */ +static inline int cpumask_parse_user(const char __user *buf, int len, + struct cpumask *dstp) +{ + return bitmap_parse_user(buf, len, dstp->bits, nr_cpumask_bits); +} + +/** + * cpulist_scnprintf - print a cpumask into a string as comma-separated list + * @buf: the buffer to sprintf into + * @len: the length of the buffer + * @srcp: the cpumask to print + * + * If len is zero, returns zero. Otherwise returns the length of the + * (nul-terminated) @buf string. + */ +static inline int cpulist_scnprintf(char *buf, int len, + const struct cpumask *srcp) +{ + return bitmap_scnlistprintf(buf, len, srcp->bits, nr_cpumask_bits); +} + +/** + * cpulist_parse_user - extract a cpumask from a user string of ranges + * @buf: the buffer to extract from + * @len: the length of the buffer + * @dstp: the cpumask to set. + * + * Returns -errno, or 0 for success. + */ +static inline int cpulist_parse(const char *buf, struct cpumask *dstp) +{ + return bitmap_parselist(buf, dstp->bits, nr_cpumask_bits); +} + +/** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 4aaa4afb1cb..096476f1fb3 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -17,7 +17,7 @@ extern int debug_locks_off(void); ({ \ int __ret = 0; \ \ - if (unlikely(c)) { \ + if (!oops_in_progress && unlikely(c)) { \ if (debug_locks_off() && !debug_locks_silent) \ WARN_ON(1); \ __ret = 1; \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9c5bc6be2b0..985b28dc2ba 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -8,6 +8,8 @@ #include <linux/types.h> #include <linux/module.h> #include <linux/kallsyms.h> +#include <linux/bitops.h> +#include <linux/sched.h> #ifdef CONFIG_FUNCTION_TRACER @@ -24,6 +26,45 @@ struct ftrace_ops { struct ftrace_ops *next; }; +extern int function_trace_stop; + +/* + * Type of the current tracing. + */ +enum ftrace_tracing_type_t { + FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */ + FTRACE_TYPE_RETURN, /* Hook the return of the function */ +}; + +/* Current tracing type, default is FTRACE_TYPE_ENTER */ +extern enum ftrace_tracing_type_t ftrace_tracing_type; + +/** + * ftrace_stop - stop function tracer. + * + * A quick way to stop the function tracer. Note this an on off switch, + * it is not something that is recursive like preempt_disable. + * This does not disable the calling of mcount, it only stops the + * calling of functions from mcount. + */ +static inline void ftrace_stop(void) +{ + function_trace_stop = 1; +} + +/** + * ftrace_start - start the function tracer. + * + * This function is the inverse of ftrace_stop. This does not enable + * the function tracing if the function tracer is disabled. This only + * sets the function tracer flag to continue calling the functions + * from mcount. + */ +static inline void ftrace_start(void) +{ + function_trace_stop = 0; +} + /* * The ftrace_ops must be a static and should also * be read_mostly. These functions do modify read_mostly variables @@ -42,9 +83,13 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1); # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) static inline void ftrace_kill(void) { } +static inline void ftrace_stop(void) { } +static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE +/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ +#include <asm/ftrace.h> enum { FTRACE_FL_FREE = (1 << 0), @@ -60,6 +105,7 @@ struct dyn_ftrace { struct list_head list; unsigned long ip; /* address of mcount call-site */ unsigned long flags; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); @@ -67,19 +113,25 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); -extern unsigned char *ftrace_nop_replace(void); -extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); extern int ftrace_dyn_arch_init(void *data); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern void ftrace_graph_caller(void); +extern int ftrace_enable_ftrace_graph_caller(void); +extern int ftrace_disable_ftrace_graph_caller(void); +#else +static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; } +static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } +#endif /** - * ftrace_modify_code - modify code segment - * @ip: the address of the code segment - * @old_code: the contents of what is expected to be there - * @new_code: the code to patch in + * ftrace_make_nop - convert code into top + * @mod: module structure if called by module load initialization + * @rec: the mcount call site record + * @addr: the address that the call site should be calling * * This is a very sensitive operation and great care needs * to be taken by the arch. The operation should carefully @@ -87,6 +139,8 @@ extern void mcount_call(void); * what we expect it to be, and then on success of the compare, * it should write to the location. * + * The code segment at @rec->ip should be a caller to @addr + * * Return must be: * 0 on success * -EFAULT on error reading the location @@ -94,8 +148,34 @@ extern void mcount_call(void); * -EPERM on error writing to the location * Any other value will be considered a failure. */ -extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, - unsigned char *new_code); +extern int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr); + +/** + * ftrace_make_call - convert a nop call site into a call to addr + * @rec: the mcount call site record + * @addr: the address that the call site should call + * + * This is a very sensitive operation and great care needs + * to be taken by the arch. The operation should carefully + * read the location, check to see if what is read is indeed + * what we expect it to be, and then on success of the compare, + * it should write to the location. + * + * The code segment at @rec->ip should be a nop + * + * Return must be: + * 0 on success + * -EFAULT on error reading the location + * -EINVAL on a failed compare of the contents + * -EPERM on error writing to the location + * Any other value will be considered a failure. + */ +extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); + + +/* May be defined in arch */ +extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); @@ -103,7 +183,6 @@ extern void ftrace_release(void *start, unsigned long size); extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); - #else # define skip_trace(ip) ({ 0; }) # define ftrace_force_update() ({ 0; }) @@ -182,6 +261,12 @@ static inline void __ftrace_enabled_restore(int enabled) #endif #ifdef CONFIG_TRACING +extern int ftrace_dump_on_oops; + +extern void tracing_start(void); +extern void tracing_stop(void); +extern void ftrace_off_permanent(void); + extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); @@ -212,6 +297,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); +static inline void tracing_start(void) { } +static inline void tracing_stop(void) { } +static inline void ftrace_off_permanent(void) { } static inline int ftrace_printk(const char *fmt, ...) { @@ -222,33 +310,167 @@ static inline void ftrace_dump(void) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); -extern void ftrace_init_module(unsigned long *start, unsigned long *end); +extern void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } static inline void -ftrace_init_module(unsigned long *start, unsigned long *end) { } +ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { } #endif +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; + +struct power_trace { +#ifdef CONFIG_POWER_TRACER + ktime_t stamp; + ktime_t end; + int type; + int state; +#endif +}; + +#ifdef CONFIG_POWER_TRACER +extern void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_end(struct power_trace *it); +#else +static inline void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_end(struct power_trace *it) { } +#endif + + +/* + * Structure that defines an entry function trace. + */ +struct ftrace_graph_ent { + unsigned long func; /* Current function */ + int depth; +}; -struct boot_trace { - pid_t caller; - char func[KSYM_SYMBOL_LEN]; - int result; - unsigned long long duration; /* usecs */ - ktime_t calltime; - ktime_t rettime; +/* + * Structure that defines a return function trace. + */ +struct ftrace_graph_ret { + unsigned long func; /* Current function */ + unsigned long long calltime; + unsigned long long rettime; + /* Number of functions that overran the depth limit for current task */ + unsigned long overrun; + int depth; }; -#ifdef CONFIG_BOOT_TRACER -extern void trace_boot(struct boot_trace *it, initcall_t fn); -extern void start_boot_trace(void); -extern void stop_boot_trace(void); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* + * Sometimes we don't want to trace a function with the function + * graph tracer but we want them to keep traced by the usual function + * tracer if the function graph tracer is not configured. + */ +#define __notrace_funcgraph notrace + +#define FTRACE_RETFUNC_DEPTH 50 +#define FTRACE_RETSTACK_ALLOC_SIZE 32 +/* Type of the callback handlers for tracing function graph*/ +typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ +typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ + +extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc); + +extern void ftrace_graph_stop(void); + +/* The current handlers in use */ +extern trace_func_graph_ret_t ftrace_graph_return; +extern trace_func_graph_ent_t ftrace_graph_entry; + +extern void unregister_ftrace_graph(void); + +extern void ftrace_graph_init_task(struct task_struct *t); +extern void ftrace_graph_exit_task(struct task_struct *t); + +static inline int task_curr_ret_stack(struct task_struct *t) +{ + return t->curr_ret_stack; +} + +static inline void pause_graph_tracing(void) +{ + atomic_inc(¤t->tracing_graph_pause); +} + +static inline void unpause_graph_tracing(void) +{ + atomic_dec(¤t->tracing_graph_pause); +} #else -static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } -static inline void start_boot_trace(void) { } -static inline void stop_boot_trace(void) { } + +#define __notrace_funcgraph + +static inline void ftrace_graph_init_task(struct task_struct *t) { } +static inline void ftrace_graph_exit_task(struct task_struct *t) { } + +static inline int task_curr_ret_stack(struct task_struct *tsk) +{ + return -1; +} + +static inline void pause_graph_tracing(void) { } +static inline void unpause_graph_tracing(void) { } #endif +#ifdef CONFIG_TRACING +#include <linux/sched.h> + +/* flags for current->trace */ +enum { + TSK_TRACE_FL_TRACE_BIT = 0, + TSK_TRACE_FL_GRAPH_BIT = 1, +}; +enum { + TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT, + TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT, +}; + +static inline void set_tsk_trace_trace(struct task_struct *tsk) +{ + set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); +} + +static inline void clear_tsk_trace_trace(struct task_struct *tsk) +{ + clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); +} + +static inline int test_tsk_trace_trace(struct task_struct *tsk) +{ + return tsk->trace & TSK_TRACE_FL_TRACE; +} + +static inline void set_tsk_trace_graph(struct task_struct *tsk) +{ + set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); +} + +static inline void clear_tsk_trace_graph(struct task_struct *tsk) +{ + clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); +} + +static inline int test_tsk_trace_graph(struct task_struct *tsk) +{ + return tsk->trace & TSK_TRACE_FL_GRAPH; +} +#endif /* CONFIG_TRACING */ #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h new file mode 100644 index 00000000000..366a054d0b0 --- /dev/null +++ b/include/linux/ftrace_irq.h @@ -0,0 +1,13 @@ +#ifndef _LINUX_FTRACE_IRQ_H +#define _LINUX_FTRACE_IRQ_H + + +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) +extern void ftrace_nmi_enter(void); +extern void ftrace_nmi_exit(void); +#else +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } +#endif + +#endif /* _LINUX_FTRACE_IRQ_H */ diff --git a/include/linux/futex.h b/include/linux/futex.h index 586ab56a3ec..8f627b9ae2b 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -164,6 +164,8 @@ union futex_key { } both; }; +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } + #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); extern void exit_pi_state_list(struct task_struct *curr); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 181006cc94a..89a56d79e4c 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -4,6 +4,7 @@ #include <linux/preempt.h> #include <linux/smp_lock.h> #include <linux/lockdep.h> +#include <linux/ftrace_irq.h> #include <asm/hardirq.h> #include <asm/system.h> @@ -161,7 +162,17 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) -#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) +#define nmi_enter() \ + do { \ + ftrace_nmi_enter(); \ + lockdep_off(); \ + __irq_enter(); \ + } while (0) +#define nmi_exit() \ + do { \ + __irq_exit(); \ + lockdep_on(); \ + ftrace_nmi_exit(); \ + } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f58a0cf8929..7e85a6e89e4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -14,6 +14,8 @@ #include <linux/irqflags.h> #include <linux/smp.h> #include <linux/percpu.h> +#include <linux/irqnr.h> + #include <asm/atomic.h> #include <asm/ptrace.h> #include <asm/system.h> @@ -109,13 +111,13 @@ extern void enable_irq(unsigned int irq); extern cpumask_t irq_default_affinity; -extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask); +extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); #else /* CONFIG_SMP */ -static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) { return -EINVAL; } diff --git a/include/linux/irq.h b/include/linux/irq.h index 3dddfa703eb..fde5e613201 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -113,7 +113,8 @@ struct irq_chip { void (*eoi)(unsigned int irq); void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, cpumask_t dest); + void (*set_affinity)(unsigned int irq, + const struct cpumask *dest); int (*retrigger)(unsigned int irq); int (*set_type)(unsigned int irq, unsigned int flow_type); int (*set_wake)(unsigned int irq, unsigned int on); @@ -129,6 +130,8 @@ struct irq_chip { const char *typename; }; +struct timer_rand_state; +struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @irq: interrupt number for this descriptor @@ -154,6 +157,13 @@ struct irq_chip { */ struct irq_desc { unsigned int irq; +#ifdef CONFIG_SPARSE_IRQ + struct timer_rand_state *timer_rand_state; + unsigned int *kstat_irqs; +# ifdef CONFIG_INTR_REMAP + struct irq_2_iommu *irq_2_iommu; +# endif +#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -181,12 +191,51 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; +extern void early_irq_init(void); +extern void arch_early_irq_init(void); +extern void arch_init_chip_data(struct irq_desc *desc, int cpu); +extern void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu); +extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); +#ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; static inline struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < nr_irqs) ? irq_desc + irq : NULL; + return (irq < NR_IRQS) ? irq_desc + irq : NULL; +} +static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +{ + return irq_to_desc(irq); +} + +#else + +extern struct irq_desc *irq_to_desc(unsigned int irq); +extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq)) +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq)) + +#define kstat_irqs_this_cpu(DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]) +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]++) + +#endif + +static inline struct irq_desc * +irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) +{ +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + return irq_to_desc(irq); +#else + return desc; +#endif } /* @@ -380,6 +429,11 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #define get_irq_data(irq) (irq_to_desc(irq)->handler_data) #define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) +#define get_irq_desc_chip(desc) ((desc)->chip) +#define get_irq_desc_chip_data(desc) ((desc)->chip_data) +#define get_irq_desc_data(desc) ((desc)->handler_data) +#define get_irq_desc_msi(desc) ((desc)->msi_desc) + #endif /* CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 452c280c811..95d2b74641f 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -1,24 +1,38 @@ #ifndef _LINUX_IRQNR_H #define _LINUX_IRQNR_H +/* + * Generic irq_desc iterators: + */ +#ifdef __KERNEL__ + #ifndef CONFIG_GENERIC_HARDIRQS #include <asm/irq.h> # define nr_irqs NR_IRQS # define for_each_irq_desc(irq, desc) \ for (irq = 0; irq < nr_irqs; irq++) + +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1; irq >= 0; irq--) #else + extern int nr_irqs; +#ifndef CONFIG_SPARSE_IRQ + +struct irq_desc; # define for_each_irq_desc(irq, desc) \ for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) - -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ - irq >= 0; irq--, desc--) +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ + irq >= 0; irq--, desc--) +#endif #endif -#define for_each_irq_nr(irq) \ - for (irq = 0; irq < nr_irqs; irq++) +#define for_each_irq_nr(irq) \ + for (irq = 0; irq < nr_irqs; irq++) + +#endif /* __KERNEL__ */ #endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index dc7e0d0a647..269df5a17b3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -141,6 +141,15 @@ extern int _cond_resched(void); (__x < 0) ? -__x : __x; \ }) +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void); +#else +static inline void might_fault(void) +{ + might_sleep(); +} +#endif + extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(long time); NORET_TYPE void panic(const char * fmt, ...) @@ -188,6 +197,8 @@ extern unsigned long long memparse(const char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); +extern int func_ptr_is_kernel_text(void *ptr); + struct pid; extern struct pid *session_of_pgrp(struct pid *pgrp); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee0..4ee4b3d2316 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,7 +28,9 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; - unsigned int irqs[NR_IRQS]; +#ifndef CONFIG_SPARSE_IRQ + unsigned int irqs[NR_IRQS]; +#endif }; DECLARE_PER_CPU(struct kernel_stat, kstat); @@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); +#ifndef CONFIG_SPARSE_IRQ +#define kstat_irqs_this_cpu(irq) \ + (kstat_this_cpu.irqs[irq]) + struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, @@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, { kstat_this_cpu.irqs[irq]++; } +#endif + +#ifndef CONFIG_SPARSE_IRQ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; } +#else +extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); +#endif /* * Number of interrupts per specific IRQ source, since bootup diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 17f76fc0517..adc34f2c6ef 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -100,6 +100,10 @@ struct kimage { #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; + +#ifdef ARCH_HAS_KIMAGE_ARCH + struct kimage_arch arch; +#endif }; diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 29aec6e1002..8956daf64ab 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -73,6 +73,8 @@ struct lock_class_key { struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; }; +#define LOCKSTAT_POINTS 4 + /* * The lock-class itself: */ @@ -119,7 +121,8 @@ struct lock_class { int name_version; #ifdef CONFIG_LOCK_STAT - unsigned long contention_point[4]; + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; #endif }; @@ -144,6 +147,7 @@ enum bounce_type { struct lock_class_stats { unsigned long contention_point[4]; + unsigned long contending_point[4]; struct lock_time read_waittime; struct lock_time write_waittime; struct lock_time read_holdtime; @@ -165,6 +169,7 @@ struct lockdep_map { const char *name; #ifdef CONFIG_LOCK_STAT int cpu; + unsigned long ip; #endif }; @@ -356,7 +361,7 @@ struct lock_class_key { }; #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); -extern void lock_acquired(struct lockdep_map *lock); +extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ @@ -364,13 +369,13 @@ do { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ - lock_acquired(&(_lock)->dep_map); \ + lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) -#define lock_acquired(lockdep_map) do {} while (0) +#define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) @@ -481,4 +486,22 @@ static inline void print_irqtrace_events(struct task_struct *curr) # define lock_map_release(l) do { } while (0) #endif +#ifdef CONFIG_PROVE_LOCKING +# define might_lock(lock) \ +do { \ + typecheck(struct lockdep_map *, &(lock)->dep_map); \ + lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \ + lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ +} while (0) +# define might_lock_read(lock) \ +do { \ + typecheck(struct lockdep_map *, &(lock)->dep_map); \ + lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \ + lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ +} while (0) +#else +# define might_lock(lock) do { } while (0) +# define might_lock_read(lock) do { } while (0) +#endif + #endif /* __LINUX_LOCKDEP_H */ diff --git a/include/linux/marker.h b/include/linux/marker.h index 889196c7fbb..b85e74ca782 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -12,6 +12,7 @@ * See the file COPYING for more details. */ +#include <stdarg.h> #include <linux/types.h> struct module; @@ -48,10 +49,28 @@ struct marker { void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; + const char *tp_name; /* Optional tracepoint name */ + void *tp_cb; /* Optional tracepoint callback */ } __attribute__((aligned(8))); #ifdef CONFIG_MARKERS +#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format) \ + static const char __mstrtab_##name[] \ + __attribute__((section("__markers_strings"))) \ + = #name "\0" format; \ + static struct marker __mark_##name \ + __attribute__((section("__markers"), aligned(8))) = \ + { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ + 0, 0, marker_probe_cb, { __mark_empty_function, NULL},\ + NULL, tp_name_str, tp_cb } + +#define DEFINE_MARKER(name, format) \ + _DEFINE_MARKER(name, NULL, NULL, format) + +#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format) \ + _DEFINE_MARKER(name, #tp_name, tp_cb, format) + /* * Note : the empty asm volatile with read constraint is used here instead of a * "used" attribute to fix a gcc 4.1.x bug. @@ -65,14 +84,7 @@ struct marker { */ #define __trace_mark(generic, name, call_private, format, args...) \ do { \ - static const char __mstrtab_##name[] \ - __attribute__((section("__markers_strings"))) \ - = #name "\0" format; \ - static struct marker __mark_##name \ - __attribute__((section("__markers"), aligned(8))) = \ - { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ - 0, 0, marker_probe_cb, \ - { __mark_empty_function, NULL}, NULL }; \ + DEFINE_MARKER(name, format); \ __mark_check_format(format, ## args); \ if (unlikely(__mark_##name.state)) { \ (*__mark_##name.call) \ @@ -80,14 +92,39 @@ struct marker { } \ } while (0) +#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \ + do { \ + void __check_tp_type(void) \ + { \ + register_trace_##tp_name(tp_cb); \ + } \ + DEFINE_MARKER_TP(name, tp_name, tp_cb, format); \ + __mark_check_format(format, ## args); \ + (*__mark_##name.call)(&__mark_##name, call_private, \ + ## args); \ + } while (0) + extern void marker_update_probe_range(struct marker *begin, struct marker *end); + +#define GET_MARKER(name) (__mark_##name) + #else /* !CONFIG_MARKERS */ +#define DEFINE_MARKER(name, tp_name, tp_cb, format) #define __trace_mark(generic, name, call_private, format, args...) \ __mark_check_format(format, ## args) +#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \ + do { \ + void __check_tp_type(void) \ + { \ + register_trace_##tp_name(tp_cb); \ + } \ + __mark_check_format(format, ## args); \ + } while (0) static inline void marker_update_probe_range(struct marker *begin, struct marker *end) { } +#define GET_MARKER(name) #endif /* CONFIG_MARKERS */ /** @@ -117,6 +154,20 @@ static inline void marker_update_probe_range(struct marker *begin, __trace_mark(1, name, NULL, format, ## args) /** + * trace_mark_tp - Marker in a tracepoint callback + * @name: marker name, not quoted. + * @tp_name: tracepoint name, not quoted. + * @tp_cb: tracepoint callback. Should have an associated global symbol so it + * is not optimized away by the compiler (should not be static). + * @format: format string + * @args...: variable argument list + * + * Places a marker in a tracepoint callback. + */ +#define trace_mark_tp(name, tp_name, tp_cb, format, args...) \ + __trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args) + +/** * MARK_NOARGS - Format string for a marker with no argument. */ #define MARK_NOARGS " " @@ -136,8 +187,6 @@ extern marker_probe_func __mark_empty_function; extern void marker_probe_cb(const struct marker *mdata, void *call_private, ...); -extern void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, ...); /* * Connect a probe to a marker. @@ -162,8 +211,10 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe, /* * marker_synchronize_unregister must be called between the last marker probe - * unregistration and the end of module exit to make sure there is no caller - * executing a probe when it is freed. + * unregistration and the first one of + * - the end of module exit function + * - the free of any resource used by the probes + * to ensure the code and data are valid for any possibly running probes. */ #define marker_synchronize_unregister() synchronize_sched() diff --git a/include/linux/msi.h b/include/linux/msi.h index 8f293922720..d2b8a1e8ca1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -10,8 +10,11 @@ struct msi_msg { }; /* Helper functions */ +struct irq_desc; extern void mask_msi_irq(unsigned int irq); extern void unmask_msi_irq(unsigned int irq); +extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); +extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); extern void read_msi_msg(unsigned int irq, struct msi_msg *msg); extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index bc6da10ceee..7a0e5c4f807 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); /* * NOTE: mutex_trylock() follows the spin_trylock() convention, * not the down_trylock() convention! + * + * Returns 1 if the mutex has been acquired successfully, and 0 on contention. */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); diff --git a/include/linux/pid.h b/include/linux/pid.h index d7e98ff8021..bb206c56d1f 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -147,9 +147,9 @@ pid_t pid_vnr(struct pid *pid); #define do_each_pid_task(pid, type, task) \ do { \ struct hlist_node *pos___; \ - if (pid != NULL) \ + if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), pos___, \ - &pid->tasks[type], pids[type].node) { + &(pid)->tasks[type], pids[type].node) { /* * Both old and new leaders may be attached to diff --git a/include/linux/random.h b/include/linux/random.h index 36f125c0c60..adbf3bd3c6b 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -8,6 +8,7 @@ #define _LINUX_RANDOM_H #include <linux/ioctl.h> +#include <linux/irqnr.h> /* ioctl()'s for the random number generator */ @@ -44,6 +45,56 @@ struct rand_pool_info { extern void rand_initialize_irq(int irq); +struct timer_rand_state; +#ifndef CONFIG_SPARSE_IRQ + +extern struct timer_rand_state *irq_timer_state[]; + +static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + if (irq >= nr_irqs) + return NULL; + + return irq_timer_state[irq]; +} + +static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) +{ + if (irq >= nr_irqs) + return; + + irq_timer_state[irq] = state; +} + +#else + +#include <linux/irq.h> +static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if (!desc) + return NULL; + + return desc->timer_rand_state; +} + +static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if (!desc) + return; + + desc->timer_rand_state = state; +} +#endif + + extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); extern void add_interrupt_randomness(int irq); diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 5f89b62e698..301dda829e3 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -41,7 +41,7 @@ #include <linux/seqlock.h> #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */ #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 86f1f5e43e3..895dc9c1088 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -142,6 +142,7 @@ struct rcu_head { * on the write-side to insure proper synchronization. */ #define rcu_read_lock_sched() preempt_disable() +#define rcu_read_lock_sched_notrace() preempt_disable_notrace() /* * rcu_read_unlock_sched - marks the end of a RCU-classic critical section @@ -149,6 +150,7 @@ struct rcu_head { * See rcu_read_lock_sched for more information. */ #define rcu_read_unlock_sched() preempt_enable() +#define rcu_read_unlock_sched_notrace() preempt_enable_notrace() diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e097c2e6b6d..d363467c8f1 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -28,17 +28,19 @@ struct ring_buffer_event { * size = 8 bytes * * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock - * array[0] = tv_nsec - * array[1] = tv_sec + * array[0] = tv_nsec + * array[1..2] = tv_sec * size = 16 bytes * * @RINGBUF_TYPE_DATA: Data record * If len is zero: * array[0] holds the actual length - * array[1..(length+3)/4-1] holds data + * array[1..(length+3)/4] holds data + * size = 4 + 4 + length (bytes) * else * length = len << 2 - * array[0..(length+3)/4] holds data + * array[0..(length+3)/4-1] holds data + * size = 4 + length (bytes) */ enum ring_buffer_type { RINGBUF_TYPE_PADDING, @@ -122,6 +124,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); void tracing_on(void); void tracing_off(void); +void tracing_off_permanent(void); + +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/include/linux/sched.h b/include/linux/sched.h index 55e30d11447..4240f6bfa81 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,6 +96,7 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; +struct bts_tracer; /* * List of flags we want to share for kernel threads, @@ -249,7 +250,7 @@ extern void init_idle_bootup_task(struct task_struct *idle); extern int runqueue_is_locked(void); extern void task_rq_unlock_wait(struct task_struct *p); -extern cpumask_t nohz_cpu_mask; +extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); #else @@ -259,8 +260,6 @@ static inline int select_nohz_load_balancer(int cpu) } #endif -extern unsigned long rt_needs_cpu(int cpu); - /* * Only dump TASK_* tasks. (0 for all tasks) */ @@ -777,7 +776,6 @@ enum cpu_idle_type { struct sched_group { struct sched_group *next; /* Must be a circular list */ - cpumask_t cpumask; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a @@ -790,8 +788,15 @@ struct sched_group { * (see include/linux/reciprocal_div.h) */ u32 reciprocal_cpu_power; + + unsigned long cpumask[]; }; +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + enum sched_domain_level { SD_LV_NONE = 0, SD_LV_SIBLING, @@ -815,7 +820,6 @@ struct sched_domain { struct sched_domain *parent; /* top domain must be null terminated */ struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ - cpumask_t span; /* span of all CPUs in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ @@ -870,9 +874,17 @@ struct sched_domain { #ifdef CONFIG_SCHED_DEBUG char *name; #endif + + /* span of all CPUs in this domain */ + unsigned long span[]; }; -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +static inline struct cpumask *sched_domain_span(struct sched_domain *sd) +{ + return to_cpumask(sd->span); +} + +extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new); extern int arch_reinit_sched_domains(void); @@ -881,7 +893,7 @@ extern int arch_reinit_sched_domains(void); struct sched_domain_attr; static inline void -partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { } @@ -963,7 +975,7 @@ struct sched_class { void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, - const cpumask_t *newmask); + const struct cpumask *newmask); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); @@ -1165,6 +1177,18 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; +#ifdef CONFIG_X86_PTRACE_BTS + /* + * This is the tracer handle for the ptrace BTS extension. + * This field actually belongs to the ptracer task. + */ + struct bts_tracer *bts; + /* + * The buffer to hold the BTS data. + */ + void *bts_buffer; +#endif /* CONFIG_X86_PTRACE_BTS */ + /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; @@ -1356,6 +1380,23 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack *ret_stack; + /* + * Number of functions that haven't been traced + * because of depth overrun. + */ + atomic_t trace_overrun; + /* Pause for the tracing */ + atomic_t tracing_graph_pause; +#endif +#ifdef CONFIG_TRACING + /* state flags for use by tracers */ + unsigned long trace; +#endif }; /* @@ -1594,12 +1635,12 @@ extern cputime_t task_gtime(struct task_struct *p); #ifdef CONFIG_SMP extern int set_cpus_allowed_ptr(struct task_struct *p, - const cpumask_t *new_mask); + const struct cpumask *new_mask); #else static inline int set_cpus_allowed_ptr(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - if (!cpu_isset(0, *new_mask)) + if (!cpumask_test_cpu(0, new_mask)) return -EINVAL; return 0; } @@ -2212,8 +2253,8 @@ __trace_special(void *__tr, void *__data, } #endif -extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); -extern long sched_getaffinity(pid_t pid, cpumask_t *mask); +extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); +extern long sched_getaffinity(pid_t pid, struct cpumask *mask); extern int sched_mc_power_savings, sched_smt_power_savings; @@ -2224,6 +2265,7 @@ extern void normalize_rt_tasks(void); extern struct task_group init_task_group; #ifdef CONFIG_USER_SCHED extern struct task_group root_task_group; +extern void set_tg_uid(struct user_struct *user); #endif extern struct task_group *sched_create_group(struct task_group *parent); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index dc50bcc282a..b3dfa72f13b 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -34,6 +34,7 @@ struct seq_operations { #define SEQ_SKIP 1 +char *mangle_path(char *s, char *p, char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); loff_t seq_lseek(struct file *, loff_t, int); diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index b106fd8e0d5..1a8cecc4f38 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); extern void print_stack_trace(struct stack_trace *trace, int spaces); + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +extern void save_stack_trace_user(struct stack_trace *trace); +#else +# define save_stack_trace_user(trace) do { } while (0) +#endif + #else # define save_stack_trace(trace) do { } while (0) # define save_stack_trace_tsk(tsk, trace) do { } while (0) +# define save_stack_trace_user(trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) #endif diff --git a/include/linux/topology.h b/include/linux/topology.h index 117f1b7405c..0c5b5ac36d8 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -49,7 +49,7 @@ for_each_online_node(node) \ if (nr_cpus_node(node)) -void arch_update_cpu_topology(void); +int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c5bb39c7a77..75700545836 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -24,8 +24,12 @@ struct tracepoint { const char *name; /* Tracepoint name */ int state; /* State. */ void **funcs; -} __attribute__((aligned(8))); - +} __attribute__((aligned(32))); /* + * Aligned on 32 bytes because it is + * globally visible and gcc happily + * align these on the structure size. + * Keep in sync with vmlinux.lds.h. + */ #define TPPROTO(args...) args #define TPARGS(args...) args @@ -40,14 +44,14 @@ struct tracepoint { do { \ void **it_func; \ \ - rcu_read_lock_sched(); \ + rcu_read_lock_sched_notrace(); \ it_func = rcu_dereference((tp)->funcs); \ if (it_func) { \ do { \ ((void(*)(proto))(*it_func))(args); \ } while (*(++it_func)); \ } \ - rcu_read_unlock_sched(); \ + rcu_read_unlock_sched_notrace(); \ } while (0) /* @@ -55,35 +59,40 @@ struct tracepoint { * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. */ -#define DEFINE_TRACE(name, proto, args) \ +#define DECLARE_TRACE(name, proto, args) \ + extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - static const char __tpstrtab_##name[] \ - __attribute__((section("__tracepoints_strings"))) \ - = #name ":" #proto; \ - static struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"), aligned(8))) = \ - { __tpstrtab_##name, 0, NULL }; \ if (unlikely(__tracepoint_##name.state)) \ __DO_TRACE(&__tracepoint_##name, \ TPPROTO(proto), TPARGS(args)); \ } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ - return tracepoint_probe_register(#name ":" #proto, \ - (void *)probe); \ + return tracepoint_probe_register(#name, (void *)probe); \ } \ - static inline void unregister_trace_##name(void (*probe)(proto))\ + static inline int unregister_trace_##name(void (*probe)(proto)) \ { \ - tracepoint_probe_unregister(#name ":" #proto, \ - (void *)probe); \ + return tracepoint_probe_unregister(#name, (void *)probe);\ } +#define DEFINE_TRACE(name) \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) = #name; \ + struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"), aligned(32))) = \ + { __tpstrtab_##name, 0, NULL } + +#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ + EXPORT_SYMBOL_GPL(__tracepoint_##name) +#define EXPORT_TRACEPOINT_SYMBOL(name) \ + EXPORT_SYMBOL(__tracepoint_##name) + extern void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end); #else /* !CONFIG_TRACEPOINTS */ -#define DEFINE_TRACE(name, proto, args) \ +#define DECLARE_TRACE(name, proto, args) \ static inline void _do_trace_##name(struct tracepoint *tp, proto) \ { } \ static inline void trace_##name(proto) \ @@ -92,8 +101,14 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, { \ return -ENOSYS; \ } \ - static inline void unregister_trace_##name(void (*probe)(proto))\ - { } + static inline int unregister_trace_##name(void (*probe)(proto)) \ + { \ + return -ENOSYS; \ + } + +#define DEFINE_TRACE(name) +#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) +#define EXPORT_TRACEPOINT_SYMBOL(name) static inline void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) @@ -112,6 +127,10 @@ extern int tracepoint_probe_register(const char *name, void *probe); */ extern int tracepoint_probe_unregister(const char *name, void *probe); +extern int tracepoint_probe_register_noupdate(const char *name, void *probe); +extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe); +extern void tracepoint_probe_update_all(void); + struct tracepoint_iter { struct module *module; struct tracepoint *tracepoint; diff --git a/include/linux/tty.h b/include/linux/tty.h index 3b8121d4e36..eaec37c9d83 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -325,7 +325,7 @@ extern struct class *tty_class; * go away */ -extern inline struct tty_struct *tty_kref_get(struct tty_struct *tty) +static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index fec6decfb98..6b58367d145 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, \ set_fs(KERNEL_DS); \ pagefault_disable(); \ - ret = __get_user(retval, (__force typeof(retval) __user *)(addr)); \ + ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ pagefault_enable(); \ set_fs(old_fs); \ ret; \ diff --git a/include/trace/block.h b/include/trace/block.h new file mode 100644 index 00000000000..25c6a1fd5b7 --- /dev/null +++ b/include/trace/block.h @@ -0,0 +1,76 @@ +#ifndef _TRACE_BLOCK_H +#define _TRACE_BLOCK_H + +#include <linux/blkdev.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(block_rq_abort, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); + +DECLARE_TRACE(block_rq_insert, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); + +DECLARE_TRACE(block_rq_issue, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); + +DECLARE_TRACE(block_rq_requeue, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); + +DECLARE_TRACE(block_rq_complete, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); + +DECLARE_TRACE(block_bio_bounce, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); + +DECLARE_TRACE(block_bio_complete, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); + +DECLARE_TRACE(block_bio_backmerge, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); + +DECLARE_TRACE(block_bio_frontmerge, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); + +DECLARE_TRACE(block_bio_queue, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); + +DECLARE_TRACE(block_getrq, + TPPROTO(struct request_queue *q, struct bio *bio, int rw), + TPARGS(q, bio, rw)); + +DECLARE_TRACE(block_sleeprq, + TPPROTO(struct request_queue *q, struct bio *bio, int rw), + TPARGS(q, bio, rw)); + +DECLARE_TRACE(block_plug, + TPPROTO(struct request_queue *q), + TPARGS(q)); + +DECLARE_TRACE(block_unplug_timer, + TPPROTO(struct request_queue *q), + TPARGS(q)); + +DECLARE_TRACE(block_unplug_io, + TPPROTO(struct request_queue *q), + TPARGS(q)); + +DECLARE_TRACE(block_split, + TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), + TPARGS(q, bio, pdu)); + +DECLARE_TRACE(block_remap, + TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from, sector_t to), + TPARGS(q, bio, dev, from, to)); + +#endif diff --git a/include/trace/boot.h b/include/trace/boot.h new file mode 100644 index 00000000000..088ea089e31 --- /dev/null +++ b/include/trace/boot.h @@ -0,0 +1,60 @@ +#ifndef _LINUX_TRACE_BOOT_H +#define _LINUX_TRACE_BOOT_H + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/init.h> + +/* + * Structure which defines the trace of an initcall + * while it is called. + * You don't have to fill the func field since it is + * only used internally by the tracer. + */ +struct boot_trace_call { + pid_t caller; + char func[KSYM_SYMBOL_LEN]; +}; + +/* + * Structure which defines the trace of an initcall + * while it returns. + */ +struct boot_trace_ret { + char func[KSYM_SYMBOL_LEN]; + int result; + unsigned long long duration; /* nsecs */ +}; + +#ifdef CONFIG_BOOT_TRACER +/* Append the traces on the ring-buffer */ +extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn); +extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn); + +/* Tells the tracer that smp_pre_initcall is finished. + * So we can start the tracing + */ +extern void start_boot_trace(void); + +/* Resume the tracing of other necessary events + * such as sched switches + */ +extern void enable_boot_trace(void); + +/* Suspend this tracing. Actually, only sched_switches tracing have + * to be suspended. Initcalls doesn't need it.) + */ +extern void disable_boot_trace(void); +#else +static inline +void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { } + +static inline +void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { } + +static inline void start_boot_trace(void) { } +static inline void enable_boot_trace(void) { } +static inline void disable_boot_trace(void) { } +#endif /* CONFIG_BOOT_TRACER */ + +#endif /* __LINUX_TRACE_BOOT_H */ diff --git a/include/trace/sched.h b/include/trace/sched.h index ad47369d01b..9b2854abf7e 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -4,52 +4,52 @@ #include <linux/sched.h> #include <linux/tracepoint.h> -DEFINE_TRACE(sched_kthread_stop, +DECLARE_TRACE(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t)); -DEFINE_TRACE(sched_kthread_stop_ret, +DECLARE_TRACE(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret)); -DEFINE_TRACE(sched_wait_task, +DECLARE_TRACE(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_wakeup, +DECLARE_TRACE(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_wakeup_new, +DECLARE_TRACE(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_switch, +DECLARE_TRACE(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next)); -DEFINE_TRACE(sched_migrate_task, +DECLARE_TRACE(sched_migrate_task, TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), TPARGS(rq, p, dest_cpu)); -DEFINE_TRACE(sched_process_free, +DECLARE_TRACE(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p)); -DEFINE_TRACE(sched_process_exit, +DECLARE_TRACE(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p)); -DEFINE_TRACE(sched_process_wait, +DECLARE_TRACE(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid)); -DEFINE_TRACE(sched_process_fork, +DECLARE_TRACE(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child)); -DEFINE_TRACE(sched_signal_send, +DECLARE_TRACE(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p)); diff --git a/init/Kconfig b/init/Kconfig index f763762d544..b3782c6d5ed 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -808,6 +808,7 @@ config TRACEPOINTS config MARKERS bool "Activate markers" + depends on TRACEPOINTS help Place an empty function call at each marker site. Can be dynamically changed for a probe function. @@ -916,6 +917,15 @@ config KMOD endif # MODULES +config INIT_ALL_POSSIBLE + bool + help + Back when each arch used to define their own cpu_online_map and + cpu_possible_map, some of them chose to initialize cpu_possible_map + with all 1s, and others with all 0s. When they were centralised, + it was better to provide this option than to break all the archs + and have several arch maintainers persuing me down dark alleys. + config STOP_MACHINE bool default y diff --git a/init/main.c b/init/main.c index 7e117a231af..9d761aa5329 100644 --- a/init/main.c +++ b/init/main.c @@ -63,6 +63,7 @@ #include <linux/signal.h> #include <linux/idr.h> #include <linux/ftrace.h> +#include <trace/boot.h> #include <asm/io.h> #include <asm/bugs.h> @@ -539,6 +540,15 @@ void __init __weak thread_info_cache_init(void) { } +void __init __weak arch_early_irq_init(void) +{ +} + +void __init __weak early_irq_init(void) +{ + arch_early_irq_init(); +} + asmlinkage void __init start_kernel(void) { char * command_line; @@ -603,6 +613,8 @@ asmlinkage void __init start_kernel(void) sort_main_extable(); trap_init(); rcu_init(); + /* init some links before init_ISA_irqs() */ + early_irq_init(); init_IRQ(); pidhash_init(); init_timers(); @@ -703,31 +715,35 @@ core_param(initcall_debug, initcall_debug, bool, 0644); int do_one_initcall(initcall_t fn) { int count = preempt_count(); - ktime_t delta; + ktime_t calltime, delta, rettime; char msgbuf[64]; - struct boot_trace it; + struct boot_trace_call call; + struct boot_trace_ret ret; if (initcall_debug) { - it.caller = task_pid_nr(current); - printk("calling %pF @ %i\n", fn, it.caller); - it.calltime = ktime_get(); + call.caller = task_pid_nr(current); + printk("calling %pF @ %i\n", fn, call.caller); + calltime = ktime_get(); + trace_boot_call(&call, fn); + enable_boot_trace(); } - it.result = fn(); + ret.result = fn(); if (initcall_debug) { - it.rettime = ktime_get(); - delta = ktime_sub(it.rettime, it.calltime); - it.duration = (unsigned long long) delta.tv64 >> 10; + disable_boot_trace(); + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10; + trace_boot_ret(&ret, fn); printk("initcall %pF returned %d after %Ld usecs\n", fn, - it.result, it.duration); - trace_boot(&it, fn); + ret.result, ret.duration); } msgbuf[0] = 0; - if (it.result && it.result != -ENODEV && initcall_debug) - sprintf(msgbuf, "error code %d ", it.result); + if (ret.result && ret.result != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", ret.result); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); @@ -741,7 +757,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned with %s\n", fn, msgbuf); } - return it.result; + return ret.result; } @@ -882,7 +898,7 @@ static int __init kernel_init(void * unused) * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ - stop_boot_trace(); + init_post(); return 0; } diff --git a/kernel/Makefile b/kernel/Makefile index 19fad003b19..6a212b842d8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -19,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -90,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ea32e8d68b..bae131a1211 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,19 +24,20 @@ cpumask_t cpu_present_map __read_mostly; EXPORT_SYMBOL(cpu_present_map); -#ifndef CONFIG_SMP - /* * Represents all cpu's that are currently online. */ -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); +#ifdef CONFIG_INIT_ALL_POSSIBLE cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +#else +cpumask_t cpu_possible_map __read_mostly; +#endif EXPORT_SYMBOL(cpu_possible_map); -#else /* CONFIG_SMP */ - +#ifdef CONFIG_SMP /* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 96c0ba13b8c..39c1a4c1c5a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (!*buf) { cpus_clear(trialcs.cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs.cpus_allowed); + retval = cpulist_parse(buf, &trialcs.cpus_allowed); if (retval < 0) return retval; @@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) mask = cs->cpus_allowed; mutex_unlock(&callback_mutex); - return cpulist_scnprintf(page, PAGE_SIZE, mask); + return cpulist_scnprintf(page, PAGE_SIZE, &mask); } static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) diff --git a/kernel/exit.c b/kernel/exit.c index 2d8be7ebb0f..61ba5b4b10c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,6 +53,10 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> +DEFINE_TRACE(sched_process_free); +DEFINE_TRACE(sched_process_exit); +DEFINE_TRACE(sched_process_wait); + static void exit_mm(struct task_struct * tsk); static inline int task_detached(struct task_struct *p) @@ -1123,7 +1127,6 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -1321,10 +1324,10 @@ static int wait_task_zombie(struct task_struct *p, int options, * group, which consolidates times for all threads in the * group including the group leader. */ + thread_group_cputime(p, &cputime); spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; - thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, cputime_add(cputime.utime, diff --git a/kernel/extable.c b/kernel/extable.c index a26cb2e1702..e136ed8d82b 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -17,6 +17,7 @@ */ #include <linux/module.h> #include <linux/init.h> +#include <linux/ftrace.h> #include <asm/uaccess.h> #include <asm/sections.h> @@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -int core_kernel_text(unsigned long addr) +__notrace_funcgraph int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr) return 0; } -int __kernel_text_address(unsigned long addr) +__notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; @@ -66,3 +67,19 @@ int kernel_text_address(unsigned long addr) return 1; return module_text_address(addr) != NULL; } + +/* + * On some architectures (PPC64, IA64) function pointers + * are actually only tokens to some data that then holds the + * real function address. As a result, to find if a function + * pointer is part of the kernel text, we need to do some + * special dereferencing first. + */ +int func_ptr_is_kernel_text(void *ptr) +{ + unsigned long addr; + addr = (unsigned long) dereference_function_descriptor(ptr); + if (core_kernel_text(addr)) + return 1; + return module_text_address(addr) != NULL; +} diff --git a/kernel/fork.c b/kernel/fork.c index 495da2e9a8b..7b93da72d4a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -47,6 +47,7 @@ #include <linux/mount.h> #include <linux/audit.h> #include <linux/memcontrol.h> +#include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/acct.h> @@ -80,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +DEFINE_TRACE(sched_process_fork); + int nr_processes(void) { int cpu; @@ -137,6 +140,7 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); + ftrace_graph_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1136,6 +1140,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } + ftrace_graph_init_task(p); + p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1144,7 +1150,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) - goto bad_fork_free_pid; + goto bad_fork_free_graph; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1237,7 +1243,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_free_graph; } if (clone_flags & CLONE_THREAD) { @@ -1274,6 +1280,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_post_fork(p); return p; +bad_fork_free_graph: + ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); diff --git a/kernel/futex.c b/kernel/futex.c index 8af10027514..e10c5c8786a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -123,24 +123,6 @@ struct futex_hash_bucket { static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; /* - * Take mm->mmap_sem, when futex is shared - */ -static inline void futex_lock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - down_read(fshared); -} - -/* - * Release mm->mmap_sem, when the futex is shared - */ -static inline void futex_unlock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - up_read(fshared); -} - -/* * We hash on the keys returned from get_futex_key (see below). */ static struct futex_hash_bucket *hash_futex(union futex_key *key) @@ -161,6 +143,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } +/* + * Take a reference to the resource addressed by a key. + * Can be called while holding spinlocks. + * + */ +static void get_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + atomic_inc(&key->shared.inode->i_count); + break; + case FUT_OFF_MMSHARED: + atomic_inc(&key->private.mm->mm_count); + break; + } +} + +/* + * Drop a reference to the resource addressed by a key. + * The hash bucket spinlock must not be held. + */ +static void drop_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + iput(key->shared.inode); + break; + case FUT_OFF_MMSHARED: + mmdrop(key->private.mm); + break; + } +} + /** * get_futex_key - Get parameters which are the keys for a futex. * @uaddr: virtual address of the futex @@ -179,12 +200,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) * For other futexes, it points to ¤t->mm->mmap_sem and * caller must have taken the reader lock. but NOT any spinlocks. */ -static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, - union futex_key *key) +static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; struct page *page; int err; @@ -208,100 +227,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, return -EFAULT; key->private.mm = mm; key->private.address = address; + get_futex_key_refs(key); return 0; } - /* - * The futex is hashed differently depending on whether - * it's in a shared or private mapping. So check vma first. - */ - vma = find_extend_vma(mm, address); - if (unlikely(!vma)) - return -EFAULT; - /* - * Permissions. - */ - if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) - return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; +again: + err = get_user_pages_fast(address, 1, 0, &page); + if (err < 0) + return err; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + put_page(page); + goto again; + } /* * Private mappings are handled in a simple way. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to - * the object not the particular process. Therefore we use - * VM_MAYSHARE here, not VM_SHARED which is restricted to shared - * mappings of _writable_ handles. + * the object not the particular process. */ - if (likely(!(vma->vm_flags & VM_MAYSHARE))) { - key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ + if (PageAnon(page)) { + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; - return 0; + } else { + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.inode = page->mapping->host; + key->shared.pgoff = page->index; } - /* - * Linear file mappings are also simple. - */ - key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ - if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) - + vma->vm_pgoff); - return 0; - } + get_futex_key_refs(key); - /* - * We could walk the page table to read the non-linear - * pte, and get the page index without fetching the page - * from swap. But that's a lot of code to duplicate here - * for a rare case, so we simply fetch the page. - */ - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - if (err >= 0) { - key->shared.pgoff = - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - put_page(page); - return 0; - } - return err; -} - -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (key->both.ptr == NULL) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); - break; - case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); - break; - } + unlock_page(page); + put_page(page); + return 0; } -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. - */ -static void drop_futex_key_refs(union futex_key *key) +static inline +void put_futex_key(int fshared, union futex_key *key) { - if (!key->both.ptr) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } + drop_futex_key_refs(key); } static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) @@ -328,10 +297,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) /* * Fault handling. - * if fshared is non NULL, current->mm->mmap_sem is already held */ -static int futex_handle_fault(unsigned long address, - struct rw_semaphore *fshared, int attempt) +static int futex_handle_fault(unsigned long address, int attempt) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; @@ -340,8 +307,7 @@ static int futex_handle_fault(unsigned long address, if (attempt > 2) return ret; - if (!fshared) - down_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (vma && address >= vma->vm_start && (vma->vm_flags & VM_WRITE)) { @@ -361,8 +327,7 @@ static int futex_handle_fault(unsigned long address, current->min_flt++; } } - if (!fshared) - up_read(&mm->mmap_sem); + up_read(&mm->mmap_sem); return ret; } @@ -385,6 +350,7 @@ static int refill_pi_state_cache(void) /* pi_mutex gets initialized later */ pi_state->owner = NULL; atomic_set(&pi_state->refcount, 1); + pi_state->key = FUTEX_KEY_INIT; current->pi_state_cache = pi_state; @@ -462,7 +428,7 @@ void exit_pi_state_list(struct task_struct *curr) struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; if (!futex_cmpxchg_enabled) return; @@ -719,20 +685,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, - int nr_wake, u32 bitset) +static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; struct plist_head *head; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; int ret; if (!bitset) return -EINVAL; - futex_lock_mm(fshared); - ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -760,7 +723,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: - futex_unlock_mm(fshared); + put_futex_key(fshared, &key); return ret; } @@ -769,19 +732,16 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { - union futex_key key1, key2; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; int ret, op_ret, attempt = 0; retryfull: - futex_lock_mm(fshared); - ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -826,18 +786,12 @@ retry: */ if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr2, - fshared, attempt); + attempt); if (ret) goto out; goto retry; } - /* - * If we would have faulted, release mmap_sem, - * fault it in and start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(dummy, uaddr2); if (ret) return ret; @@ -873,7 +827,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: - futex_unlock_mm(fshared); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); return ret; } @@ -882,19 +837,16 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, +static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { - union futex_key key1, key2; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; retry: - futex_lock_mm(fshared); - ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -917,12 +869,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, if (hb1 != hb2) spin_unlock(&hb2->lock); - /* - * If we would have faulted, release mmap_sem, fault - * it in and start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(curval, uaddr1); if (!ret) @@ -974,7 +920,8 @@ out_unlock: drop_futex_key_refs(&key1); out: - futex_unlock_mm(fshared); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); return ret; } @@ -1096,8 +1043,7 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, - struct rw_semaphore *fshared) + struct task_struct *newowner, int fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@ -1176,7 +1122,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); + ret = futex_handle_fault((unsigned long)uaddr, attempt++); spin_lock(q->lock_ptr); @@ -1200,7 +1146,7 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, +static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset) { struct task_struct *curr = current; @@ -1218,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; q.bitset = bitset; retry: - futex_lock_mm(fshared); - + q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1251,12 +1196,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret)) { queue_unlock(&q, hb); - /* - * If we would have faulted, release mmap_sem, fault it in and - * start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret) @@ -1271,12 +1210,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, queue_me(&q, hb); /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - - /* * There might have been scheduling since the queue_me(), as we * cannot hold a spinlock across the get_user() in case it * faults, and we cannot just set TASK_INTERRUPTIBLE state when @@ -1363,7 +1296,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: - futex_unlock_mm(fshared); + put_futex_key(fshared, &q.key); return ret; } @@ -1371,13 +1304,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; - struct rw_semaphore *fshared = NULL; + int fshared = 0; ktime_t t; t.tv64 = restart->futex.time; restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) - fshared = ¤t->mm->mmap_sem; + fshared = 1; return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, restart->futex.bitset); } @@ -1389,7 +1322,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, +static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -1412,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; retry: - futex_lock_mm(fshared); - + q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1502,7 +1434,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * exit to complete. */ queue_unlock(&q, hb); - futex_unlock_mm(fshared); cond_resched(); goto retry; @@ -1534,12 +1465,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, */ queue_me(&q, hb); - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - WARN_ON(!q.pi_state); /* * Block on the PI mutex: @@ -1552,7 +1477,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, ret = ret ? 0 : -EWOULDBLOCK; } - futex_lock_mm(fshared); spin_lock(q.lock_ptr); if (!ret) { @@ -1618,7 +1542,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* Unqueue and drop the lock */ unqueue_me_pi(&q); - futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); @@ -1628,7 +1551,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: - futex_unlock_mm(fshared); + put_futex_key(fshared, &q.key); if (to) destroy_hrtimer_on_stack(&to->timer); return ret; @@ -1645,15 +1568,12 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); + ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out_release_sem; goto retry_unlocked; } - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; @@ -1668,13 +1588,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) +static int futex_unlock_pi(u32 __user *uaddr, int fshared) { struct futex_hash_bucket *hb; struct futex_q *this, *next; u32 uval; struct plist_head *head; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; int ret, attempt = 0; retry: @@ -1685,10 +1605,6 @@ retry: */ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - /* - * First take all the futex related locks: - */ - futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -1747,7 +1663,7 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); out: - futex_unlock_mm(fshared); + put_futex_key(fshared, &key); return ret; @@ -1763,16 +1679,13 @@ pi_faulted: spin_unlock(&hb->lock); if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); + ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out; uval = 0; goto retry_unlocked; } - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; @@ -1898,8 +1811,7 @@ retry: * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, &curr->mm->mmap_sem, 1, - FUTEX_BITSET_MATCH_ANY); + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } @@ -1995,10 +1907,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, { int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; - struct rw_semaphore *fshared = NULL; + int fshared = 0; if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = ¤t->mm->mmap_sem; + fshared = 1; switch (cmd) { case FUTEX_WAIT: diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 681c52dbfe2..4dd5b1edac9 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o +obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index cc0f7321b8c..650ce4102a6 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -40,6 +40,9 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for_each_irq_desc_reverse(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* @@ -68,6 +71,9 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; @@ -86,6 +92,9 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); status = desc->status; @@ -124,6 +133,9 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); status = desc->status; @@ -166,6 +178,9 @@ int probe_irq_off(unsigned long val) unsigned int status; for_each_irq_desc(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); status = desc->status; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 10b5092e9bf..b343deedae9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -24,9 +24,10 @@ */ void dynamic_irq_init(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; unsigned long flags; + desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; @@ -45,7 +46,7 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpus_setall(desc->affinity); + cpumask_setall(&desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } @@ -352,6 +353,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); + desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -429,6 +431,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); + desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -465,12 +468,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); + desc = irq_remap_to_desc(irq, desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->chip->ack(irq); + desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -531,8 +536,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) + if (desc->chip->eoi) { desc->chip->eoi(irq); + desc = irq_remap_to_desc(irq, desc); + } } void @@ -567,8 +574,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) + if (desc->chip != &no_irq_chip) { mask_ack_irq(desc, irq); + desc = irq_remap_to_desc(irq, desc); + } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c815b42d0f5..f1a23069c20 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -15,9 +15,16 @@ #include <linux/random.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/rculist.h> +#include <linux/hash.h> #include "internals.h" +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +struct lock_class_key irq_desc_lock_class; + /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -49,6 +56,155 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); +void __init __attribute__((weak)) arch_early_irq_init(void) +{ +} + +#ifdef CONFIG_SPARSE_IRQ +static struct irq_desc irq_desc_init = { + .irq = -1, + .status = IRQ_DISABLED, + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), +#ifdef CONFIG_SMP + .affinity = CPU_MASK_ALL +#endif +}; + +void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +{ + unsigned long bytes; + char *ptr; + int node; + + /* Compute how many bytes we need per irq and allocate them */ + bytes = nr * sizeof(unsigned int); + + node = cpu_to_node(cpu); + ptr = kzalloc_node(bytes, GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node); + + if (ptr) + desc->kstat_irqs = (unsigned int *)ptr; +} + +void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu) +{ +} + +static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +{ + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); + desc->irq = irq; +#ifdef CONFIG_SMP + desc->cpu = cpu; +#endif + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_kstat_irqs(desc, cpu, nr_cpu_ids); + if (!desc->kstat_irqs) { + printk(KERN_ERR "can not alloc kstat_irqs\n"); + BUG_ON(1); + } + arch_init_chip_data(desc, cpu); +} + +/* + * Protect the sparse_irqs: + */ +DEFINE_SPINLOCK(sparse_irq_lock); + +struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; + +static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { + [0 ... NR_IRQS_LEGACY-1] = { + .irq = -1, + .status = IRQ_DISABLED, + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), +#ifdef CONFIG_SMP + .affinity = CPU_MASK_ALL +#endif + } +}; + +/* FIXME: use bootmem alloc ...*/ +static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; + +void __init early_irq_init(void) +{ + struct irq_desc *desc; + int legacy_count; + int i; + + desc = irq_desc_legacy; + legacy_count = ARRAY_SIZE(irq_desc_legacy); + + for (i = 0; i < legacy_count; i++) { + desc[i].irq = i; + desc[i].kstat_irqs = kstat_irqs_legacy[i]; + + irq_desc_ptrs[i] = desc + i; + } + + for (i = legacy_count; i < NR_IRQS; i++) + irq_desc_ptrs[i] = NULL; + + arch_early_irq_init(); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; +} + +struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc; + unsigned long flags; + int node; + + if (irq >= NR_IRQS) { + printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", + irq, NR_IRQS); + WARN_ON(1); + return NULL; + } + + desc = irq_desc_ptrs[irq]; + if (desc) + return desc; + + spin_lock_irqsave(&sparse_irq_lock, flags); + + /* We have to check it to avoid races with another CPU */ + desc = irq_desc_ptrs[irq]; + if (desc) + goto out_unlock; + + node = cpu_to_node(cpu); + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", + irq, cpu, node); + if (!desc) { + printk(KERN_ERR "can not alloc irq_desc\n"); + BUG_ON(1); + } + init_one_irq_desc(irq, desc, cpu); + + irq_desc_ptrs[irq] = desc; + +out_unlock: + spin_unlock_irqrestore(&sparse_irq_lock, flags); + + return desc; +} + +#else + struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -62,6 +218,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; +#endif + /* * What should we do if we get a hw irq event on an illegal vector? * Each architecture has to answer this themself. @@ -179,8 +337,11 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) + if (desc->chip->ack) { desc->chip->ack(irq); + /* get new one */ + desc = irq_remap_to_desc(irq, desc); + } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -191,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) + if (desc->chip->ack) { desc->chip->ack(irq); + desc = irq_remap_to_desc(irq, desc); + } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested @@ -261,17 +424,28 @@ out: #ifdef CONFIG_TRACE_IRQFLAGS -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; - void early_init_irq_lock_class(void) { +#ifndef CONFIG_SPARSE_IRQ struct irq_desc *desc; int i; - for_each_irq_desc(i, desc) + for_each_irq_desc(i, desc) { + if (!desc) + continue; + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + } +#endif +} +#endif + +#ifdef CONFIG_SPARSE_IRQ +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc->kstat_irqs[cpu]; } #endif +EXPORT_SYMBOL(kstat_irqs_cpu); + diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 64c1c7253da..e6d0a43cc12 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); +extern struct lock_class_key irq_desc_lock_class; +extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern spinlock_t sparse_irq_lock; +extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 801addda3c4..10ad2f87ed9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq) * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = cpumask; + cpumask_copy(&desc->pending_mask, cpumask); } #else - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) */ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { - cpumask_t mask; - if (!irq_can_set_affinity(irq)) return 0; - cpus_and(mask, cpu_online_map, irq_default_affinity); - /* * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpus_intersects(desc->affinity, cpu_online_map)) - mask = desc->affinity; + if (cpumask_any_and(&desc->affinity, cpu_online_mask) + < nr_cpu_ids) + goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - desc->affinity = mask; - desc->chip->set_affinity(irq, mask); + cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); +set_affinity: + desc->chip->set_affinity(irq, &desc->affinity); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9db681d9581..bd72329e630 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,7 +4,6 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); - cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -19,7 +18,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(desc->pending_mask))) + if (unlikely(cpumask_empty(&desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -27,8 +26,6 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, desc->pending_mask, cpu_online_map); - /* * If there was a valid mask to work with, please * do the disable, re-program, enable sequence. @@ -41,10 +38,13 @@ void move_masked_irq(int irq) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(!cpus_empty(tmp))) { - desc->chip->set_affinity(irq,tmp); + if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) + < nr_cpu_ids)) { + cpumask_and(&desc->affinity, + &desc->pending_mask, cpu_online_mask); + desc->chip->set_affinity(irq, &desc->affinity); } - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); } void move_native_irq(int irq) diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c new file mode 100644 index 00000000000..0178e229699 --- /dev/null +++ b/kernel/irq/numa_migrate.c @@ -0,0 +1,127 @@ +/* + * linux/kernel/irq/handle.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the core interrupt handling code. + * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +#include "internals.h" + +static void init_copy_kstat_irqs(struct irq_desc *old_desc, + struct irq_desc *desc, + int cpu, int nr) +{ + unsigned long bytes; + + init_kstat_irqs(desc, cpu, nr); + + if (desc->kstat_irqs != old_desc->kstat_irqs) { + /* Compute how many bytes we need per irq and allocate them */ + bytes = nr * sizeof(unsigned int); + + memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes); + } +} + +static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) +{ + if (old_desc->kstat_irqs == desc->kstat_irqs) + return; + + kfree(old_desc->kstat_irqs); + old_desc->kstat_irqs = NULL; +} + +static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) +{ + memcpy(desc, old_desc, sizeof(struct irq_desc)); + desc->cpu = cpu; + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + arch_init_copy_chip_data(old_desc, desc, cpu); +} + +static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) +{ + free_kstat_irqs(old_desc, desc); + arch_free_chip_data(old_desc, desc); +} + +static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, + int cpu) +{ + struct irq_desc *desc; + unsigned int irq; + unsigned long flags; + int node; + + irq = old_desc->irq; + + spin_lock_irqsave(&sparse_irq_lock, flags); + + /* We have to check it to avoid races with another CPU */ + desc = irq_desc_ptrs[irq]; + + if (desc && old_desc != desc) + goto out_unlock; + + node = cpu_to_node(cpu); + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + printk(KERN_DEBUG " move irq_desc for %d to cpu %d node %d\n", + irq, cpu, node); + if (!desc) { + printk(KERN_ERR "can not get new irq_desc for moving\n"); + /* still use old one */ + desc = old_desc; + goto out_unlock; + } + init_copy_one_irq_desc(irq, old_desc, desc, cpu); + + irq_desc_ptrs[irq] = desc; + + /* free the old one */ + free_one_irq_desc(old_desc, desc); + kfree(old_desc); + +out_unlock: + spin_unlock_irqrestore(&sparse_irq_lock, flags); + + return desc; +} + +struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +{ + int old_cpu; + int node, old_node; + + /* those all static, do move them */ + if (desc->irq < NR_IRQS_LEGACY) + return desc; + + old_cpu = desc->cpu; + printk(KERN_DEBUG + "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu); + if (old_cpu != cpu) { + node = cpu_to_node(cpu); + old_node = cpu_to_node(old_cpu); + if (old_node != node) + desc = __real_move_irq_desc(desc, cpu); + else + desc->cpu = cpu; + } + + return desc; +} + diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d257e7d6a8a..d2c0e5ee53c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; - cpumask_t new_value; + cpumask_var_t new_value; int err; if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto free_cpumask; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(*new_value)) { + err = -EINVAL; + goto free_cpumask; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!cpus_intersects(new_value, cpu_online_map)) + if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity_usr(irq) ? -EINVAL : count; - - irq_set_affinity(irq, new_value); + err = irq_select_affinity_usr(irq) ? -EINVAL : count; + } else { + irq_set_affinity(irq, new_value); + err = count; + } - return count; +free_cpumask: + free_cpumask_var(new_value); + return err; } static int irq_affinity_proc_open(struct inode *inode, struct file *file) @@ -95,7 +104,7 @@ static ssize_t default_affinity_write(struct file *file, cpumask_t new_value; int err; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; @@ -243,7 +252,11 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for_each_irq_desc(irq, desc) + for_each_irq_desc(irq, desc) { + if (!desc) + continue; + register_irq_proc(irq, desc); + } } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd364c11e56..3738107531f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -91,6 +91,9 @@ static int misrouted_irq(int irq) int i, ok = 0; for_each_irq_desc(i, desc) { + if (!desc) + continue; + if (!i) continue; @@ -112,6 +115,8 @@ static void poll_spurious_irqs(unsigned long dummy) for_each_irq_desc(i, desc) { unsigned int status; + if (!desc) + continue; if (!i) continue; diff --git a/kernel/kthread.c b/kernel/kthread.c index 8e7a7ce3ed0..4fbc456f393 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; +DEFINE_TRACE(sched_kthread_stop); +DEFINE_TRACE(sched_kthread_stop_ret); + struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 46a404173db..c4c7df23f8c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -25,6 +25,7 @@ * Thanks to Arjan van de Ven for coming up with the initial idea of * mapping lock dependencies runtime. */ +#define DISABLE_BRANCH_PROFILING #include <linux/mutex.h> #include <linux/sched.h> #include <linux/delay.h> @@ -136,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); -static int lock_contention_point(struct lock_class *class, unsigned long ip) +static int lock_point(unsigned long points[], unsigned long ip) { int i; - for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { - if (class->contention_point[i] == 0) { - class->contention_point[i] = ip; + for (i = 0; i < LOCKSTAT_POINTS; i++) { + if (points[i] == 0) { + points[i] = ip; break; } - if (class->contention_point[i] == ip) + if (points[i] == ip) break; } @@ -185,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class) for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) stats.contention_point[i] += pcs->contention_point[i]; + for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) + stats.contending_point[i] += pcs->contending_point[i]; + lock_time_add(&pcs->read_waittime, &stats.read_waittime); lock_time_add(&pcs->write_waittime, &stats.write_waittime); @@ -209,6 +213,7 @@ void clear_lock_stats(struct lock_class *class) memset(cpu_stats, 0, sizeof(struct lock_class_stats)); } memset(class->contention_point, 0, sizeof(class->contention_point)); + memset(class->contending_point, 0, sizeof(class->contending_point)); } static struct lock_class_stats *get_lock_stats(struct lock_class *class) @@ -2999,7 +3004,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - int i, point; + int i, contention_point, contending_point; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) @@ -3023,18 +3028,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) found_it: hlock->waittime_stamp = sched_clock(); - point = lock_contention_point(hlock_class(hlock), ip); + contention_point = lock_point(hlock_class(hlock)->contention_point, ip); + contending_point = lock_point(hlock_class(hlock)->contending_point, + lock->ip); stats = get_lock_stats(hlock_class(hlock)); - if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[point]++; + if (contention_point < LOCKSTAT_POINTS) + stats->contention_point[contention_point]++; + if (contending_point < LOCKSTAT_POINTS) + stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); } static void -__lock_acquired(struct lockdep_map *lock) +__lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; @@ -3083,6 +3092,7 @@ found_it: put_lock_stats(stats); lock->cpu = cpu; + lock->ip = ip; } void lock_contended(struct lockdep_map *lock, unsigned long ip) @@ -3104,7 +3114,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -void lock_acquired(struct lockdep_map *lock) +void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; @@ -3117,7 +3127,7 @@ void lock_acquired(struct lockdep_map *lock) raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - __lock_acquired(lock); + __lock_acquired(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 20dbcbf9c7d..13716b81389 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length) static void snprint_time(char *buf, size_t bufsiz, s64 nr) { - unsigned long rem; + s64 div; + s32 rem; nr += 5; /* for display rounding */ - rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); + div = div_s64_rem(nr, 1000, &rem); + snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) @@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (stats->read_holdtime.nr) namelen += 2; - for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + for (i = 0; i < LOCKSTAT_POINTS; i++) { char sym[KSYM_SYMBOL_LEN]; char ip[32]; @@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) stats->contention_point[i], ip, sym); } + for (i = 0; i < LOCKSTAT_POINTS; i++) { + char sym[KSYM_SYMBOL_LEN]; + char ip[32]; + + if (class->contending_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + sprint_symbol(sym, class->contending_point[i]); + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contending_point[i]); + seq_printf(m, "%40s %14lu %29s %s\n", name, + stats->contending_point[i], + ip, sym); + } if (i) { seq_puts(m, "\n"); seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); @@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) static void seq_header(struct seq_file *m) { - seq_printf(m, "lock_stat version 0.2\n"); + seq_printf(m, "lock_stat version 0.3\n"); seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " "%14s %14s\n", diff --git a/kernel/marker.c b/kernel/marker.c index e9c6b2bc940..ea54f264786 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex); */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /* * Note about RCU : @@ -64,11 +65,10 @@ struct marker_entry { void *oldptr; int rcu_pending; unsigned char ptype:1; + unsigned char format_allocated:1; char name[0]; /* Contains name'\0'format'\0' */ }; -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - /** * __mark_empty_function - Empty probe callback * @probe_private: probe private data @@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(void *probe_private, void *call_private, +notrace void __mark_empty_function(void *probe_private, void *call_private, const char *fmt, va_list *args) { } @@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, ...) +notrace void marker_probe_cb(const struct marker *mdata, + void *call_private, ...) { va_list args; char ptype; @@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) * sure the teardown of the callbacks can be done correctly when they * are in modules and they insure RCU read coherency. */ - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) va_end(args); } } - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); } EXPORT_SYMBOL_GPL(marker_probe_cb); @@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) +static notrace void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, ...) { va_list args; /* not initialized */ char ptype; - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); } - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); } -EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); static void free_old_closure(struct rcu_head *head) { @@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format) e->single.probe_private = NULL; e->multi = NULL; e->ptype = 0; + e->format_allocated = 0; e->refcount = 0; e->rcu_pending = 0; hlist_add_head(&e->hlist, head); @@ -447,6 +449,8 @@ static int remove_marker(const char *name) if (e->single.func != __mark_empty_function) return -EBUSY; hlist_del(&e->hlist); + if (e->format_allocated) + kfree(e->format); /* Make sure the call_rcu has been executed */ if (e->rcu_pending) rcu_barrier_sched(); @@ -457,57 +461,34 @@ static int remove_marker(const char *name) /* * Set the mark_entry format to the format found in the element. */ -static int marker_set_format(struct marker_entry **entry, const char *format) +static int marker_set_format(struct marker_entry *entry, const char *format) { - struct marker_entry *e; - size_t name_len = strlen((*entry)->name) + 1; - size_t format_len = strlen(format) + 1; - - - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) + entry->format = kstrdup(format, GFP_KERNEL); + if (!entry->format) return -ENOMEM; - memcpy(&e->name[0], (*entry)->name, name_len); - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - e->single = (*entry)->single; - e->multi = (*entry)->multi; - e->ptype = (*entry)->ptype; - e->refcount = (*entry)->refcount; - e->rcu_pending = 0; - hlist_add_before(&e->hlist, &(*entry)->hlist); - hlist_del(&(*entry)->hlist); - /* Make sure the call_rcu has been executed */ - if ((*entry)->rcu_pending) - rcu_barrier_sched(); - kfree(*entry); - *entry = e; + entry->format_allocated = 1; + trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); + entry->name, entry->format); return 0; } /* * Sets the probe callback corresponding to one marker. */ -static int set_marker(struct marker_entry **entry, struct marker *elem, +static int set_marker(struct marker_entry *entry, struct marker *elem, int active) { - int ret; - WARN_ON(strcmp((*entry)->name, elem->name) != 0); + int ret = 0; + WARN_ON(strcmp(entry->name, elem->name) != 0); - if ((*entry)->format) { - if (strcmp((*entry)->format, elem->format) != 0) { + if (entry->format) { + if (strcmp(entry->format, elem->format) != 0) { printk(KERN_NOTICE "Format mismatch for probe %s " "(%s), marker (%s)\n", - (*entry)->name, - (*entry)->format, + entry->name, + entry->format, elem->format); return -EPERM; } @@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, * pass from a "safe" callback (with argument) to an "unsafe" * callback (does not set arguments). */ - elem->call = (*entry)->call; + elem->call = entry->call; /* * Sanity check : * We only update the single probe private data when the ptr is * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) */ WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private - != (*entry)->single.probe_private && - !elem->ptype); - elem->single.probe_private = (*entry)->single.probe_private; + && elem->single.probe_private != entry->single.probe_private + && !elem->ptype); + elem->single.probe_private = entry->single.probe_private; /* * Make sure the private data is valid when we update the * single probe ptr. */ smp_wmb(); - elem->single.func = (*entry)->single.func; + elem->single.func = entry->single.func; /* * We also make sure that the new probe callbacks array is consistent * before setting a pointer to it. */ - rcu_assign_pointer(elem->multi, (*entry)->multi); + rcu_assign_pointer(elem->multi, entry->multi); /* * Update the function or multi probe array pointer before setting the * ptype. */ smp_wmb(); - elem->ptype = (*entry)->ptype; + elem->ptype = entry->ptype; + + if (elem->tp_name && (active ^ elem->state)) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + + if (active) { + /* + * try_module_get should always succeed because we hold + * lock_module() to get the tp_cb address. + */ + ret = try_module_get(__module_text_address( + (unsigned long)elem->tp_cb)); + BUG_ON(!ret); + ret = tracepoint_probe_register_noupdate( + elem->tp_name, + elem->tp_cb); + } else { + ret = tracepoint_probe_unregister_noupdate( + elem->tp_name, + elem->tp_cb); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address( + (unsigned long)elem->tp_cb)); + } + } elem->state = active; - return 0; + return ret; } /* @@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, */ static void disable_marker(struct marker *elem) { + int ret; + /* leave "call" as is. It is known statically. */ + if (elem->tp_name && elem->state) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + ret = tracepoint_probe_unregister_noupdate(elem->tp_name, + elem->tp_cb); + WARN_ON(ret); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address((unsigned long)elem->tp_cb)); + } elem->state = 0; elem->single.func = __mark_empty_function; /* Update the function before setting the ptype */ @@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin, for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); if (mark_entry) { - set_marker(&mark_entry, iter, - !!mark_entry->refcount); + set_marker(mark_entry, iter, !!mark_entry->refcount); /* * ignore error, continue */ @@ -629,6 +656,7 @@ static void marker_update_probes(void) marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. */ module_update_markers(); + tracepoint_probe_update_all(); } /** @@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format, ret = PTR_ERR(entry); } else if (format) { if (!entry->format) - ret = marker_set_format(&entry, format); + ret = marker_set_format(entry, format); else if (strcmp(entry->format, format)) ret = -EPERM; } @@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format, goto end; } mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker(name); - WARN_ON(!entry); + if (!entry) + goto end; if (entry->rcu_pending) rcu_barrier_sched(); entry->oldptr = old; @@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name, rcu_barrier_sched(); old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) @@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, rcu_barrier_sched(); old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker_from_private_data(probe, probe_private); - WARN_ON(!entry); + if (!entry) + goto end; if (entry->rcu_pending) rcu_barrier_sched(); entry->oldptr = old; @@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, if (!e->ptype) { if (num == 0 && e->single.func == probe) return e->single.probe_private; - else - break; } else { struct marker_probe_closure *closure; int match = 0; @@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, return closure[i].probe_private; } } + break; } } return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(marker_get_private_data); + +#ifdef CONFIG_MODULES + +int marker_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + case MODULE_STATE_GOING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + } + return 0; +} + +struct notifier_block marker_module_nb = { + .notifier_call = marker_module_notify, + .priority = 0, +}; + +static int init_markers(void) +{ + return register_module_notifier(&marker_module_nb); +} +__initcall(init_markers); + +#endif /* CONFIG_MODULES */ diff --git a/kernel/module.c b/kernel/module.c index 1f4cc00e0c2..dd2a54155b5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2184,24 +2184,15 @@ static noinline struct module *load_module(void __user *umod, struct mod_debug *debug; unsigned int num_debug; -#ifdef CONFIG_MARKERS - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); -#endif debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); dynamic_printk_setup(debug, num_debug); - -#ifdef CONFIG_TRACEPOINTS - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); -#endif } /* sechdrs[0].sh_size is always zero */ mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", sizeof(*mseg), &num_mcount); - ftrace_init_module(mseg, mseg + num_mcount); + ftrace_init_module(mod, mseg, mseg + num_mcount); err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2713,7 +2704,7 @@ int is_module_address(unsigned long addr) /* Is this a valid kernel address? */ -struct module *__module_text_address(unsigned long addr) +__notrace_funcgraph struct module *__module_text_address(unsigned long addr) { struct module *mod; diff --git a/kernel/mutex.c b/kernel/mutex.c index 12c779dc65d..4f45d4b658e 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static void noinline __sched +static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count); /*** @@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /*** * mutex_unlock - release the mutex @@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } done: - lock_acquired(&lock->dep_map); + lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); @@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static noinline void +static __used noinline void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static noinline void __sched +static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); diff --git a/kernel/notifier.c b/kernel/notifier.c index 4282c0a40a5..61d5aa5eced 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, while (nb && nr_to_call) { next_nb = rcu_dereference(nb->next); + +#ifdef CONFIG_DEBUG_NOTIFIERS + if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { + WARN(1, "Invalid notifier called!"); + nb = next_nb; + continue; + } +#endif ret = nb->notifier_call(nb, val, v); if (nr_calls) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 4e5288a831d..157de3a4783 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -58,21 +58,21 @@ void thread_group_cputime( struct task_struct *tsk, struct task_cputime *times) { - struct signal_struct *sig; + struct task_cputime *totals, *tot; int i; - struct task_cputime *tot; - sig = tsk->signal; - if (unlikely(!sig) || !sig->cputime.totals) { + totals = tsk->signal->cputime.totals; + if (!totals) { times->utime = tsk->utime; times->stime = tsk->stime; times->sum_exec_runtime = tsk->se.sum_exec_runtime; return; } + times->stime = times->utime = cputime_zero; times->sum_exec_runtime = 0; for_each_possible_cpu(i) { - tot = per_cpu_ptr(tsk->signal->cputime.totals, i); + tot = per_cpu_ptr(totals, i); times->utime = cputime_add(times->utime, tot->utime); times->stime = cputime_add(times->stime, tot->stime); times->sum_exec_runtime += tot->sum_exec_runtime; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index c9d74083746..f77d3819ef5 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,7 +22,6 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> -#include <linux/ftrace.h> #include "power.h" @@ -257,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error, ftrace_save; + int error; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -370,11 +367,10 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error, ftrace_save; + int error; pm_prepare_console(); suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: - __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error, ftrace_save; + int error; if (!hibernation_ops) return -ENOSYS; @@ -417,7 +412,6 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -452,7 +446,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index b8f7ce9473e..613f16941b8 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -22,7 +22,6 @@ #include <linux/freezer.h> #include <linux/vmstat.h> #include <linux/syscalls.h> -#include <linux/ftrace.h> #include "power.h" @@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error, ftrace_save; + int error; if (!suspend_ops) return -ENOSYS; @@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/profile.c b/kernel/profile.c index dc41827fbfe..4cb7d68fed8 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,7 +442,7 @@ void profile_tick(int type) static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); + int len = cpumask_scnprintf(page, count, (cpumask_t *)data); if (count - len < 2) return -EINVAL; len += sprintf(page + len, "\n"); @@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file, unsigned long full_count = count, err; cpumask_t new_value; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; @@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = { }; #ifdef CONFIG_SMP -static inline void profile_nop(void *unused) +static void profile_nop(void *unused) { } diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 37f72e55154..c03ca3e6191 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) /* OK, time to rat on our buddy... */ - printk(KERN_ERR "RCU detected CPU stalls:"); + printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for_each_possible_cpu(cpu) { if (cpu_isset(cpu, rcp->cpumask)) printk(" %d", cpu); @@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp) { unsigned long flags; - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", smp_processor_id(), jiffies, jiffies - rcp->gp_start); dump_stack(); @@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. */ smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask); rcp->signaled = 0; } diff --git a/kernel/sched.c b/kernel/sched.c index e4bb1dd7b30..b309027bf9e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -118,6 +118,12 @@ */ #define RUNTIME_INF ((u64)~0ULL) +DEFINE_TRACE(sched_wait_task); +DEFINE_TRACE(sched_wakeup); +DEFINE_TRACE(sched_wakeup_new); +DEFINE_TRACE(sched_switch); +DEFINE_TRACE(sched_migrate_task); + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -261,6 +267,10 @@ struct task_group { struct cgroup_subsys_state css; #endif +#ifdef CONFIG_USER_SCHED + uid_t uid; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -286,6 +296,12 @@ struct task_group { #ifdef CONFIG_USER_SCHED +/* Helper function to pass uid information to create_sched_user() */ +void set_tg_uid(struct user_struct *user) +{ + user->tg->uid = user->uid; +} + /* * Root task group. * Every UID task group (including init_task_group aka UID-0) will @@ -481,14 +497,14 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; @@ -703,45 +719,18 @@ static __read_mostly char *sched_feat_names[] = { #undef SCHED_FEAT -static int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_show(struct seq_file *m, void *v) { - filp->private_data = inode->i_private; - return 0; -} - -static ssize_t -sched_feat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf; - int r = 0; - int len = 0; int i; for (i = 0; sched_feat_names[i]; i++) { - len += strlen(sched_feat_names[i]); - len += 4; - } - - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for (i = 0; sched_feat_names[i]; i++) { - if (sysctl_sched_features & (1UL << i)) - r += sprintf(buf + r, "%s ", sched_feat_names[i]); - else - r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); } + seq_puts(m, "\n"); - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; + return 0; } static ssize_t @@ -786,10 +775,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return cnt; } +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .read = sched_feat_read, - .write = sched_feat_write, + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static __init int sched_init_debug(void) @@ -1474,27 +1470,13 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - int boost = 0; unsigned long shares; unsigned long rq_weight; if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[cpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. - */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - if (unlikely(rq_weight > sd_rq_weight)) - rq_weight = sd_rq_weight; + rq_weight = tg->cfs_rq[cpu]->rq_weight; /* * \Sum shares * rq_weight @@ -1502,7 +1484,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, * \Sum rq_weight * */ - shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); if (abs(shares - tg->se[cpu]->load.weight) > @@ -1511,11 +1493,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + tg->cfs_rq[cpu]->shares = shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1529,13 +1507,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long rq_weight = 0; + unsigned long weight, rq_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; + for_each_cpu(i, sched_domain_span(sd)) { + /* + * If there are currently no tasks on the cpu pretend there + * is one of average load so that when a new task gets to + * run here it will not get delayed by group starvation. + */ + weight = tg->cfs_rq[i]->load.weight; + if (!weight) + weight = NICE_0_LOAD; + + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1545,10 +1533,7 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - if (!rq_weight) - rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@ -1612,6 +1597,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + ret = 1; + } else + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + } + return ret; +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2079,15 +2097,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2119,17 +2139,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -2171,7 +2188,6 @@ static int sched_balance_self(int cpu, int flag) update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -2180,14 +2196,13 @@ static int sched_balance_self(int cpu, int flag) continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2196,10 +2211,10 @@ static int sched_balance_self(int cpu, int flag) /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@ -2244,7 +2259,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@ -2292,7 +2307,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@ -2812,40 +2827,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); - ret = 1; - } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); - } - return ret; -} - -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - -/* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then @@ -2858,7 +2839,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; @@ -2924,7 +2905,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@ -3099,7 +3080,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3135,10 +3116,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3147,13 +3129,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@ -3264,8 +3241,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3280,8 +3257,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3420,16 +3397,16 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@ -3459,7 +3436,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@ -3467,7 +3444,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3527,8 +3504,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@ -3545,7 +3522,8 @@ redo: /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@ -3620,7 +3598,7 @@ out: */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3629,7 +3607,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3673,8 +3651,8 @@ redo: double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } @@ -3707,9 +3685,12 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = -1; + int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3720,7 +3701,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3735,6 +3716,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* @@ -3772,7 +3754,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@ -3791,10 +3773,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@ -3822,7 +3803,7 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@ -3836,7 +3817,7 @@ int select_nohz_load_balancer(int stop_tick) } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -3849,10 +3830,10 @@ int select_nohz_load_balancer(int stop_tick) } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@ -3880,7 +3861,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. */ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3905,7 +3890,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3939,6 +3924,8 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* @@ -3963,12 +3950,13 @@ static void run_rebalance_domains(struct softirq_action *h) */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -4006,7 +3994,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@ -4019,7 +4007,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -4031,7 +4019,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * cpus with ticks stopped, is it time for that to stop? */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@ -4041,7 +4029,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@ -4203,7 +4191,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); - account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else @@ -4339,7 +4326,7 @@ void __kprobes sub_preempt_count(int val) /* * Underflow? */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) return; /* * Is the spinlock portion underflowing? @@ -5400,10 +5387,9 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@ -5425,6 +5411,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) @@ -5434,37 +5428,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: put_task_struct(p); put_online_cpus(); return retval; } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } @@ -5477,17 +5475,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } -long sched_getaffinity(pid_t pid, cpumask_t *mask) +long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@ -5504,7 +5505,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@ -5523,19 +5524,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** @@ -5877,7 +5883,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@ -5896,6 +5902,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; + ftrace_graph_init_task(idle); } /* @@ -5903,9 +5910,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * indicates which cpus entered this state. This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@ -5960,7 +5967,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@ -5968,13 +5975,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@ -5982,15 +5989,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -6032,7 +6039,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@ -6126,54 +6133,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) /* * Figure out where task on dead CPU should go, use force if necessary. - * NOTE: interrupts should be disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - cpumask_t mask; - struct rq *rq; int dest_cpu; + /* FIXME: Use cpumask_of_node here. */ + cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); + const struct cpumask *nodemask = &_nodemask; + +again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - do { - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. - * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* @@ -6185,7 +6184,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); unsigned long flags; local_irq_save(flags); @@ -6475,7 +6474,7 @@ static void set_rq_online(struct rq *rq) if (!rq->online) { const struct sched_class *class; - cpu_set(rq->cpu, rq->rd->online); + cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; for_each_class(class) { @@ -6495,7 +6494,7 @@ static void set_rq_offline(struct rq *rq) class->rq_offline(rq); } - cpu_clear(rq->cpu, rq->rd->online); + cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; } } @@ -6536,7 +6535,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } @@ -6550,7 +6549,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; @@ -6600,7 +6599,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); @@ -6638,36 +6637,14 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG -static inline const char *sd_level_to_string(enum sched_domain_level lvl) -{ - switch (lvl) { - case SD_LV_NONE: - return "NONE"; - case SD_LV_SIBLING: - return "SIBLING"; - case SD_LV_MC: - return "MC"; - case SD_LV_CPU: - return "CPU"; - case SD_LV_NODE: - return "NODE"; - case SD_LV_ALLNODES: - return "ALLNODES"; - case SD_LV_MAX: - return "MAX"; - - } - return "MAX"; -} - static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) + struct cpumask *groupmask) { struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); - cpus_clear(*groupmask); + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6679,14 +6656,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", - str, sd_level_to_string(sd->level)); + printk(KERN_CONT "span %s level %s\n", str, sd->name); - if (!cpu_isset(cpu, sd->span)) { + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpu_isset(cpu, group->cpumask)) { + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -6706,31 +6682,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpus_weight(group->cpumask)) { + if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } - if (cpus_intersects(*groupmask, group->cpumask)) { + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(*groupmask, *groupmask, group->cpumask); + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, *groupmask)) + if (!cpumask_equal(sched_domain_span(sd), groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6738,7 +6715,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_t *groupmask; + cpumask_var_t groupmask; int level = 0; if (!sd) { @@ -6748,8 +6725,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); return; } @@ -6762,7 +6738,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!sd) break; } - kfree(groupmask); + free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6770,7 +6746,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) static int sd_degenerate(struct sched_domain *sd) { - if (cpus_weight(sd->span) == 1) + if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ @@ -6801,7 +6777,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (sd_degenerate(parent)) return 1; - if (!cpus_equal(sd->span, parent->span)) + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Does parent contain flags not in child? */ @@ -6816,6 +6792,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; } if (~cflags & pflags) return 0; @@ -6823,6 +6801,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +static void free_rootdomain(struct root_domain *rd) +{ + cpupri_cleanup(&rd->cpupri); + + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; @@ -6832,38 +6820,63 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->rd) { struct root_domain *old_rd = rq->rd; - if (cpu_isset(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); - cpu_clear(rq->cpu, old_rd->span); + cpumask_clear_cpu(rq->cpu, old_rd->span); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + free_rootdomain(old_rd); } atomic_inc(&rd->refcount); rq->rd = rd; - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } -static void init_rootdomain(struct root_domain *rd) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); - cpus_clear(rd->span); - cpus_clear(rd->online); + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri, true); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto free_rd; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; + return 0; - cpupri_init(&rd->cpupri); +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +free_rd: + kfree(rd); + return -ENOMEM; } static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain); + init_rootdomain(&def_root_domain, true); + atomic_set(&def_root_domain.refcount, 1); } @@ -6875,7 +6888,10 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - init_rootdomain(rd); + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } return rd; } @@ -6917,19 +6933,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } /* cpus with isolated domains */ -static cpumask_t cpu_isolated_map = CPU_MASK_NONE; +static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); + cpulist_parse(str, cpu_isolated_map); return 1; } @@ -6938,42 +6947,43 @@ __setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, +init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) { struct sched_group *first = NULL, *last = NULL; int i; - cpus_clear(*covered); + cpumask_clear(covered); - for_each_cpu_mask_nr(i, *span) { + for_each_cpu(i, span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, *covered)) + if (cpumask_test_cpu(i, covered)) continue; - cpus_clear(sg->cpumask); + cpumask_clear(sched_group_cpus(sg)); sg->__cpu_power = 0; - for_each_cpu_mask_nr(j, *span) { + for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, *covered); - cpu_set(j, sg->cpumask); + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; @@ -7037,9 +7047,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static void sched_domain_node_span(int node, cpumask_t *span) +static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; + /* FIXME: use cpumask_of_node() */ node_to_cpumask_ptr(nodemask, node); int i; @@ -7061,18 +7072,33 @@ static void sched_domain_node_span(int node, cpumask_t *span) int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. + */ +struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); +}; + +struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); +}; + +/* * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); + *sg = &per_cpu(sched_group_cpus, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@ -7081,56 +7107,55 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, * multi-core sched-domains: */ #ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct sched_domain, core_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_core); +static DEFINE_PER_CPU(struct static_sched_domain, core_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); if (sg) - *sg = &per_cpu(sched_group_core, group); + *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_core, cpu); + *sg = &per_cpu(sched_group_core, cpu).sg; return cpu; } #endif -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_phys); +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; #ifdef CONFIG_SCHED_MC + /* FIXME: Use cpu_coregroup_mask. */ *mask = cpu_coregroup_map(cpu); cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); #else group = cpu; #endif if (sg) - *sg = &per_cpu(sched_group_phys, group); + *sg = &per_cpu(sched_group_phys, group).sg; return group; } @@ -7144,19 +7169,21 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { int group; + /* FIXME: use cpumask_of_node */ + node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - *nodemask = node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); - group = first_cpu(*nodemask); + cpumask_and(nodemask, pnodemask, cpu_map); + group = cpumask_first(nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group); + *sg = &per_cpu(sched_group_allnodes, group).sg; return group; } @@ -7168,11 +7195,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask_nr(j, sg->cpumask) { + for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { + sd = &per_cpu(phys_domains, j).sd; + if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each * physical package. @@ -7189,11 +7216,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { int cpu, i; - for_each_cpu_mask_nr(cpu, *cpu_map) { + for_each_cpu(cpu, cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@ -7202,10 +7230,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + /* FIXME: Use cpumask_of_node */ + node_to_cpumask_ptr(pnodemask, i); - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + cpus_and(*nodemask, *pnodemask, *cpu_map); + if (cpumask_empty(nodemask)) continue; if (sg == NULL) @@ -7223,7 +7252,8 @@ next_sg: } } #else /* !CONFIG_NUMA */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { } #endif /* CONFIG_NUMA */ @@ -7249,7 +7279,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != first_cpu(sd->groups->cpumask)) + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) return; child = sd->child; @@ -7314,40 +7344,6 @@ SD_INIT_FUNC(CPU) SD_INIT_FUNC(MC) #endif -/* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. - */ -struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - -#ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; -#endif -}; - -#if NR_CPUS > 128 -#define SCHED_CPUMASK_ALLOC 1 -#define SCHED_CPUMASK_FREE(v) kfree(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v -#else -#define SCHED_CPUMASK_ALLOC 0 -#define SCHED_CPUMASK_FREE(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v -#endif - -#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) @@ -7387,17 +7383,38 @@ static void set_domain_attribute(struct sched_domain *sd, * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const cpumask_t *cpu_map, +static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - int i; + int i, err = -ENOMEM; struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; #ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; +#endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + +#ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ @@ -7405,55 +7422,37 @@ static int __build_sched_domains(const cpumask_t *cpu_map, GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; + goto free_tmpmask; } #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; + goto free_sched_groups; } -#if SCHED_CPUMASK_ALLOC - /* get space for all scratch cpumask variables */ - allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); #ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; - } -#endif - tmpmask = (cpumask_t *)allmasks; - - -#ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; #endif /* * Set up domains for cpus specified by the cpu_map. */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); + /* FIXME: use cpumask_of_node */ *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sd->span = *cpu_map; + cpumask_copy(sched_domain_span(sd), cpu_map); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7463,18 +7462,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = p; if (p) p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); #endif p = sd; - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - sd->span = *nodemask; + cpumask_copy(sched_domain_span(sd), nodemask); sd->parent = p; if (p) p->child = sd; @@ -7482,11 +7482,12 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC p = sd; - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); - cpus_and(sd->span, sd->span, *cpu_map); + *sched_domain_span(sd) = cpu_coregroup_map(i); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@ -7494,11 +7495,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT p = sd; - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); - sd->span = per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -7507,13 +7508,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) + for_each_cpu(i, cpu_map) { + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, @@ -7524,13 +7522,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - + for_each_cpu(i, cpu_map) { + /* FIXME: Use cpu_coregroup_mask */ *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) + if (i != cpumask_first(this_core_map)) continue; init_sched_build_groups(this_core_map, cpu_map, @@ -7541,12 +7537,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; init_sched_build_groups(nodemask, cpu_map, @@ -7557,8 +7551,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, send_covered, tmpmask); @@ -7567,58 +7559,58 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); int j; + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); - cpus_clear(*covered); + cpumask_clear(covered); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { + if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); + cpumask_and(domainspan, domainspan, cpu_map); - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { + for_each_cpu(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = *nodemask; + cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; - cpus_or(*covered, *covered, *nodemask); + cpumask_or(covered, covered, nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % nr_node_ids; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, n); - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) break; - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) + cpumask_and(tmpmask, tmpmask, pnodemask); + if (cpumask_empty(tmpmask)) continue; - sg = kmalloc_node(sizeof(struct sched_group), + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING @@ -7626,9 +7618,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sg->__cpu_power = 0; - sg->cpumask = *tmpmask; + cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); + cpumask_or(covered, covered, tmpmask); prev->next = sg; prev = sg; } @@ -7637,22 +7629,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@ -7664,56 +7656,87 @@ static int __build_sched_domains(const cpumask_t *cpu_map, if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, tmpmask); init_numa_sched_groups_power(sg); } #endif /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; #else - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; #endif cpu_attach_domain(sd, rd, i); } - SCHED_CPUMASK_FREE((void *)allmasks); - return 0; + err = 0; + +free_tmpmask: + free_cpumask_var(tmpmask); +free_send_covered: + free_cpumask_var(send_covered); +free_this_core_map: + free_cpumask_var(this_core_map); +free_this_sibling_map: + free_cpumask_var(this_sibling_map); +free_nodemask: + free_cpumask_var(nodemask); +free_notcovered: +#ifdef CONFIG_NUMA + free_cpumask_var(notcovered); +free_covered: + free_cpumask_var(covered); +free_domainspan: + free_cpumask_var(domainspan); +out: +#endif + return err; + +free_sched_groups: +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + goto free_tmpmask; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - SCHED_CPUMASK_FREE((void *)allmasks); - kfree(rd); - return -ENOMEM; + free_rootdomain(rd); + goto free_tmpmask; #endif } -static int build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const struct cpumask *cpu_map) { return __build_sched_domains(cpu_map, NULL); } -static cpumask_t *doms_cur; /* current sched domains */ +static struct cpumask *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. */ -static cpumask_t fallback_doms; +static cpumask_var_t fallback_doms; -void __attribute__((weak)) arch_update_cpu_topology(void) +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) { + return 0; } /* @@ -7721,16 +7744,16 @@ void __attribute__((weak)) arch_update_cpu_topology(void) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const struct cpumask *cpu_map) { int err; arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms_cur) - doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + doms_cur = fallback_doms; + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@ -7738,8 +7761,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) +static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); } @@ -7748,17 +7771,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map, * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ -static void detach_destroy_domains(const cpumask_t *cpu_map) +static void detach_destroy_domains(const struct cpumask *cpu_map) { - cpumask_t tmpmask; + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; - unregister_sched_domain_sysctl(); - - for_each_cpu_mask_nr(i, *cpu_map) + for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@ -7783,7 +7805,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the @@ -7797,28 +7819,33 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * the single partition 'fallback_doms', it also forces the domains * to be rebuilt. * - * If doms_new == NULL it will be replaced with cpu_online_map. + * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. * * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +/* FIXME: Change to struct cpumask *doms_new[] */ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; + int new_topology; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(&doms_cur[i], &doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } @@ -7830,15 +7857,15 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; + doms_new = fallback_doms; + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) + for (j = 0; j < ndoms_cur && !new_topology; j++) { + if (cpumask_equal(&doms_new[i], &doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } @@ -7850,7 +7877,7 @@ match2: } /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) + if (doms_cur != fallback_doms) kfree(doms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; @@ -7990,7 +8017,9 @@ static int update_runtime(struct notifier_block *nfb, void __init sched_init_smp(void) { - cpumask_t non_isolated_cpus; + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -7999,10 +8028,10 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -8017,9 +8046,13 @@ void __init sched_init_smp(void) init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); } #else void __init sched_init_smp(void) @@ -8334,6 +8367,15 @@ void __init sched_init(void) */ current->sched_class = &fair_sched_class; + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_SMP +#ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); +#endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); +#endif /* SMP */ + scheduler_running = 1; } @@ -8492,7 +8534,7 @@ static int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se, *parent_se; + struct sched_entity *se; struct rq *rq; int i; @@ -8508,18 +8550,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); if (!se) goto err; - parent_se = parent ? parent->se[i] : NULL; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; @@ -8580,7 +8621,7 @@ static int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se, *parent_se; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -8597,18 +8638,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - rt_rq = kmalloc_node(sizeof(struct rt_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) goto err; - rt_se = kmalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); if (!rt_se) goto err; - parent_se = parent ? parent->rt_se[i] : NULL; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; @@ -9251,11 +9291,12 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ -/* track cpu usage of a group of tasks */ +/* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct cpuacct *parent; }; struct cgroup_subsys cpuacct_subsys; @@ -9289,6 +9330,9 @@ static struct cgroup_subsys_state *cpuacct_create( return ERR_PTR(-ENOMEM); } + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + return &ca->css; } @@ -9368,14 +9412,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) static void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int cpu; if (!cpuacct_subsys.active) return; + cpu = task_cpu(tsk); ca = task_ca(tsk); - if (ca) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); + for (; ca; ca = ca->parent) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 52154fefab7..018b7be1db2 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -67,24 +67,21 @@ static int convert_prio(int prio) * Returns: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - cpumask_t *lowest_mask) + struct cpumask *lowest_mask) { int idx = 0; int task_pri = convert_prio(p->prio); for_each_cpupri_active(cp->pri_active, idx) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - cpumask_t mask; if (idx >= task_pri) break; - cpus_and(mask, p->cpus_allowed, vec->mask); - - if (cpus_empty(mask)) + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - *lowest_mask = mask; + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); return 1; } @@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); - cpu_clear(cpu, vec->mask); + cpumask_clear_cpu(cpu, vec->mask); spin_unlock_irqrestore(&vec->lock, flags); } @@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_lock_irqsave(&vec->lock, flags); - cpu_set(cpu, vec->mask); + cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); @@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context + * @bootmem: true if allocations need to use bootmem * - * Returns: (void) + * Returns: -ENOMEM if memory fails. */ -void cpupri_init(struct cpupri *cp) +int cpupri_init(struct cpupri *cp, bool bootmem) { int i; @@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp) spin_lock_init(&vec->lock); vec->count = 0; - cpus_clear(vec->mask); + if (bootmem) + alloc_bootmem_cpumask_var(&vec->mask); + else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + goto cleanup; } for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; + +cleanup: + for (i--; i >= 0; i--) + free_cpumask_var(cp->pri_to_cpu[i].mask); + return -ENOMEM; } +/** + * cpupri_cleanup - clean up the cpupri structure + * @cp: The cpupri context + */ +void cpupri_cleanup(struct cpupri *cp) +{ + int i; + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) + free_cpumask_var(cp->pri_to_cpu[i].mask); +} diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index f25811b0f93..642a94ef8a0 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -14,7 +14,7 @@ struct cpupri_vec { spinlock_t lock; int count; - cpumask_t mask; + cpumask_var_t mask; }; struct cpupri { @@ -27,7 +27,8 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, cpumask_t *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -void cpupri_init(struct cpupri *cp); +int cpupri_init(struct cpupri *cp, bool bootmem); +void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) #define cpupri_init() do { } while (0) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 26ed8e3d1c1..4293cfa9681 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void print_cfs_group_stats(struct seq_file *m, int cpu, + struct task_group *tg) +{ + struct sched_entity *se = tg->se[cpu]; + if (!se) + return; + +#define P(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define PN(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + + PN(se->exec_start); + PN(se->vruntime); + PN(se->sum_exec_runtime); +#ifdef CONFIG_SCHEDSTATS + PN(se->wait_start); + PN(se->sleep_start); + PN(se->block_start); + PN(se->sleep_max); + PN(se->block_max); + PN(se->exec_max); + PN(se->slice_max); + PN(se->wait_max); + PN(se->wait_sum); + P(se->wait_count); +#endif + P(se->load.weight); +#undef PN +#undef P +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = cfs_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) + { + uid_t uid = cfs_rq->tg->uid; + SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid); + } #else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -168,6 +201,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -175,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = rt_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); #else @@ -272,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 98345e45b05..08ffffd4a41 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1017,14 +1017,13 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) + * hence we need to mask them out (cpu_active_mask) * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, struct task_struct *p) { - cpumask_t tmp; struct sched_domain *sd; int i; @@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p) if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { + for_each_cpu_and(i, sched_domain_span(sd), + &p->cpus_allowed) { + if (cpu_active(i) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@ -1240,13 +1238,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync) * this_cpu and prev_cpu are present in: */ for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { + if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { this_sd = sd; break; } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) goto out; /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5f99d..1bbd9901401 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq) if (!rq->online) return; - cpu_set(rq->cpu, rq->rd->rto_mask); + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq) /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) @@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) } #ifdef CONFIG_SMP -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_rq(smp_processor_id())->rd->span; } #else -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } #endif @@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) return rt_rq->rt_throttled; } -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } static inline @@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq) int i, weight, more = 0; u64 rt_period; - weight = cpus_weight(rd->span); + weight = cpumask_weight(rd->span); spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq) /* * Greedy reclaim, take back as much as we can. */ - for_each_cpu_mask(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; - cpumask_t span; + const struct cpumask *span; if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); - spin_lock(&rt_rq->rt_runtime_lock); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync) static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_t mask; + cpumask_var_t mask; if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, mask)) + goto free; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) + goto free; /* * There appears to be other cpus that can accept @@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); +free: + free_cpumask_var(mask); } #endif /* CONFIG_SMP */ @@ -909,15 +914,12 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest); -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); - static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@ -956,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) return next; } -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { @@ -976,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -991,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task) * I guess we might want to change cpupri_find() to ignore those * in the first place. */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * At this point we have built a mask of cpus representing the @@ -1001,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task) * We prioritize the last cpu that the task executed on since * it is most likely cache-hot in that location. */ - if (cpu_isset(cpu, *lowest_mask)) + if (cpumask_test_cpu(cpu, lowest_mask)) return cpu; /* @@ -1016,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t domain_mask; int best_cpu; - cpus_and(domain_mask, sd->span, *lowest_mask); + cpumask_and(&domain_mask, sched_domain_span(sd), + lowest_mask); best_cpu = pick_optimal_cpu(this_cpu, &domain_mask); @@ -1057,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { @@ -1179,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq) next = pick_next_task_rt(this_rq); - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1308,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, } static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - int weight = cpus_weight(*new_mask); + int weight = cpumask_weight(new_mask); BUG_ON(!rt_task(p)); @@ -1331,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, update_rt_migration(rq); } - p->cpus_allowed = *new_mask; + cpumask_copy(&p->cpus_allowed, new_mask); p->rt.nr_cpus_allowed = weight; } @@ -1374,6 +1377,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, if (!rq->rt.rt_nr_running) pull_rt_task(rq); } + +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); +} #endif /* CONFIG_SMP */ /* @@ -1544,3 +1555,4 @@ static void print_rt_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ + diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a2b02..5fcf0e18458 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { diff --git a/kernel/signal.c b/kernel/signal.c index 4530fc65445..e9afe63da24 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -41,6 +41,8 @@ static struct kmem_cache *sigqueue_cachep; +DEFINE_TRACE(sched_signal_send); + static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index dc0b3be6b7d..1ab790c67b1 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024; /* * Zero means infinite timeout - no checking done: */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; unsigned long __read_mostly sysctl_hung_task_warnings = 10; diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d1..5fc3a0cfb99 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -858,8 +858,8 @@ void do_sys_times(struct tms *tms) struct task_cputime cputime; cputime_t cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime(current, &cputime); + spin_lock_irq(¤t->sighand->siglock); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3d56fe7570d..c83f566e940 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif +#ifdef CONFIG_TRACING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_MODULES { .ctl_name = KERN_MODPROBE, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bd6be76303c..6d7dc4ec4aa 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -352,7 +352,7 @@ static int parse(struct nlattr *na, cpumask_t *mask) if (!data) return -ENOMEM; nla_strlcpy(data, na, len); - ret = cpulist_parse(data, *mask); + ret = cpulist_parse(data, mask); kfree(data); return ret; } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f8d968063ce..ea2f48af83c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -166,6 +166,8 @@ static void clockevents_notify_released(void) void clockevents_register_device(struct clock_event_device *dev) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(!dev->cpumask); + /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f98a1b7b16e..9590af2327b 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -150,7 +150,7 @@ static void tick_do_broadcast(cpumask_t mask) */ cpu = first_cpu(mask); td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->broadcast(mask); + td->evtdev->broadcast(&mask); } } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df12434b43c..f8372be7412 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, - const cpumask_t *cpumask) + const struct cpumask *cpumask) { ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; @@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpus_equal(newdev->cpumask, *cpumask)) - irq_set_affinity(newdev->irq, *cpumask); + if (!cpumask_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); /* * When global broadcasting is active, check if the current @@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - if (!cpu_isset(cpu, newdev->cpumask)) + if (!cpumask_test_cpu(cpu, newdev->cpumask)) goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { + if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) goto out_bc; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 342fc9ccab4..70f872c71f4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void) if (!ts->tick_stopped) return; - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle) if ((long)delta_jiffies >= 1) { if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle) /* * sched tick not stopped! */ - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); goto out; } @@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); /* * We stopped the tick in idle. Update process times would miss the diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 33dbefd471e..bde6f03512d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,18 +3,34 @@ # select HAVE_FUNCTION_TRACER: # +config USER_STACKTRACE_SUPPORT + bool + config NOP_TRACER bool config HAVE_FUNCTION_TRACER bool +config HAVE_FUNCTION_GRAPH_TRACER + bool + +config HAVE_FUNCTION_TRACE_MCOUNT_TEST + bool + help + This gets selected when the arch tests the function_trace_stop + variable at the mcount call site. Otherwise, this variable + is tested by the called function. + config HAVE_DYNAMIC_FTRACE bool config HAVE_FTRACE_MCOUNT_RECORD bool +config HAVE_HW_BRANCH_TRACER + bool + config TRACER_MAX_TRACE bool @@ -47,6 +63,20 @@ config FUNCTION_TRACER (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER + depends on FUNCTION_TRACER + default y + help + Enable the kernel to trace a function at both its return + and its entry. + It's first purpose is to trace the duration of functions and + draw a call graph for each thread with some informations like + the return value. + This is done by setting the current return address on the current + task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -138,6 +168,70 @@ config BOOT_TRACER selected, because the self-tests are an initcall as well and that would invalidate the boot trace. ) +config TRACE_BRANCH_PROFILING + bool "Trace likely/unlikely profiler" + depends on DEBUG_KERNEL + select TRACING + help + This tracer profiles all the the likely and unlikely macros + in the kernel. It will display the results in: + + /debugfs/tracing/profile_annotated_branch + + Note: this will add a significant overhead, only turn this + on if you need to profile the system's use of these macros. + + Say N if unsure. + +config PROFILE_ALL_BRANCHES + bool "Profile all if conditionals" + depends on TRACE_BRANCH_PROFILING + help + This tracer profiles all branch conditions. Every if () + taken in the kernel is recorded whether it hit or miss. + The results will be displayed in: + + /debugfs/tracing/profile_branch + + This configuration, when enabled, will impose a great overhead + on the system. This should only be enabled when the system + is to be analyzed + + Say N if unsure. + +config TRACING_BRANCHES + bool + help + Selected by tracers that will trace the likely and unlikely + conditions. This prevents the tracers themselves from being + profiled. Profiling the tracing infrastructure can only happen + when the likelys and unlikelys are not being traced. + +config BRANCH_TRACER + bool "Trace likely/unlikely instances" + depends on TRACE_BRANCH_PROFILING + select TRACING_BRANCHES + help + This traces the events of likely and unlikely condition + calls in the kernel. The difference between this and the + "Trace likely/unlikely profiler" is that this is not a + histogram of the callers, but actually places the calling + events into a running trace buffer to see when and where the + events happened, as well as their results. + + Say N if unsure. + +config POWER_TRACER + bool "Trace power consumption behavior" + depends on DEBUG_KERNEL + depends on X86 + select TRACING + help + This tracer helps developers to analyze and optimize the kernels + power management decisions, specifically the C-state and P-state + behavior. + + config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER @@ -157,6 +251,14 @@ config STACK_TRACER Say N if unsure. +config BTS_TRACER + depends on HAVE_HW_BRANCH_TRACER + bool "Trace branches" + select TRACING + help + This tracer records all branches on the system in a circular + buffer giving access to the last N branches for each cpu. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c8228b1a49e..62dc561b667 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +# If unlikely tracing is enabled, do not trace these files +ifdef CONFIG_TRACING_BRANCHES +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +endif + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o @@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o +obj-$(CONFIG_BTS_TRACER) += trace_bts.o +obj-$(CONFIG_POWER_TRACER) += trace_power.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 78db083390f..a12f80efcea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,6 +47,13 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + +/* Quick disabling of function tracer. */ +int function_trace_stop; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -55,6 +62,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); +static DEFINE_MUTEX(ftrace_start_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -63,6 +71,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { @@ -79,6 +89,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) }; } +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +{ + if (!test_tsk_trace_trace(current)) + return; + + ftrace_pid_function(ip, parent_ip); +} + +static void set_ftrace_pid_function(ftrace_func_t func) +{ + /* do not set ftrace_pid_function to itself! */ + if (func != ftrace_pid_func) + ftrace_pid_function = func; +} + /** * clear_ftrace_function - reset the ftrace function * @@ -88,7 +113,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; + __ftrace_trace_function = ftrace_stub; + ftrace_pid_function = ftrace_stub; +} + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST +/* + * For those archs that do not test ftrace_trace_stop in their + * mcount call site, we need to do it from C. + */ +static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +{ + if (function_trace_stop) + return; + + __ftrace_trace_function(ip, parent_ip); } +#endif static int __register_ftrace_function(struct ftrace_ops *ops) { @@ -106,14 +147,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops) ftrace_list = ops; if (ftrace_enabled) { + ftrace_func_t func; + + if (ops->next == &ftrace_list_end) + func = ops->func; + else + func = ftrace_list_func; + + if (ftrace_pid_trace) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + /* * For one func, simply call it directly. * For more than one func, call the chain. */ - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; + ftrace_trace_function = ftrace_test_stop_func; +#endif } spin_unlock(&ftrace_lock); @@ -152,9 +207,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) { /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_list->next == &ftrace_list_end) { + ftrace_func_t func = ftrace_list->func; + + if (ftrace_pid_trace) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; +#endif + } } out: @@ -163,6 +228,36 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return ret; } +static void ftrace_update_pid_func(void) +{ + ftrace_func_t func; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + if (ftrace_trace_function == ftrace_stub) + goto out; + + func = ftrace_trace_function; + + if (ftrace_pid_trace) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } else { + if (func == ftrace_pid_func) + func = ftrace_pid_function; + } + +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; +#endif + + out: + spin_unlock(&ftrace_lock); +} + #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD @@ -182,6 +277,8 @@ enum { FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_ENABLE_MCOUNT = (1 << 3), FTRACE_DISABLE_MCOUNT = (1 << 4), + FTRACE_START_FUNC_RET = (1 << 5), + FTRACE_STOP_FUNC_RET = (1 << 6), }; static int ftrace_filtered; @@ -308,7 +405,7 @@ ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *rec; - if (!ftrace_enabled || ftrace_disabled) + if (ftrace_disabled) return NULL; rec = ftrace_alloc_dyn_node(ip); @@ -322,14 +419,51 @@ ftrace_record_ip(unsigned long ip) return rec; } -#define FTRACE_ADDR ((long)(ftrace_caller)) +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + +static void ftrace_bug(int failed, unsigned long ip) +{ + switch (failed) { + case -EFAULT: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case -EINVAL: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" actual: ", (unsigned char *)ip); + printk(KERN_CONT "\n"); + break; + case -EPERM: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on writing "); + print_ip_sym(ip); + break; + default: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on unknown error "); + print_ip_sym(ip); + } +} + static int -__ftrace_replace_code(struct dyn_ftrace *rec, - unsigned char *nop, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ip, fl; - unsigned char *call, *old, *new; + unsigned long ftrace_addr; + + ftrace_addr = (unsigned long)ftrace_caller; ip = rec->ip; @@ -388,34 +522,28 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } - call = ftrace_call_replace(ip, FTRACE_ADDR); - - if (rec->flags & FTRACE_FL_ENABLED) { - old = nop; - new = call; - } else { - old = call; - new = nop; - } - - return ftrace_modify_code(ip, old, new); + if (rec->flags & FTRACE_FL_ENABLED) + return ftrace_make_call(rec, ftrace_addr); + else + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) { int i, failed; - unsigned char *nop = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - nop = ftrace_nop_replace(); - for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; - /* don't modify code that has already faulted */ - if (rec->flags & FTRACE_FL_FAILED) + /* + * Skip over free records and records that have + * failed. + */ + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED) continue; /* ignore updates to this record's mcount site */ @@ -426,68 +554,30 @@ static void ftrace_replace_code(int enable) unfreeze_record(rec); } - failed = __ftrace_replace_code(rec, nop, enable); + failed = __ftrace_replace_code(rec, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); - } + } else + ftrace_bug(failed, rec->ip); } } } } -static void print_ip_ins(const char *fmt, unsigned char *p) -{ - int i; - - printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); -} - static int -ftrace_code_disable(struct dyn_ftrace *rec) +ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { unsigned long ip; - unsigned char *nop, *call; int ret; ip = rec->ip; - nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, mcount_addr); - - ret = ftrace_modify_code(ip, call, nop); + ret = ftrace_make_nop(mod, rec, mcount_addr); if (ret) { - switch (ret) { - case -EFAULT: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); - break; - case -EINVAL: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace failed to modify "); - print_ip_sym(ip); - print_ip_ins(" expected: ", call); - print_ip_ins(" actual: ", (unsigned char *)ip); - print_ip_ins(" replace: ", nop); - printk(KERN_CONT "\n"); - break; - case -EPERM: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on writing "); - print_ip_sym(ip); - break; - default: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); - } - + ftrace_bug(ret, ip); rec->flags |= FTRACE_FL_FAILED; return 0; } @@ -506,6 +596,11 @@ static int __ftrace_modify_code(void *data) if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); + if (*command & FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (*command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); + return 0; } @@ -515,43 +610,43 @@ static void ftrace_run_update_code(int command) } static ftrace_func_t saved_ftrace_func; -static int ftrace_start; -static DEFINE_MUTEX(ftrace_start_lock); +static int ftrace_start_up; -static void ftrace_startup(void) +static void ftrace_startup_enable(int command) { - int command = 0; - - if (unlikely(ftrace_disabled)) - return; - - mutex_lock(&ftrace_start_lock); - ftrace_start++; - command |= FTRACE_ENABLE_CALLS; - if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; command |= FTRACE_UPDATE_TRACE_FUNC; } if (!command || !ftrace_enabled) - goto out; + return; ftrace_run_update_code(command); - out: - mutex_unlock(&ftrace_start_lock); } -static void ftrace_shutdown(void) +static void ftrace_startup(int command) { - int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftrace_start_lock); + ftrace_start_up++; + command |= FTRACE_ENABLE_CALLS; + + ftrace_startup_enable(command); + + mutex_unlock(&ftrace_start_lock); +} + +static void ftrace_shutdown(int command) +{ if (unlikely(ftrace_disabled)) return; mutex_lock(&ftrace_start_lock); - ftrace_start--; - if (!ftrace_start) + ftrace_start_up--; + if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -577,8 +672,8 @@ static void ftrace_startup_sysctl(void) mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; - /* ftrace_start is true if we want ftrace running */ - if (ftrace_start) + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); @@ -593,8 +688,8 @@ static void ftrace_shutdown_sysctl(void) return; mutex_lock(&ftrace_start_lock); - /* ftrace_start is true if ftrace is running */ - if (ftrace_start) + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); @@ -605,7 +700,7 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ftrace_update_code(void) +static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p, *t; cycle_t start, stop; @@ -622,7 +717,7 @@ static int ftrace_update_code(void) list_del_init(&p->list); /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(p)) { + if (ftrace_code_disable(mod, p)) { p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; } else @@ -690,7 +785,6 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - loff_t pos; struct ftrace_page *pg; unsigned idx; unsigned flags; @@ -715,6 +809,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; + } else { + iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -737,8 +833,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } spin_unlock(&ftrace_lock); - iter->pos = *pos; - return rec; } @@ -746,13 +840,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; - loff_t l = -1; - if (*pos > iter->pos) - *pos = iter->pos; + if (*pos > 0) { + if (iter->idx < 0) + return p; + (*pos)--; + iter->idx--; + } - l = *pos; - p = t_next(m, p, &l); + p = t_next(m, p, pos); return p; } @@ -763,21 +859,15 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; if (!rec) return 0; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - ret = seq_printf(m, "%s\n", str); - if (ret < 0) { - iter->pos--; - iter->idx--; - } + seq_printf(m, "%s\n", str); return 0; } @@ -803,7 +893,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; - iter->pos = 0; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -890,7 +979,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->pos = 0; iter->flags = enable ? FTRACE_ITER_FILTER : FTRACE_ITER_NOTRACE; @@ -1181,7 +1269,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftrace_start_lock); - if (ftrace_start && ftrace_enabled) + if (ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); @@ -1233,12 +1321,233 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; -static __init int ftrace_init_debugfs(void) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static DEFINE_MUTEX(graph_lock); + +int ftrace_graph_count; +unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) { - struct dentry *d_tracer; - struct dentry *entry; + unsigned long *array = m->private; + int index = *pos; - d_tracer = tracing_init_dentry(); + (*pos)++; + + if (index >= ftrace_graph_count) + return NULL; + + return &array[index]; +} + +static void *g_start(struct seq_file *m, loff_t *pos) +{ + void *p = NULL; + + mutex_lock(&graph_lock); + + p = g_next(m, p, pos); + + return p; +} + +static void g_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&graph_lock); +} + +static int g_show(struct seq_file *m, void *v) +{ + unsigned long *ptr = v; + char str[KSYM_SYMBOL_LEN]; + + if (!ptr) + return 0; + + kallsyms_lookup(*ptr, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations ftrace_graph_seq_ops = { + .start = g_start, + .next = g_next, + .stop = g_stop, + .show = g_show, +}; + +static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&graph_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) { + ftrace_graph_count = 0; + memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &ftrace_graph_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ftrace_graph_funcs; + } + } else + file->private_data = ftrace_graph_funcs; + mutex_unlock(&graph_lock); + + return ret; +} + +static ssize_t +ftrace_graph_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static int +ftrace_set_func(unsigned long *array, int idx, char *buffer) +{ + char str[KSYM_SYMBOL_LEN]; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int found = 0; + int i, j; + + if (ftrace_disabled) + return -ENODEV; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + continue; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if (strcmp(str, buffer) == 0) { + found = 1; + for (j = 0; j < idx; j++) + if (array[j] == rec->ip) { + found = 0; + break; + } + if (found) + array[idx] = rec->ip; + break; + } + } + } + spin_unlock(&ftrace_lock); + + return found ? 0 : -EINVAL; +} + +static ssize_t +ftrace_graph_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned long *array; + size_t read = 0; + ssize_t ret; + int index = 0; + char ch; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&graph_lock); + + if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { + ret = -EBUSY; + goto out; + } + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + array = m->private; + } else + array = file->private_data; + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + while (cnt && !isspace(ch)) { + if (index < FTRACE_BUFF_MAX) + buffer[index++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + buffer[index] = 0; + + /* we allow only one at a time */ + ret = ftrace_set_func(array, ftrace_graph_count, buffer); + if (ret) + goto out; + + ftrace_graph_count++; + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&graph_lock); + + return ret; +} + +static const struct file_operations ftrace_graph_fops = { + .open = ftrace_graph_open, + .read = ftrace_graph_read, + .write = ftrace_graph_write, +}; +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; entry = debugfs_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); @@ -1263,12 +1572,20 @@ static __init int ftrace_init_debugfs(void) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + NULL, + &ftrace_graph_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_graph_function' entry\n"); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + return 0; } -fs_initcall(ftrace_init_debugfs); - -static int ftrace_convert_nops(unsigned long *start, +static int ftrace_convert_nops(struct module *mod, + unsigned long *start, unsigned long *end) { unsigned long *p; @@ -1279,23 +1596,32 @@ static int ftrace_convert_nops(unsigned long *start, p = start; while (p < end) { addr = ftrace_call_adjust(*p++); + /* + * Some architecture linkers will pad between + * the different mcount_loc sections of different + * object files to satisfy alignments. + * Skip any NULL pointers. + */ + if (!addr) + continue; ftrace_record_ip(addr); } /* disable interrupts to prevent kstop machine */ local_irq_save(flags); - ftrace_update_code(); + ftrace_update_code(mod); local_irq_restore(flags); mutex_unlock(&ftrace_start_lock); return 0; } -void ftrace_init_module(unsigned long *start, unsigned long *end) +void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(start, end); + ftrace_convert_nops(mod, start, end); } extern unsigned long __start_mcount_loc[]; @@ -1325,7 +1651,8 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(__start_mcount_loc, + ret = ftrace_convert_nops(NULL, + __start_mcount_loc, __stop_mcount_loc); return; @@ -1342,12 +1669,186 @@ static int __init ftrace_nodyn_init(void) } device_initcall(ftrace_nodyn_init); -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) +static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } +static inline void ftrace_startup_enable(int command) { } +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(command) do { } while (0) +# define ftrace_shutdown(command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +static ssize_t +ftrace_pid_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + if (ftrace_pid_trace == ftrace_swapper_pid) + r = sprintf(buf, "swapper tasks\n"); + else if (ftrace_pid_trace) + r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); + else + r = sprintf(buf, "no pid\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void clear_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + clear_tsk_trace_trace(p); + } + put_online_cpus(); +} + +static void set_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + set_tsk_trace_trace(p); + } + put_online_cpus(); +} + +static void clear_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + do_each_pid_task(pid, PIDTYPE_PID, p) { + clear_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); + put_pid(pid); +} + +static void set_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + do_each_pid_task(pid, PIDTYPE_PID, p) { + set_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); +} + +static void clear_ftrace_pid_task(struct pid **pid) +{ + if (*pid == ftrace_swapper_pid) + clear_ftrace_swapper(); + else + clear_ftrace_pid(*pid); + + *pid = NULL; +} + +static void set_ftrace_pid_task(struct pid *pid) +{ + if (pid == ftrace_swapper_pid) + set_ftrace_swapper(); + else + set_ftrace_pid(pid); +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct pid *pid; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtol(buf, 10, &val); + if (ret < 0) + return ret; + + mutex_lock(&ftrace_start_lock); + if (val < 0) { + /* disable pid tracing */ + if (!ftrace_pid_trace) + goto out; + + clear_ftrace_pid_task(&ftrace_pid_trace); + + } else { + /* swapper task is special */ + if (!val) { + pid = ftrace_swapper_pid; + if (pid == ftrace_pid_trace) + goto out; + } else { + pid = find_get_pid(val); + + if (pid == ftrace_pid_trace) { + put_pid(pid); + goto out; + } + } + + if (ftrace_pid_trace) + clear_ftrace_pid_task(&ftrace_pid_trace); + + if (!pid) + goto out; + + ftrace_pid_trace = pid; + + set_ftrace_pid_task(ftrace_pid_trace); + } + + /* update the function call */ + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + out: + mutex_unlock(&ftrace_start_lock); + + return cnt; +} + +static struct file_operations ftrace_pid_fops = { + .read = ftrace_pid_read, + .write = ftrace_pid_write, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + ftrace_init_dyn_debugfs(d_tracer); + + entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_pid' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + /** * ftrace_kill - kill ftrace * @@ -1381,10 +1882,11 @@ int register_ftrace_function(struct ftrace_ops *ops) return -1; mutex_lock(&ftrace_sysctl_lock); + ret = __register_ftrace_function(ops); - ftrace_startup(); - mutex_unlock(&ftrace_sysctl_lock); + ftrace_startup(0); + mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1400,7 +1902,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - ftrace_shutdown(); + ftrace_shutdown(0); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -1449,3 +1951,153 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, return ret; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static atomic_t ftrace_graph_active; + +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + +/* The callbacks that hook a function */ +trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + unsigned long flags; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + t->curr_ret_stack = -1; + /* Make sure IRQs see the -1 first: */ + barrier(); + t->ret_stack = ret_stack_list[start++]; + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + } + } while_each_thread(g, t); + +unlock: + read_unlock_irqrestore(&tasklist_lock, flags); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +/* Allocate a return stack for each task */ +static int start_graph_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret; + + ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + kfree(ret_stack_list); + return ret; +} + +int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) +{ + int ret = 0; + + mutex_lock(&ftrace_sysctl_lock); + + atomic_inc(&ftrace_graph_active); + ret = start_graph_tracing(); + if (ret) { + atomic_dec(&ftrace_graph_active); + goto out; + } + + ftrace_graph_return = retfunc; + ftrace_graph_entry = entryfunc; + + ftrace_startup(FTRACE_START_FUNC_RET); + +out: + mutex_unlock(&ftrace_sysctl_lock); + return ret; +} + +void unregister_ftrace_graph(void) +{ + mutex_lock(&ftrace_sysctl_lock); + + atomic_dec(&ftrace_graph_active); + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(FTRACE_STOP_FUNC_RET); + + mutex_unlock(&ftrace_sysctl_lock); +} + +/* Allocate a return stack for newly created task */ +void ftrace_graph_init_task(struct task_struct *t) +{ + if (atomic_read(&ftrace_graph_active)) { + t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!t->ret_stack) + return; + t->curr_ret_stack = -1; + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + } else + t->ret_stack = NULL; +} + +void ftrace_graph_exit_task(struct task_struct *t) +{ + struct ftrace_ret_stack *ret_stack = t->ret_stack; + + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); +} + +void ftrace_graph_stop(void) +{ + ftrace_stop(); +} +#endif + diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 668bbb5ef2b..7f69cfeaadf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -18,8 +18,46 @@ #include "trace.h" -/* Global flag to disable all recording to ring buffers */ -static int ring_buffers_off __read_mostly; +/* + * A fast way to enable or disable all ring buffers is to + * call tracing_on or tracing_off. Turning off the ring buffers + * prevents all ring buffers from being recorded to. + * Turning this switch on, makes it OK to write to the + * ring buffer, if the ring buffer is enabled itself. + * + * There's three layers that must be on in order to write + * to the ring buffer. + * + * 1) This global flag must be set. + * 2) The ring buffer must be enabled for recording. + * 3) The per cpu buffer must be enabled for recording. + * + * In case of an anomaly, this global flag has a bit set that + * will permantly disable all ring buffers. + */ + +/* + * Global flag to disable all recording to ring buffers + * This has two bits: ON, DISABLED + * + * ON DISABLED + * ---- ---------- + * 0 0 : ring buffers are off + * 1 0 : ring buffers are on + * X 1 : ring buffers are permanently disabled + */ + +enum { + RB_BUFFERS_ON_BIT = 0, + RB_BUFFERS_DISABLED_BIT = 1, +}; + +enum { + RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, + RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, +}; + +static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; /** * tracing_on - enable all tracing buffers @@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly; */ void tracing_on(void) { - ring_buffers_off = 0; + set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); } /** @@ -42,9 +80,22 @@ void tracing_on(void) */ void tracing_off(void) { - ring_buffers_off = 1; + clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); } +/** + * tracing_off_permanent - permanently disable ring buffers + * + * This function, once called, will disable all ring buffers + * permanenty. + */ +void tracing_off_permanent(void) +{ + set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); +} + +#include "trace.h" + /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -144,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) -/* - * This hack stolen from mm/slob.c. - * We can store per page timing information in the page frame of the page. - * Thanks to Peter Zijlstra for suggesting this idea. - */ -struct buffer_page { +struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t write; /* index for next write */ local_t commit; /* write commited index */ + unsigned char data[]; /* data of buffer page */ +}; + +struct buffer_page { + local_t write; /* index for next write */ unsigned read; /* index for next read */ struct list_head list; /* list of free pages */ - void *page; /* Actual data page */ + struct buffer_data_page *page; /* Actual data page */ }; +static void rb_init_page(struct buffer_data_page *bpage) +{ + local_set(&bpage->commit, 0); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -179,7 +234,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE PAGE_SIZE +#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) /* * head_page == tail_page && head == tail then buffer is empty. @@ -187,7 +242,8 @@ static inline int test_time_stamp(u64 delta) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t lock; + spinlock_t reader_lock; /* serialize readers */ + raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head pages; struct buffer_page *head_page; /* read from head */ @@ -221,32 +277,16 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - } \ - } while (0) - -#define RB_WARN_ON_RET(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return -1; \ - } \ - } while (0) - -#define RB_WARN_ON_ONCE(buffer, cond) \ - do { \ - static int once; \ - if (unlikely(cond) && !once) { \ - once++; \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ atomic_inc(&buffer->record_disabled); \ WARN_ON(1); \ } \ - } while (0) + _____ret; \ + }) /** * check_pages - integrity check of buffer pages @@ -258,16 +298,20 @@ struct ring_buffer_iter { static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; - RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); - RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) + return -1; + if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) + return -1; - list_for_each_entry_safe(page, tmp, head, list) { - RB_WARN_ON_RET(cpu_buffer, - page->list.next->prev != &page->list); - RB_WARN_ON_RET(cpu_buffer, - page->list.prev->next != &page->list); + list_for_each_entry_safe(bpage, tmp, head, list) { + if (RB_WARN_ON(cpu_buffer, + bpage->list.next->prev != &bpage->list)) + return -1; + if (RB_WARN_ON(cpu_buffer, + bpage->list.prev->next != &bpage->list)) + return -1; } return 0; @@ -277,22 +321,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; for (i = 0; i < nr_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; + bpage->page = (void *)addr; + rb_init_page(bpage->page); } list_splice(&pages, head); @@ -302,9 +347,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return 0; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } return -ENOMEM; } @@ -313,7 +358,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_page *page; + struct buffer_page *bpage; unsigned long addr; int ret; @@ -324,19 +369,21 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->lock); + spin_lock_init(&cpu_buffer->reader_lock); + cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto fail_free_buffer; - cpu_buffer->reader_page = page; + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) goto fail_free_reader; - page->page = (void *)addr; + bpage->page = (void *)addr; + rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -361,14 +408,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; list_del_init(&cpu_buffer->reader_page->list); free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(page, tmp, head, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } kfree(cpu_buffer); } @@ -465,7 +512,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -473,13 +520,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - BUG_ON(list_empty(&cpu_buffer->pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + return; p = cpu_buffer->pages.next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - free_buffer_page(page); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + free_buffer_page(bpage); } - BUG_ON(list_empty(&cpu_buffer->pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + return; rb_reset_cpu(cpu_buffer); @@ -493,7 +542,7 @@ static void rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *pages, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -501,11 +550,12 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, synchronize_sched(); for (i = 0; i < nr_pages; i++) { - BUG_ON(list_empty(pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(pages))) + return; p = pages->next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - list_add_tail(&page->list, &cpu_buffer->pages); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + list_add_tail(&bpage->list, &cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -532,7 +582,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) { struct ring_buffer_per_cpu *cpu_buffer; unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long buffer_size; unsigned long addr; LIST_HEAD(pages); @@ -562,7 +612,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size < buffer_size) { /* easy case, just free pages */ - BUG_ON(nr_pages >= buffer->pages); + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { + mutex_unlock(&buffer->mutex); + return -1; + } rm_pages = buffer->pages - nr_pages; @@ -581,21 +634,26 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. Otherwise we just free * them all and return -ENOMEM; */ - BUG_ON(nr_pages <= buffer->pages); + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { + mutex_unlock(&buffer->mutex); + return -1; + } + new_pages = nr_pages - buffer->pages; for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; + bpage->page = (void *)addr; + rb_init_page(bpage->page); } } @@ -604,7 +662,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - BUG_ON(!list_empty(&pages)); + if (RB_WARN_ON(buffer, !list_empty(&pages))) { + mutex_unlock(&buffer->mutex); + return -1; + } out: buffer->pages = nr_pages; @@ -613,9 +674,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } mutex_unlock(&buffer->mutex); return -ENOMEM; @@ -626,9 +687,15 @@ static inline int rb_null_event(struct ring_buffer_event *event) return event->type == RINGBUF_TYPE_PADDING; } -static inline void *__rb_page_index(struct buffer_page *page, unsigned index) +static inline void * +__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) +{ + return bpage->data + index; +} + +static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { - return page->page + index; + return bpage->page->data + index; } static inline struct ring_buffer_event * @@ -658,7 +725,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage) static inline unsigned rb_page_commit(struct buffer_page *bpage) { - return local_read(&bpage->commit); + return local_read(&bpage->page->commit); } /* Size is determined by what has been commited */ @@ -693,7 +760,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) head += rb_event_length(event)) { event = __rb_page_index(cpu_buffer->head_page, head); - BUG_ON(rb_null_event(event)); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; /* Only count data entries */ if (event->type != RINGBUF_TYPE_DATA) continue; @@ -703,14 +771,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) } static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **page) + struct buffer_page **bpage) { - struct list_head *p = (*page)->list.next; + struct list_head *p = (*bpage)->list.next; if (p == &cpu_buffer->pages) p = p->next; - *page = list_entry(p, struct buffer_page, list); + *bpage = list_entry(p, struct buffer_page, list); } static inline unsigned @@ -746,16 +814,18 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, addr &= PAGE_MASK; while (cpu_buffer->commit_page->page != (void *)addr) { - RB_WARN_ON(cpu_buffer, - cpu_buffer->commit_page == cpu_buffer->tail_page); - cpu_buffer->commit_page->commit = + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->commit_page == cpu_buffer->tail_page)) + return; + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; } /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->commit, index); + local_set(&cpu_buffer->commit_page->page->commit, index); } static inline void @@ -770,16 +840,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; barrier(); } @@ -787,7 +858,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; cpu_buffer->reader_page->read = 0; } @@ -806,7 +877,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) else rb_inc_page(cpu_buffer, &iter->head_page); - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; } @@ -894,7 +965,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (write > BUF_PAGE_SIZE) { struct buffer_page *next_page = tail_page; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); rb_inc_page(cpu_buffer, &next_page); @@ -902,7 +974,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, reader_page = cpu_buffer->reader_page; /* we grabbed the lock before incrementing */ - RB_WARN_ON(cpu_buffer, next_page == reader_page); + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_unlock; /* * If for some reason, we had an interrupt storm that made @@ -940,12 +1013,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); - local_set(&next_page->commit, 0); + local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; /* reread the time stamp */ *ts = ring_buffer_time_stamp(cpu_buffer->cpu); - cpu_buffer->tail_page->time_stamp = *ts; + cpu_buffer->tail_page->page->time_stamp = *ts; } /* @@ -970,7 +1043,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_set_commit_to_write(cpu_buffer); } - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -978,7 +1052,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - BUG_ON(write > BUF_PAGE_SIZE); + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); @@ -988,12 +1063,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * this page's time stamp. */ if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; return event; out_unlock: - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return NULL; } @@ -1038,7 +1114,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; event->time_delta = 0; event->array[0] = 0; } @@ -1076,10 +1152,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * storm or we have something buggy. * Bail! */ - if (unlikely(++nr_loops > 1000)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - } ts = ring_buffer_time_stamp(cpu_buffer->cpu); @@ -1175,15 +1249,14 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; int cpu, resched; - if (ring_buffers_off) + if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; if (atomic_read(&buffer->record_disabled)) return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1214,10 +1287,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return event; out: - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return NULL; } @@ -1259,12 +1329,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, /* * Only the last preempt count needs to restore preemption. */ - if (preempt_count() == 1) { - if (per_cpu(rb_need_resched, cpu)) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); - } else + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else preempt_enable_no_resched_notrace(); return 0; @@ -1294,14 +1361,13 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu, resched; - if (ring_buffers_off) + if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; if (atomic_read(&buffer->record_disabled)) return -EBUSY; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1327,10 +1393,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return ret; } @@ -1489,14 +1552,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) return overruns; } -/** - * ring_buffer_iter_reset - reset an iterator - * @iter: The iterator to reset - * - * Resets the iterator, so that it will start from the beginning - * again. - */ -void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -1511,7 +1567,24 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. + */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_iter_reset(iter); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -1597,7 +1670,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) unsigned long flags; int nr_loops = 0; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); again: /* @@ -1606,8 +1680,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * a case where we will loop three times. There should be no * reason to loop four times (that I know of). */ - if (unlikely(++nr_loops > 3)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { reader = NULL; goto out; } @@ -1619,8 +1692,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* Never should we have an index greater than the size */ - RB_WARN_ON(cpu_buffer, - cpu_buffer->reader_page->read > rb_page_size(reader)); + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader))) + goto out; /* check if we caught up to the tail */ reader = NULL; @@ -1637,7 +1711,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ reader->list.prev->next = &cpu_buffer->reader_page->list; @@ -1659,7 +1733,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return reader; } @@ -1673,7 +1748,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ - BUG_ON(!reader); + if (RB_WARN_ON(cpu_buffer, !reader)) + return; event = rb_reader_event(cpu_buffer); @@ -1700,7 +1776,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - BUG_ON(iter->head_page == cpu_buffer->commit_page); + if (RB_WARN_ON(buffer, + iter->head_page == cpu_buffer->commit_page)) + return; rb_inc_iter(iter); return; } @@ -1713,8 +1791,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * This should not be called to advance the header if we are * at the tail of the buffer. */ - BUG_ON((iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer))); + if (RB_WARN_ON(cpu_buffer, + (iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer)))) + return; rb_update_iter_read_stamp(iter, event); @@ -1726,17 +1806,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } -/** - * ring_buffer_peek - peek at the next event to be read - * @buffer: The ring buffer to read - * @cpu: The cpu to peak at - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not consume the data. - */ -struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +static struct ring_buffer_event * +rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1757,10 +1828,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * can have. Nesting 10 deep of interrupts is clearly * an anomaly. */ - if (unlikely(++nr_loops > 10)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; - } reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -1798,16 +1867,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) return NULL; } -/** - * ring_buffer_iter_peek - peek at the next event to be read - * @iter: The ring buffer iterator - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not increment the iterator. - */ -struct ring_buffer_event * -ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +static struct ring_buffer_event * +rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; @@ -1829,10 +1890,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) * can have. Nesting 10 deep of interrupts is clearly * an anomaly. */ - if (unlikely(++nr_loops > 10)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; - } if (rb_per_cpu_empty(cpu_buffer)) return NULL; @@ -1869,6 +1928,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } /** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not consume the data. + */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_buffer_peek(buffer, cpu, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. + */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_event *event; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} + +/** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from * @@ -1879,19 +1983,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event * ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; + unsigned long flags; if (!cpu_isset(cpu, buffer->cpumask)) return NULL; - event = ring_buffer_peek(buffer, cpu, ts); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + event = rb_buffer_peek(buffer, cpu, ts); if (!event) - return NULL; + goto out; - cpu_buffer = buffer->buffers[cpu]; rb_advance_reader(cpu_buffer); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + return event; } @@ -1928,9 +2037,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - spin_lock_irqsave(&cpu_buffer->lock, flags); - ring_buffer_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + __raw_spin_lock(&cpu_buffer->lock); + rb_iter_reset(iter); + __raw_spin_unlock(&cpu_buffer->lock); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; } @@ -1962,12 +2073,17 @@ struct ring_buffer_event * ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; - event = ring_buffer_iter_peek(iter, ts); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); if (!event) - return NULL; + goto out; rb_advance_iter(iter); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -1987,7 +2103,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->commit, 0); + local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -1996,7 +2112,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; cpu_buffer->overrun = 0; @@ -2016,11 +2132,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpu_isset(cpu, buffer->cpumask)) return; - spin_lock_irqsave(&cpu_buffer->lock, flags); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + __raw_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -2118,16 +2238,178 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_data_page *bpage) +{ + struct ring_buffer_event *event; + unsigned long head; + + __raw_spin_lock(&cpu_buffer->lock); + for (head = 0; head < local_read(&bpage->commit); + head += rb_event_length(event)) { + + event = __rb_data_page_index(bpage, head); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; + /* Only count data entries */ + if (event->type != RINGBUF_TYPE_DATA) + continue; + cpu_buffer->entries--; + } + __raw_spin_unlock(&cpu_buffer->lock); +} + +/** + * ring_buffer_alloc_read_page - allocate a page to read from buffer + * @buffer: the buffer to allocate for. + * + * This function is used in conjunction with ring_buffer_read_page. + * When reading a full page from the ring buffer, these functions + * can be used to speed up the process. The calling function should + * allocate a few pages first with this function. Then when it + * needs to get pages from the ring buffer, it passes the result + * of this function into ring_buffer_read_page, which will swap + * the page that was allocated, with the read page of the buffer. + * + * Returns: + * The page allocated, or NULL on error. + */ +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +{ + unsigned long addr; + struct buffer_data_page *bpage; + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + return NULL; + + bpage = (void *)addr; + + return bpage; +} + +/** + * ring_buffer_free_read_page - free an allocated read page + * @buffer: the buffer the page was allocate for + * @data: the page to free + * + * Free a page allocated from ring_buffer_alloc_read_page. + */ +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +{ + free_page((unsigned long)data); +} + +/** + * ring_buffer_read_page - extract a page from the ring buffer + * @buffer: buffer to extract from + * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @cpu: the cpu of the buffer to extract + * @full: should the extraction only happen when the page is full. + * + * This function will pull out a page from the ring buffer and consume it. + * @data_page must be the address of the variable that was returned + * from ring_buffer_alloc_read_page. This is because the page might be used + * to swap with a page in the ring buffer. + * + * for example: + * rpage = ring_buffer_alloc_page(buffer); + * if (!rpage) + * return error; + * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * if (ret) + * process_page(rpage); + * + * When @full is set, the function will not return true unless + * the writer is off the reader page. + * + * Note: it is up to the calling functions to handle sleeps and wakeups. + * The ring buffer can be used anywhere in the kernel and can not + * blindly call wake_up. The layer that uses the ring buffer must be + * responsible for that. + * + * Returns: + * 1 if data has been transferred + * 0 if no data has been transferred. + */ +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + unsigned long flags; + int ret = 0; + + if (!data_page) + return 0; + + bpage = *data_page; + if (!bpage) + return 0; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + /* + * rb_buffer_peek will get the next ring buffer if + * the current reader page is empty. + */ + event = rb_buffer_peek(buffer, cpu, NULL); + if (!event) + goto out; + + /* check for data */ + if (!local_read(&cpu_buffer->reader_page->page->commit)) + goto out; + /* + * If the writer is already off of the read page, then simply + * switch the read page with the given page. Otherwise + * we need to copy the data from the reader to the writer. + */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) { + unsigned int read = cpu_buffer->reader_page->read; + + if (full) + goto out; + /* The writer is still on the reader page, we must copy */ + bpage = cpu_buffer->reader_page->page; + memcpy(bpage->data, + cpu_buffer->reader_page->page->data + read, + local_read(&bpage->commit) - read); + + /* consume what was read */ + cpu_buffer->reader_page += read; + + } else { + /* swap the pages */ + rb_init_page(bpage); + bpage = cpu_buffer->reader_page->page; + cpu_buffer->reader_page->page = *data_page; + cpu_buffer->reader_page->read = 0; + *data_page = bpage; + } + ret = 1; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, bpage); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} + static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int *p = filp->private_data; + long *p = filp->private_data; char buf[64]; int r; - /* !ring_buffers_off == tracing_on */ - r = sprintf(buf, "%d\n", !*p); + if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) + r = sprintf(buf, "permanently disabled\n"); + else + r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2136,7 +2418,7 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - int *p = filp->private_data; + long *p = filp->private_data; char buf[64]; long val; int ret; @@ -2153,8 +2435,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - /* !ring_buffers_off == tracing_on */ - *p = !val; + if (val) + set_bit(RB_BUFFERS_ON_BIT, p); + else + clear_bit(RB_BUFFERS_ON_BIT, p); (*ppos)++; @@ -2176,7 +2460,7 @@ static __init int rb_init_debugfs(void) d_tracer = tracing_init_dentry(); entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffers_off, &rb_simple_fops); + &ring_buffer_flags, &rb_simple_fops); if (!entry) pr_warning("Could not create debugfs 'tracing_on' entry\n"); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e3252f30..6adf660fc81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -30,6 +30,7 @@ #include <linux/gfp.h> #include <linux/fs.h> #include <linux/kprobes.h> +#include <linux/seq_file.h> #include <linux/writeback.h> #include <linux/stacktrace.h> @@ -43,6 +44,38 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +/* + * We need to change this state when a selftest is running. + * A selftest will lurk into the ring-buffer to count the + * entries inserted during the selftest although some concurrent + * insertions into the ring-buffer such as ftrace_printk could occurred + * at the same time, giving false positive or negative results. + */ +static bool __read_mostly tracing_selftest_running; + +/* For tracers that don't implement custom flags */ +static struct tracer_opt dummy_tracer_opt[] = { + { } +}; + +static struct tracer_flags dummy_tracer_flags = { + .val = 0, + .opts = dummy_tracer_opt +}; + +static int dummy_set_flag(u32 old_flags, u32 bit, int set) +{ + return 0; +} + +/* + * Kill all tracing for good (never come back). + * It is initialized to 1 but will turn to zero if the initialization + * of the tracer is successful. But that is the only place that sets + * this back to zero. + */ +int tracing_disabled = 1; + static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) @@ -62,7 +95,36 @@ static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -static int tracing_disabled = 1; +/* + * ftrace_dump_on_oops - variable to dump ftrace buffer on oops + * + * If there is an oops (or kernel panic) and the ftrace_dump_on_oops + * is set, then ftrace_dump is called. This will output the contents + * of the ftrace buffers to the console. This is very useful for + * capturing traces that lead to crashes and outputing it to a + * serial console. + * + * It is default off, but you can enable it with either specifying + * "ftrace_dump_on_oops" in the kernel command line, or setting + * /proc/sys/kernel/ftrace_dump_on_oops to true. + */ +int ftrace_dump_on_oops; + +static int tracing_set_tracer(char *buf); + +static int __init set_ftrace(char *str) +{ + tracing_set_tracer(str); + return 1; +} +__setup("ftrace", set_ftrace); + +static int __init set_ftrace_dump_on_oops(char *str) +{ + ftrace_dump_on_oops = 1; + return 1; +} +__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); long ns2usecs(cycle_t nsec) @@ -112,6 +174,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; +/** + * tracing_is_enabled - return tracer_enabled status + * + * This function is used by other tracers to know the status + * of the tracer_enabled flag. Tracers may use this function + * to know if it should enable their features when starting + * up. See irqsoff tracer for an example (start_irqsoff_tracer). + */ +int tracing_is_enabled(void) +{ + return tracer_enabled; +} + /* function tracing enabled */ int ftrace_function_enabled; @@ -153,8 +228,9 @@ static DEFINE_MUTEX(trace_types_lock); /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); -/* trace_flags holds iter_ctrl options */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +/* trace_flags holds trace_options default values */ +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | + TRACE_ITER_ANNOTATE; /** * trace_wake_up - wake up tasks waiting for trace input @@ -193,13 +269,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } -/* - * TRACE_ITER_SYM_MASK masks the options in trace_flags that - * control the output of kernel symbols. - */ -#define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) - /* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { "print-parent", @@ -213,6 +282,11 @@ static const char *trace_options[] = { "stacktrace", "sched-tree", "ftrace_printk", + "ftrace_preempt", + "branch", + "annotate", + "userstacktrace", + "sym-userobj", NULL }; @@ -359,6 +433,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } +static int +trace_seq_path(struct trace_seq *s, struct path *path) +{ + unsigned char *p; + + if (s->len >= (PAGE_SIZE - 1)) + return 0; + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + return 0; +} + static void trace_seq_reset(struct trace_seq *s) { @@ -470,7 +566,17 @@ int register_tracer(struct tracer *type) return -1; } + /* + * When this gets called we hold the BKL which means that + * preemption is disabled. Various trace selftests however + * need to disable and enable preemption for successful tests. + * So we drop the BKL here and grab it after the tests again. + */ + unlock_kernel(); mutex_lock(&trace_types_lock); + + tracing_selftest_running = true; + for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -481,12 +587,20 @@ int register_tracer(struct tracer *type) } } + if (!type->set_flag) + type->set_flag = &dummy_set_flag; + if (!type->flags) + type->flags = &dummy_tracer_flags; + else + if (!type->flags->opts) + type->flags->opts = dummy_tracer_opt; + #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; - int saved_ctrl = tr->ctrl; int i; + /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current @@ -494,25 +608,23 @@ int register_tracer(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - for_each_tracing_cpu(i) { + for_each_tracing_cpu(i) tracing_reset(tr, i); - } + current_trace = type; - tr->ctrl = 0; /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ current_trace = saved_tracer; - tr->ctrl = saved_ctrl; if (ret) { printk(KERN_CONT "FAILED!\n"); goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_tracing_cpu(i) { + for_each_tracing_cpu(i) tracing_reset(tr, i); - } + printk(KERN_CONT "PASSED\n"); } #endif @@ -524,7 +636,9 @@ int register_tracer(struct tracer *type) max_tracer_type_len = len; out: + tracing_selftest_running = false; mutex_unlock(&trace_types_lock); + lock_kernel(); return ret; } @@ -581,6 +695,91 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } +static int trace_stop_count; +static DEFINE_SPINLOCK(tracing_start_lock); + +/** + * ftrace_off_permanent - disable all ftrace code permanently + * + * This should only be called when a serious anomally has + * been detected. This will turn off the function tracing, + * ring buffers, and other tracing utilites. It takes no + * locks and can be called from any context. + */ +void ftrace_off_permanent(void) +{ + tracing_disabled = 1; + ftrace_stop(); + tracing_off_permanent(); +} + +/** + * tracing_start - quick start of the tracer + * + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. + */ +void tracing_start(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + spin_lock_irqsave(&tracing_start_lock, flags); + if (--trace_stop_count) + goto out; + + if (trace_stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + trace_stop_count = 0; + goto out; + } + + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + ftrace_start(); + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. + */ +void tracing_stop(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + ftrace_stop(); + spin_lock_irqsave(&tracing_start_lock, flags); + if (trace_stop_count++) + goto out; + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + void trace_stop_cmdline_recording(void); static void trace_save_cmdline(struct task_struct *tsk) @@ -618,7 +817,7 @@ static void trace_save_cmdline(struct task_struct *tsk) spin_unlock(&trace_cmdline_lock); } -static char *trace_find_cmdline(int pid) +char *trace_find_cmdline(int pid) { char *cmdline = "<...>"; unsigned map; @@ -655,6 +854,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -691,6 +891,56 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void __trace_graph_entry(struct trace_array *tr, + struct trace_array_cpu *data, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ring_buffer_event *event; + struct ftrace_graph_ent_entry *entry; + unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_GRAPH_ENT; + entry->graph_ent = *trace; + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); +} + +static void __trace_graph_return(struct trace_array *tr, + struct trace_array_cpu *data, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *entry; + unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_GRAPH_RET; + entry->ret = *trace; + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); +} +#endif + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -742,6 +992,46 @@ void __trace_stack(struct trace_array *tr, ftrace_trace_stack(tr, data, flags, skip, preempt_count()); } +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, int pc) +{ +#ifdef CONFIG_STACKTRACE + struct ring_buffer_event *event; + struct userstack_entry *entry; + struct stack_trace trace; + unsigned long irq_flags; + + if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_USER_STACK; + + memset(&entry->caller, 0, sizeof(entry->caller)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = 0; + trace.entries = entry->caller; + + save_stack_trace_user(&trace); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +#endif +} + +void __trace_userstack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags) +{ + ftrace_trace_userstack(tr, data, flags, preempt_count()); +} + static void ftrace_trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3, @@ -765,6 +1055,7 @@ ftrace_trace_special(void *__tr, void *__data, entry->arg3 = arg3; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, irq_flags, 4, pc); + ftrace_trace_userstack(tr, data, irq_flags, pc); trace_wake_up(); } @@ -803,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, flags, 5, pc); + ftrace_trace_userstack(tr, data, flags, pc); } void @@ -832,6 +1124,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, flags, 6, pc); + ftrace_trace_userstack(tr, data, flags, pc); trace_wake_up(); } @@ -841,26 +1134,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; + unsigned long flags; int cpu; int pc; - if (tracing_disabled || !tr->ctrl) + if (tracing_disabled) return; pc = preempt_count(); - preempt_disable_notrace(); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (likely(!atomic_read(&data->disabled))) + if (likely(atomic_inc_return(&data->disabled) == 1)) ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); - preempt_enable_notrace(); + atomic_dec(&data->disabled); + local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_TRACER static void -function_trace_call(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -873,8 +1168,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -884,11 +1178,96 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); +} + +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (!ftrace_trace_task(current)) + return 0; + + if (!ftrace_graph_addr(trace->func)) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_entry(tr, data, trace, flags, pc); + } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return 1; +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, data, trace, flags, pc); + } + if (!trace->depth) + clear_tsk_trace_graph(current); + atomic_dec(&data->disabled); + local_irq_restore(flags); } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ static struct ftrace_ops trace_ops __read_mostly = { @@ -898,9 +1277,14 @@ static struct ftrace_ops trace_ops __read_mostly = void tracing_start_function_trace(void) { ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + register_ftrace_function(&trace_ops); - if (tracer_enabled) - ftrace_function_enabled = 1; + ftrace_function_enabled = 1; } void tracing_stop_function_trace(void) @@ -912,6 +1296,7 @@ void tracing_stop_function_trace(void) enum trace_file_type { TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, }; static void trace_iterator_increment(struct trace_iterator *iter, int cpu) @@ -1047,10 +1432,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) atomic_inc(&trace_record_cmdline_disabled); - /* let the tracer grab locks here if needed */ - if (current_trace->start) - current_trace->start(iter); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -1077,14 +1458,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { - struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); - - /* let the tracer release locks here if needed */ - if (current_trace && current_trace == iter->trace && iter->trace->stop) - iter->trace->stop(iter); - mutex_unlock(&trace_types_lock); } @@ -1143,7 +1517,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -static int +int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { int ret; @@ -1164,6 +1538,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (mm) { + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); + } + up_read(&mm->mmap_sem); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +static int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + struct mm_struct *mm = NULL; + int ret = 1; + unsigned int i; + + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->ent.tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || !ret) + break; + if (i && ret) + ret = trace_seq_puts(s, " <- "); + if (!ip) { + if (ret) + ret = trace_seq_puts(s, "??"); + continue; + } + if (!ret) + break; + if (ret) + ret = seq_print_user_ip(s, mm, ip, sym_flags); + } + + if (mm) + mmput(mm); + return ret; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1338,6 +1784,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) trace_seq_putc(s, '\n'); } +static void test_cpu_buff_start(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + + if (!(trace_flags & TRACE_ITER_ANNOTATE)) + return; + + if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) + return; + + if (cpu_isset(iter->cpu, iter->started)) + return; + + cpu_set(iter->cpu, iter->started); + trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); +} + static enum print_line_t print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { @@ -1357,6 +1820,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; + test_cpu_buff_start(iter); + next_entry = find_next_entry(iter, NULL, &next_ts); if (!next_entry) next_ts = iter->ts; @@ -1448,6 +1913,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_seq_print_cont(s, iter); break; } + case TRACE_BRANCH: { + struct trace_branch *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? " ok " : " MISS ", + field->func, + field->file, + field->line); + break; + } + case TRACE_USER_STACK: { + struct userstack_entry *field; + + trace_assign_type(field, entry); + + seq_print_userip_objs(field, s, sym_flags); + trace_seq_putc(s, '\n'); + break; + } default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1472,6 +1958,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; + test_cpu_buff_start(iter); + comm = trace_find_cmdline(iter->ent->pid); t = ns2usecs(iter->ts); @@ -1581,6 +2069,37 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_seq_print_cont(s, iter); break; } + case TRACE_GRAPH_RET: { + return print_graph_function(iter); + } + case TRACE_GRAPH_ENT: { + return print_graph_function(iter); + } + case TRACE_BRANCH: { + struct trace_branch *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? " ok " : " MISS ", + field->func, + field->file, + field->line); + break; + } + case TRACE_USER_STACK: { + struct userstack_entry *field; + + trace_assign_type(field, entry); + + ret = seq_print_userip_objs(field, s, sym_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_putc(s, '\n'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + break; + } } return TRACE_TYPE_HANDLED; } @@ -1640,6 +2159,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -1728,6 +2248,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -1782,6 +2303,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -1847,7 +2369,9 @@ static int s_show(struct seq_file *m, void *v) seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); } - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + if (iter->trace && iter->trace->print_header) + iter->trace->print_header(m); + else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return 0; @@ -1899,6 +2423,15 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + /* Notify the tracer early; before we stop tracing. */ + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->tr->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = @@ -1917,13 +2450,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) m->private = iter; /* stop the trace while dumping */ - if (iter->tr->ctrl) { - tracer_enabled = 0; - ftrace_function_enabled = 0; - } - - if (iter->trace && iter->trace->open) - iter->trace->open(iter); + tracing_stop(); mutex_unlock(&trace_types_lock); @@ -1966,14 +2493,7 @@ int tracing_release(struct inode *inode, struct file *file) iter->trace->close(iter); /* reenable tracing if it was previously enabled */ - if (iter->tr->ctrl) { - tracer_enabled = 1; - /* - * It is safe to enable function tracing even if it - * isn't used - */ - ftrace_function_enabled = 1; - } + tracing_start(); mutex_unlock(&trace_types_lock); seq_release(inode, file); @@ -2126,7 +2646,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -2147,11 +2667,11 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, int err, cpu; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); if (err) goto err_unlock; - raw_local_irq_disable(); + local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* @@ -2168,7 +2688,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, } } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_enable(); + local_irq_enable(); tracing_cpumask = tracing_cpumask_new; @@ -2189,13 +2709,16 @@ static struct file_operations tracing_cpumask_fops = { }; static ssize_t -tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, +tracing_trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + int i; char *buf; int r = 0; int len = 0; - int i; + u32 tracer_flags = current_trace->flags->val; + struct tracer_opt *trace_opts = current_trace->flags->opts; + /* calulate max size */ for (i = 0; trace_options[i]; i++) { @@ -2203,6 +2726,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and space */ } + /* + * Increase the size with names of options specific + * of the current tracer. + */ + for (i = 0; trace_opts[i].name; i++) { + len += strlen(trace_opts[i].name); + len += 3; /* "no" and space */ + } + /* +2 for \n and \0 */ buf = kmalloc(len + 2, GFP_KERNEL); if (!buf) @@ -2215,6 +2747,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, r += sprintf(buf + r, "no%s ", trace_options[i]); } + for (i = 0; trace_opts[i].name; i++) { + if (tracer_flags & trace_opts[i].bit) + r += sprintf(buf + r, "%s ", + trace_opts[i].name); + else + r += sprintf(buf + r, "no%s ", + trace_opts[i].name); + } + r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); @@ -2225,13 +2766,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, return r; } +/* Try to assign a tracer specific option */ +static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +{ + struct tracer_flags *trace_flags = trace->flags; + struct tracer_opt *opts = NULL; + int ret = 0, i = 0; + int len; + + for (i = 0; trace_flags->opts[i].name; i++) { + opts = &trace_flags->opts[i]; + len = strlen(opts->name); + + if (strncmp(cmp, opts->name, len) == 0) { + ret = trace->set_flag(trace_flags->val, + opts->bit, !neg); + break; + } + } + /* Not found */ + if (!trace_flags->opts[i].name) + return -EINVAL; + + /* Refused to handle */ + if (ret) + return ret; + + if (neg) + trace_flags->val &= ~opts->bit; + else + trace_flags->val |= opts->bit; + + return 0; +} + static ssize_t -tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, +tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; char *cmp = buf; int neg = 0; + int ret; int i; if (cnt >= sizeof(buf)) @@ -2258,11 +2834,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, break; } } - /* - * If no option could be set, return an error: - */ - if (!trace_options[i]) - return -EINVAL; + + /* If no option could be set, test the specific tracer options */ + if (!trace_options[i]) { + ret = set_tracer_option(current_trace, cmp, neg); + if (ret) + return ret; + } filp->f_pos += cnt; @@ -2271,8 +2849,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, static struct file_operations tracing_iter_fops = { .open = tracing_open_generic, - .read = tracing_iter_ctrl_read, - .write = tracing_iter_ctrl_write, + .read = tracing_trace_options_read, + .write = tracing_trace_options_write, }; static const char readme_msg[] = @@ -2286,9 +2864,9 @@ static const char readme_msg[] = "# echo sched_switch > /debug/tracing/current_tracer\n" "# cat /debug/tracing/current_tracer\n" "sched_switch\n" - "# cat /debug/tracing/iter_ctrl\n" + "# cat /debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" "echo 0 > /debug/tracing/tracing_enabled\n" @@ -2311,11 +2889,10 @@ static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = filp->private_data; char buf[64]; int r; - r = sprintf(buf, "%ld\n", tr->ctrl); + r = sprintf(buf, "%u\n", tracer_enabled); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2343,16 +2920,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, val = !!val; mutex_lock(&trace_types_lock); - if (tr->ctrl ^ val) { - if (val) + if (tracer_enabled ^ val) { + if (val) { tracer_enabled = 1; - else + if (current_trace->start) + current_trace->start(tr); + tracing_start(); + } else { tracer_enabled = 0; - - tr->ctrl = val; - - if (current_trace && current_trace->ctrl_update) - current_trace->ctrl_update(tr); + tracing_stop(); + if (current_trace->stop) + current_trace->stop(tr); + } } mutex_unlock(&trace_types_lock); @@ -2378,29 +2957,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_set_trace_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_set_tracer(char *buf) { struct trace_array *tr = &global_trace; struct tracer *t; - char buf[max_tracer_type_len+1]; - int i; - size_t ret; - - ret = cnt; - - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + int ret = 0; mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { @@ -2414,18 +2975,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (t == current_trace) goto out; + trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); current_trace = t; - if (t->init) - t->init(tr); + if (t->init) { + ret = t->init(tr); + if (ret) + goto out; + } + trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); - if (ret > 0) - filp->f_pos += ret; + return ret; +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+1]; + int i; + size_t ret; + int err; + + ret = cnt; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + err = tracing_set_tracer(buf); + if (err) + return err; + + filp->f_pos += ret; return ret; } @@ -2492,6 +3087,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return -ENOMEM; mutex_lock(&trace_types_lock); + + /* trace pipe does not show start of buffer */ + cpus_setall(iter->started); + iter->tr = &global_trace; iter->trace = current_trace; filp->private_data = iter; @@ -2667,7 +3266,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, char buf[64]; int r; - r = sprintf(buf, "%lu\n", tr->entries); + r = sprintf(buf, "%lu\n", tr->entries >> 10); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2678,7 +3277,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, unsigned long val; char buf[64]; int ret, cpu; - struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2698,12 +3296,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); - if (tr->ctrl) { - cnt = -EBUSY; - pr_info("ftrace: please disable tracing" - " before modifying buffer size\n"); - goto out; - } + tracing_stop(); /* disable all cpu buffers */ for_each_tracing_cpu(cpu) { @@ -2713,6 +3306,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, atomic_inc(&max_tr.data[cpu]->disabled); } + /* value is in KB */ + val <<= 10; + if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); if (ret < 0) { @@ -2751,6 +3347,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, atomic_dec(&max_tr.data[cpu]->disabled); } + tracing_start(); max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); @@ -2762,7 +3359,7 @@ static int mark_printk(const char *fmt, ...) int ret; va_list args; va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); + ret = trace_vprintk(0, -1, fmt, args); va_end(args); return ret; } @@ -2773,9 +3370,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, { char *buf; char *end; - struct trace_array *tr = &global_trace; - if (!tr->ctrl || tracing_disabled) + if (tracing_disabled) return -EINVAL; if (cnt > TRACE_BUF_SIZE) @@ -2841,22 +3437,38 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE +int __weak ftrace_arch_read_dyn_info(char *buf, int size) +{ + return 0; +} + static ssize_t -tracing_read_long(struct file *filp, char __user *ubuf, +tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + static char ftrace_dyn_info_buffer[1024]; + static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; - char buf[64]; + char *buf = ftrace_dyn_info_buffer; + int size = ARRAY_SIZE(ftrace_dyn_info_buffer); int r; - r = sprintf(buf, "%ld\n", *p); + mutex_lock(&dyn_info_mutex); + r = sprintf(buf, "%ld ", *p); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); + buf[r++] = '\n'; + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + mutex_unlock(&dyn_info_mutex); + + return r; } -static struct file_operations tracing_read_long_fops = { +static struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, - .read = tracing_read_long, + .read = tracing_read_dyn_info, }; #endif @@ -2897,10 +3509,10 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); - entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, + entry = debugfs_create_file("trace_options", 0644, d_tracer, NULL, &tracing_iter_fops); if (!entry) - pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + pr_warning("Could not create debugfs 'trace_options' entry\n"); entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, NULL, &tracing_cpumask_fops); @@ -2950,11 +3562,11 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_pipe' entry\n"); - entry = debugfs_create_file("trace_entries", 0644, d_tracer, + entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); if (!entry) pr_warning("Could not create debugfs " - "'trace_entries' entry\n"); + "'buffer_size_kb' entry\n"); entry = debugfs_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); @@ -2965,7 +3577,7 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, - &tracing_read_long_fops); + &tracing_dyn_info_fops); if (!entry) pr_warning("Could not create debugfs " "'dyn_ftrace_total_info' entry\n"); @@ -2976,7 +3588,7 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { static DEFINE_SPINLOCK(trace_buf_lock); static char trace_buf[TRACE_BUF_SIZE]; @@ -2984,11 +3596,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; - unsigned long flags, irq_flags; int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; - if (!tr->ctrl || tracing_disabled) + if (tracing_disabled || tracing_selftest_running) return 0; pc = preempt_count(); @@ -2999,7 +3611,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) if (unlikely(atomic_read(&data->disabled))) goto out; - spin_lock_irqsave(&trace_buf_lock, flags); + pause_graph_tracing(); + spin_lock_irqsave(&trace_buf_lock, irq_flags); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -3010,17 +3623,18 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); + tracing_generic_entry_update(&entry->ent, irq_flags, pc); entry->ent.type = TRACE_PRINT; entry->ip = ip; + entry->depth = depth; memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); - + spin_unlock_irqrestore(&trace_buf_lock, irq_flags); + unpause_graph_tracing(); out: preempt_enable_notrace(); @@ -3037,7 +3651,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, fmt, ap); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); va_end(ap); return ret; } @@ -3046,7 +3660,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk); static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); return NOTIFY_OK; } @@ -3062,7 +3677,8 @@ static int trace_die_handler(struct notifier_block *self, { switch (val) { case DIE_OOPS: - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); break; default: break; @@ -3103,7 +3719,6 @@ trace_printk_seq(struct trace_seq *s) trace_seq_reset(s); } - void ftrace_dump(void) { static DEFINE_SPINLOCK(ftrace_dump_lock); @@ -3128,6 +3743,9 @@ void ftrace_dump(void) atomic_inc(&global_trace.data[cpu]->disabled); } + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + printk(KERN_TRACE "Dumping ftrace buffer:\n"); iter.tr = &global_trace; @@ -3221,7 +3839,6 @@ __init static int tracer_alloc_buffers(void) #endif /* All seems OK, enable tracing */ - global_trace.ctrl = tracer_enabled; tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8465ad05270..5ac697065a4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -8,6 +8,7 @@ #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> #include <linux/ftrace.h> +#include <trace/boot.h> enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -21,7 +22,14 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, - TRACE_BOOT, + TRACE_BRANCH, + TRACE_BOOT_CALL, + TRACE_BOOT_RET, + TRACE_GRAPH_RET, + TRACE_GRAPH_ENT, + TRACE_USER_STACK, + TRACE_BTS, + TRACE_POWER, __TRACE_LAST_TYPE }; @@ -38,6 +46,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + int tgid; }; /* @@ -48,6 +57,18 @@ struct ftrace_entry { unsigned long ip; unsigned long parent_ip; }; + +/* Function call entry */ +struct ftrace_graph_ent_entry { + struct trace_entry ent; + struct ftrace_graph_ent graph_ent; +}; + +/* Function return entry */ +struct ftrace_graph_ret_entry { + struct trace_entry ent; + struct ftrace_graph_ret ret; +}; extern struct tracer boot_tracer; /* @@ -85,12 +106,18 @@ struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; }; +struct userstack_entry { + struct trace_entry ent; + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + /* * ftrace_printk entry: */ struct print_entry { struct trace_entry ent; unsigned long ip; + int depth; char buf[]; }; @@ -112,9 +139,35 @@ struct trace_mmiotrace_map { struct mmiotrace_map map; }; -struct trace_boot { +struct trace_boot_call { struct trace_entry ent; - struct boot_trace initcall; + struct boot_trace_call boot_call; +}; + +struct trace_boot_ret { + struct trace_entry ent; + struct boot_trace_ret boot_ret; +}; + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 +struct trace_branch { + struct trace_entry ent; + unsigned line; + char func[TRACE_FUNC_SIZE+1]; + char file[TRACE_FILE_SIZE+1]; + char correct; +}; + +struct bts_entry { + struct trace_entry ent; + unsigned long from; + unsigned long to; +}; + +struct trace_power { + struct trace_entry ent; + struct power_trace state_data; }; /* @@ -172,7 +225,6 @@ struct trace_iterator; struct trace_array { struct ring_buffer *buffer; unsigned long entries; - long ctrl; int cpu; cycle_t time_start; struct task_struct *waiter; @@ -212,13 +264,22 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ + IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ + IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ + IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ + TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ + TRACE_GRAPH_RET); \ + IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ + IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ __ftrace_bad_type(); \ } while (0) @@ -229,29 +290,56 @@ enum print_line_t { TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ }; + +/* + * An option specific to a tracer. This is a boolean value. + * The bit is the bit index that sets its value on the + * flags value in struct tracer_flags. + */ +struct tracer_opt { + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ +}; + +/* + * The set of specific options for a tracer. Your tracer + * have to set the initial value of the flags val. + */ +struct tracer_flags { + u32 val; + struct tracer_opt *opts; +}; + +/* Makes more easy to define a tracer opt */ +#define TRACER_OPT(s, b) .name = #s, .bit = b + /* * A specific tracer, represented by methods that operate on a trace array: */ struct tracer { const char *name; - void (*init)(struct trace_array *tr); + /* Your tracer should raise a warning if init fails */ + int (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); + void (*start)(struct trace_array *tr); + void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); - void (*start)(struct trace_iterator *iter); - void (*stop)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); - void (*ctrl_update)(struct trace_array *tr); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif + void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); + /* If you handled the flag setting, return 0 */ + int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; int print_max; + struct tracer_flags *flags; }; struct trace_seq { @@ -279,8 +367,11 @@ struct trace_iterator { unsigned long iter_flags; loff_t pos; long idx; + + cpumask_t started; }; +int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); int tracing_open_generic(struct inode *inode, struct file *filp); @@ -321,8 +412,17 @@ void trace_function(struct trace_array *tr, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_return(struct ftrace_graph_ret *trace); +int trace_graph_entry(struct ftrace_graph_ent *trace); +void trace_bts(struct trace_array *tr, + unsigned long from, + unsigned long to); + void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); +void tracing_sched_switch_assign_trace(struct trace_array *tr); +void tracing_stop_sched_switch_record(void); +void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); @@ -358,6 +458,7 @@ struct tracer_switch_ops { struct tracer_switch_ops *next; }; +char *trace_find_cmdline(int pid); #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE @@ -383,19 +484,79 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_branch(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter); + +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); -extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; +/* Standard output formatting function used for function return traces */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern enum print_line_t print_graph_function(struct trace_iterator *iter); + +#ifdef CONFIG_DYNAMIC_FTRACE +/* TODO: make this variable */ +#define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_count; +extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; + +static inline int ftrace_graph_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_count || test_tsk_trace_graph(current)) + return 1; + + for (i = 0; i < ftrace_graph_count; i++) { + if (addr == ftrace_graph_funcs[i]) + return 1; + } + + return 0; +} +#else +static inline int ftrace_trace_addr(unsigned long addr) +{ + return 1; +} +static inline int ftrace_graph_addr(unsigned long addr) +{ + return 1; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ +static inline enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +extern struct pid *ftrace_pid_trace; + +static inline int ftrace_trace_task(struct task_struct *task) +{ + if (!ftrace_pid_trace) + return 1; + + return test_tsk_trace_trace(task); +} + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -415,8 +576,92 @@ enum trace_iterator_flags { TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, + TRACE_ITER_PREEMPTONLY = 0x800, + TRACE_ITER_BRANCH = 0x1000, + TRACE_ITER_ANNOTATE = 0x2000, + TRACE_ITER_USERSTACKTRACE = 0x4000, + TRACE_ITER_SYM_USEROBJ = 0x8000 }; +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. + */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + extern struct tracer nop_trace; +/** + * ftrace_preempt_disable - disable preemption scheduler safe + * + * When tracing can happen inside the scheduler, there exists + * cases that the tracing might happen before the need_resched + * flag is checked. If this happens and the tracer calls + * preempt_enable (after a disable), a schedule might take place + * causing an infinite recursion. + * + * To prevent this, we read the need_recshed flag before + * disabling preemption. When we want to enable preemption we + * check the flag, if it is set, then we call preempt_enable_no_resched. + * Otherwise, we call preempt_enable. + * + * The rational for doing the above is that if need resched is set + * and we have yet to reschedule, we are either in an atomic location + * (where we do not need to check for scheduling) or we are inside + * the scheduler and do not want to resched. + */ +static inline int ftrace_preempt_disable(void) +{ + int resched; + + resched = need_resched(); + preempt_disable_notrace(); + + return resched; +} + +/** + * ftrace_preempt_enable - enable preemption scheduler safe + * @resched: the return value from ftrace_preempt_disable + * + * This is a scheduler safe way to enable preemption and not miss + * any preemption checks. The disabled saved the state of preemption. + * If resched is set, then we were either inside an atomic or + * are inside the scheduler (we would have already scheduled + * otherwise). In this case, we do not want to call normal + * preempt_enable, but preempt_enable_no_resched instead. + */ +static inline void ftrace_preempt_enable(int resched) +{ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +#ifdef CONFIG_BRANCH_TRACER +extern int enable_branch_tracing(struct trace_array *tr); +extern void disable_branch_tracing(void); +static inline int trace_branch_enable(struct trace_array *tr) +{ + if (trace_flags & TRACE_ITER_BRANCH) + return enable_branch_tracing(tr); + return 0; +} +static inline void trace_branch_disable(void) +{ + /* due to races, always disable */ + disable_branch_tracing(); +} +#else +static inline int trace_branch_enable(struct trace_array *tr) +{ + return 0; +} +static inline void trace_branch_disable(void) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d0a5e50eeff..a4fa2c57e34 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -13,73 +13,117 @@ #include "trace.h" static struct trace_array *boot_trace; -static int trace_boot_enabled; +static bool pre_initcalls_finished; - -/* Should be started after do_pre_smp_initcalls() in init/main.c */ +/* Tells the boot tracer that the pre_smp_initcalls are finished. + * So we are ready . + * It doesn't enable sched events tracing however. + * You have to call enable_boot_trace to do so. + */ void start_boot_trace(void) { - trace_boot_enabled = 1; + pre_initcalls_finished = true; } -void stop_boot_trace(void) +void enable_boot_trace(void) { - trace_boot_enabled = 0; + if (pre_initcalls_finished) + tracing_start_sched_switch_record(); } -void reset_boot_trace(struct trace_array *tr) +void disable_boot_trace(void) { - stop_boot_trace(); + if (pre_initcalls_finished) + tracing_stop_sched_switch_record(); } -static void boot_trace_init(struct trace_array *tr) +static void reset_boot_trace(struct trace_array *tr) { int cpu; - boot_trace = tr; - trace_boot_enabled = 0; + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + +static int boot_trace_init(struct trace_array *tr) +{ + int cpu; + boot_trace = tr; for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); + + tracing_sched_switch_assign_trace(tr); + return 0; } -static void boot_trace_ctrl_update(struct trace_array *tr) +static enum print_line_t +initcall_call_print_line(struct trace_iterator *iter) { - if (tr->ctrl) - start_boot_trace(); + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct trace_boot_call *field; + struct boot_trace_call *call; + u64 ts; + unsigned long nsec_rem; + int ret; + + trace_assign_type(field, entry); + call = &field->boot_call; + ts = iter->ts; + nsec_rem = do_div(ts, 1000000000); + + ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", + (unsigned long)ts, nsec_rem, call->func, call->caller); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; else - stop_boot_trace(); + return TRACE_TYPE_HANDLED; } -static enum print_line_t initcall_print_line(struct trace_iterator *iter) +static enum print_line_t +initcall_ret_print_line(struct trace_iterator *iter) { - int ret; struct trace_entry *entry = iter->ent; - struct trace_boot *field = (struct trace_boot *)entry; - struct boot_trace *it = &field->initcall; struct trace_seq *s = &iter->seq; - struct timespec calltime = ktime_to_timespec(it->calltime); - struct timespec rettime = ktime_to_timespec(it->rettime); - - if (entry->type == TRACE_BOOT) { - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - calltime.tv_sec, - calltime.tv_nsec, - it->func, it->caller); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %lld msecs\n", - rettime.tv_sec, - rettime.tv_nsec, - it->func, it->result, it->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + struct trace_boot_ret *field; + struct boot_trace_ret *init_ret; + u64 ts; + unsigned long nsec_rem; + int ret; + + trace_assign_type(field, entry); + init_ret = &field->boot_ret; + ts = iter->ts; + nsec_rem = do_div(ts, 1000000000); + + ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " + "returned %d after %llu msecs\n", + (unsigned long) ts, + nsec_rem, + init_ret->func, init_ret->result, init_ret->duration); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + else return TRACE_TYPE_HANDLED; +} + +static enum print_line_t initcall_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_BOOT_CALL: + return initcall_call_print_line(iter); + case TRACE_BOOT_RET: + return initcall_ret_print_line(iter); + default: + return TRACE_TYPE_UNHANDLED; } - return TRACE_TYPE_UNHANDLED; } struct tracer boot_tracer __read_mostly = @@ -87,27 +131,53 @@ struct tracer boot_tracer __read_mostly = .name = "initcall", .init = boot_trace_init, .reset = reset_boot_trace, - .ctrl_update = boot_trace_ctrl_update, .print_line = initcall_print_line, }; -void trace_boot(struct boot_trace *it, initcall_t fn) +void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; - struct trace_boot *entry; - struct trace_array_cpu *data; + struct trace_boot_call *entry; unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!trace_boot_enabled) + if (!pre_initcalls_finished) return; /* Get its name now since this function could * disappear because it is in the .init section. */ - sprint_symbol(it->func, (unsigned long)fn); + sprint_symbol(bt->func, (unsigned long)fn); + preempt_disable(); + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_BOOT_CALL; + entry->boot_call = *bt; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} + +void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) +{ + struct ring_buffer_event *event; + struct trace_boot_ret *entry; + unsigned long irq_flags; + struct trace_array *tr = boot_trace; + + if (!pre_initcalls_finished) + return; + + sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - data = tr->data[smp_processor_id()]; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); @@ -115,8 +185,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_BOOT; - entry->initcall = *it; + entry->ent.type = TRACE_BOOT_RET; + entry->boot_ret = *bt; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c new file mode 100644 index 00000000000..6c00feb3bac --- /dev/null +++ b/kernel/trace/trace_branch.c @@ -0,0 +1,342 @@ +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/irqflags.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/hash.h> +#include <linux/fs.h> +#include <asm/local.h> +#include "trace.h" + +#ifdef CONFIG_BRANCH_TRACER + +static int branch_tracing_enabled __read_mostly; +static DEFINE_MUTEX(branch_tracing_mutex); +static struct trace_array *branch_tracer; + +static void +probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + struct trace_array *tr = branch_tracer; + struct ring_buffer_event *event; + struct trace_branch *entry; + unsigned long flags, irq_flags; + int cpu, pc; + const char *p; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. + */ + + if (unlikely(!tr)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + + pc = preempt_count(); + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_BRANCH; + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); + while (p >= f->file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->line = f->line; + entry->correct = val == expect; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + if (!branch_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_branch_tracing(struct trace_array *tr) +{ + int ret = 0; + + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; + /* + * Must be seen before enabling. The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); + + return ret; +} + +void disable_branch_tracing(void) +{ + mutex_lock(&branch_tracing_mutex); + + if (!branch_tracing_enabled) + goto out_unlock; + + branch_tracing_enabled--; + + out_unlock: + mutex_unlock(&branch_tracing_mutex); +} + +static void start_branch_trace(struct trace_array *tr) +{ + enable_branch_tracing(tr); +} + +static void stop_branch_trace(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static int branch_trace_init(struct trace_array *tr) +{ + int cpu; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + start_branch_trace(tr); + return 0; +} + +static void branch_trace_reset(struct trace_array *tr) +{ + stop_branch_trace(tr); +} + +struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif +}; + +__init static int init_branch_trace(void) +{ + return register_tracer(&branch_trace); +} + +device_initcall(init_branch_trace); +#else +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +{ + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + + /* FIXME: Make this atomic! */ + if (val == expect) + f->correct++; + else + f->incorrect++; +} +EXPORT_SYMBOL(ftrace_likely_update); + +struct ftrace_pointer { + void *start; + void *stop; + int hit; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + const struct ftrace_pointer *f = m->private; + struct ftrace_branch_data *p = v; + + (*pos)++; + + if (v == (void *)1) + return f->start; + + ++p; + + if ((void *)p >= (void *)f->stop) + return NULL; + + return p; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + void *t = (void *)1; + loff_t l = 0; + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + const struct ftrace_pointer *fp = m->private; + struct ftrace_branch_data *p = v; + const char *f; + long percent; + + if (v == (void *)1) { + if (fp->hit) + seq_printf(m, " miss hit %% "); + else + seq_printf(m, " correct incorrect %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; + } + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + f++; + + /* + * The miss is overlayed on correct, and hit on incorrect. + */ + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 100 : -1; + + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + if (percent < 0) + seq_printf(m, " X "); + else + seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); + return 0; +} + +static struct seq_operations tracing_likely_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int tracing_branch_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &tracing_likely_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = (void *)inode->i_private; + } + + return ret; +} + +static const struct file_operations tracing_branch_fops = { + .open = tracing_branch_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +#ifdef CONFIG_PROFILE_ALL_BRANCHES +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; + +static const struct ftrace_pointer ftrace_branch_pos = { + .start = __start_branch_profile, + .stop = __stop_branch_profile, + .hit = 1, +}; + +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; + +static const struct ftrace_pointer ftrace_annotated_branch_pos = { + .start = __start_annotated_branch_profile, + .stop = __stop_annotated_branch_profile, +}; + +static __init int ftrace_branch_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, + (void *)&ftrace_annotated_branch_pos, + &tracing_branch_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'profile_annotatet_branch' entry\n"); + +#ifdef CONFIG_PROFILE_ALL_BRANCHES + entry = debugfs_create_file("profile_branch", 0444, d_tracer, + (void *)&ftrace_branch_pos, + &tracing_branch_fops); + if (!entry) + pr_warning("Could not create debugfs" + " 'profile_branch' entry\n"); +#endif + + return 0; +} + +device_initcall(ftrace_branch_init); diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c new file mode 100644 index 00000000000..23b76e4690e --- /dev/null +++ b/kernel/trace/trace_bts.c @@ -0,0 +1,276 @@ +/* + * BTS tracer + * + * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com> + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/kallsyms.h> + +#include <asm/ds.h> + +#include "trace.h" + + +#define SIZEOF_BTS (1 << 13) + +static DEFINE_PER_CPU(struct bts_tracer *, tracer); +static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + +#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_buffer per_cpu(buffer, smp_processor_id()) + + +/* + * Information to interpret a BTS record. + * This will go into an in-kernel BTS interface. + */ +static unsigned char sizeof_field; +static unsigned long debugctl_mask; + +#define sizeof_bts (3 * sizeof_field) + +static void bts_trace_cpuinit(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { + case 0x0 ... 0xC: + break; + case 0xD: + case 0xE: /* Pentium M */ + sizeof_field = sizeof(long); + debugctl_mask = (1<<6)|(1<<7); + break; + default: + sizeof_field = 8; + debugctl_mask = (1<<6)|(1<<7); + break; + } + break; + case 0xF: + switch (c->x86_model) { + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + sizeof_field = sizeof(long); + debugctl_mask = (1<<2)|(1<<3); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} + +static inline void bts_enable(void) +{ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask); +} + +static inline void bts_disable(void) +{ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask); +} + +static void bts_trace_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + +static void bts_trace_start_cpu(void *arg) +{ + this_tracer = + ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, + /* ovfl = */ NULL, /* th = */ (size_t)-1); + if (IS_ERR(this_tracer)) { + this_tracer = NULL; + return; + } + + bts_enable(); +} + +static void bts_trace_start(struct trace_array *tr) +{ + int cpu; + + bts_trace_reset(tr); + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +} + +static void bts_trace_stop_cpu(void *arg) +{ + if (this_tracer) { + bts_disable(); + + ds_release_bts(this_tracer); + this_tracer = NULL; + } +} + +static void bts_trace_stop(struct trace_array *tr) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); +} + +static int bts_trace_init(struct trace_array *tr) +{ + bts_trace_cpuinit(&boot_cpu_data); + bts_trace_reset(tr); + bts_trace_start(tr); + + return 0; +} + +static void bts_trace_print_header(struct seq_file *m) +{ +#ifdef __i386__ + seq_puts(m, "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, "# | | | |\n"); +#else + seq_puts(m, + "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, + "# | | | |\n"); +#endif +} + +static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *seq = &iter->seq; + struct bts_entry *it; + + trace_assign_type(it, entry); + + if (entry->type == TRACE_BTS) { + int ret; +#ifdef CONFIG_KALLSYMS + char function[KSYM_SYMBOL_LEN]; + sprint_symbol(function, it->from); +#else + char *function = "<unknown>"; +#endif + + ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n", + entry->cpu, it->from, it->to, function); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to) +{ + struct ring_buffer_event *event; + struct bts_entry *entry; + unsigned long irq; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_BTS; + entry->ent.cpu = smp_processor_id(); + entry->from = from; + entry->to = to; + ring_buffer_unlock_commit(tr->buffer, event, irq); +} + +static void trace_bts_at(struct trace_array *tr, size_t index) +{ + const void *raw = NULL; + unsigned long from, to; + int err; + + err = ds_access_bts(this_tracer, index, &raw); + if (err < 0) + return; + + from = *(const unsigned long *)raw; + to = *(const unsigned long *)((const char *)raw + sizeof_field); + + trace_bts(tr, from, to); +} + +static void trace_bts_cpu(void *arg) +{ + struct trace_array *tr = (struct trace_array *) arg; + size_t index = 0, end = 0, i; + int err; + + if (!this_tracer) + return; + + bts_disable(); + + err = ds_get_bts_index(this_tracer, &index); + if (err < 0) + goto out; + + err = ds_get_bts_end(this_tracer, &end); + if (err < 0) + goto out; + + for (i = index; i < end; i++) + trace_bts_at(tr, i); + + for (i = 0; i < index; i++) + trace_bts_at(tr, i); + +out: + bts_enable(); +} + +static void trace_bts_prepare(struct trace_iterator *iter) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); +} + +struct tracer bts_tracer __read_mostly = +{ + .name = "bts", + .init = bts_trace_init, + .reset = bts_trace_stop, + .print_header = bts_trace_print_header, + .print_line = bts_trace_print_line, + .start = bts_trace_start, + .stop = bts_trace_stop, + .open = trace_bts_prepare +}; + +__init static int init_bts_trace(void) +{ + return register_tracer(&bts_tracer); +} +device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0f85a64003d..e74f6d0a321 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -42,24 +42,20 @@ static void stop_function_trace(struct trace_array *tr) tracing_stop_cmdline_record(); } -static void function_trace_init(struct trace_array *tr) +static int function_trace_init(struct trace_array *tr) { - if (tr->ctrl) - start_function_trace(tr); + start_function_trace(tr); + return 0; } static void function_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_function_trace(tr); + stop_function_trace(tr); } -static void function_trace_ctrl_update(struct trace_array *tr) +static void function_trace_start(struct trace_array *tr) { - if (tr->ctrl) - start_function_trace(tr); - else - stop_function_trace(tr); + function_reset(tr); } static struct tracer function_trace __read_mostly = @@ -67,7 +63,7 @@ static struct tracer function_trace __read_mostly = .name = "function", .init = function_trace_init, .reset = function_trace_reset, - .ctrl_update = function_trace_ctrl_update, + .start = function_trace_start, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 00000000000..af60eef4cbc --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,611 @@ +/* + * + * Function graph tracer. + * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/fs.h> + +#include "trace.h" + +#define TRACE_GRAPH_INDENT 2 + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 + +static struct tracer_opt trace_opts[] = { + /* Display overruns ? */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, + /* Display Overhead ? */ + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + /* Don't display overruns and proc by default */ + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, + .opts = trace_opts +}; + +/* pid on the last trace processed */ +static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; + +static int graph_trace_init(struct trace_array *tr) +{ + int cpu, ret; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); + if (ret) + return ret; + tracing_start_cmdline_record(); + + return 0; +} + +static void graph_trace_reset(struct trace_array *tr) +{ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(); +} + +static inline int log10_cpu(int nb) +{ + if (nb / 100) + return 3; + if (nb / 10) + return 2; + return 1; +} + +static enum print_line_t +print_graph_cpu(struct trace_seq *s, int cpu) +{ + int i; + int ret; + int log10_this = log10_cpu(cpu); + int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + + + /* + * Start with a space character - to make it stand out + * to the right a bit when trace output is pasted into + * email: + */ + ret = trace_seq_printf(s, " "); + + /* + * Tricky - we space the CPU field according to the max + * number of online CPUs. On a 2-cpu system it would take + * a maximum of 1 digit - on a 128 cpu system it would + * take up to 3 digits: + */ + for (i = 0; i < log10_all - log10_this; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + ret = trace_seq_printf(s, "%d) ", cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +#define TRACE_GRAPH_PROCINFO_LENGTH 14 + +static enum print_line_t +print_graph_proc(struct trace_seq *s, pid_t pid) +{ + int i; + int ret; + int len; + char comm[8]; + int spaces = 0; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; + + strncpy(comm, trace_find_cmdline(pid), 7); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + + /* 1 stands for the "-" character */ + len = strlen(comm) + strlen(pid_str) + 1; + + if (len < TRACE_GRAPH_PROCINFO_LENGTH) + spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; + + /* First spaces to align center */ + for (i = 0; i < spaces / 2; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "%s-%s", comm, pid_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Last spaces to align center */ + for (i = 0; i < spaces - (spaces / 2); i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + + +/* If the pid changed since the last trace, output this event */ +static enum print_line_t +verif_pid(struct trace_seq *s, pid_t pid, int cpu) +{ + pid_t prev_pid; + int ret; + + if (last_pid[cpu] != -1 && last_pid[cpu] == pid) + return TRACE_TYPE_HANDLED; + + prev_pid = last_pid[cpu]; + last_pid[cpu] = pid; + +/* + * Context-switch trace line: + + ------------------------------------------ + | 1) migration/0--1 => sshd-1755 + ------------------------------------------ + + */ + ret = trace_seq_printf(s, + " ------------------------------------------\n"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, prev_pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " => "); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, + "\n ------------------------------------------\n\n"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + return ret; +} + +static bool +trace_branch_is_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) +{ + struct ring_buffer_iter *ring_iter; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *next; + + ring_iter = iter->buffer_iter[iter->cpu]; + + if (!ring_iter) + return false; + + event = ring_buffer_iter_peek(ring_iter, NULL); + + if (!event) + return false; + + next = ring_buffer_event_data(event); + + if (next->ent.type != TRACE_GRAPH_RET) + return false; + + if (curr->ent.pid != next->ent.pid || + curr->graph_ent.func != next->ret.func) + return false; + + return true; +} + + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + unsigned long nsecs_rem = do_div(duration, 1000); + /* log10(ULONG_MAX) + '\0' */ + char msecs_str[21]; + char nsecs_str[5]; + int ret, len; + int i; + + sprintf(msecs_str, "%lu", (unsigned long) duration); + + /* Print msecs */ + ret = trace_seq_printf(s, msecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + len = strlen(msecs_str); + + /* Print nsecs (we don't want to exceed 7 numbers) */ + if (len < 7) { + snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + ret = trace_seq_printf(s, ".%s", nsecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + len += strlen(nsecs_str); + } + + ret = trace_seq_printf(s, " us "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Print remaining spaces to fit the row's width */ + for (i = len; i < 7; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "| "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + +} + +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s) +{ + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! "); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + + return trace_seq_printf(s, " "); +} + +/* Case of a leaf function on its call entry */ +static enum print_line_t +print_graph_entry_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, struct trace_seq *s) +{ + struct ftrace_graph_ret_entry *ret_entry; + struct ftrace_graph_ret *graph_ret; + struct ring_buffer_event *event; + struct ftrace_graph_ent *call; + unsigned long long duration; + int ret; + int i; + + event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + ret_entry = ring_buffer_event_data(event); + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; + + /* Overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "();\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, + struct trace_seq *s) +{ + int i; + int ret; + struct ftrace_graph_ent *call = &entry->graph_ent; + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No time */ + ret = trace_seq_printf(s, " | "); + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "() {\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter, int cpu) +{ + int ret; + struct trace_entry *ent = iter->ent; + + /* Pid */ + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + if (trace_branch_is_leaf(iter, field)) + return print_graph_entry_leaf(iter, field, s); + else + return print_graph_entry_nested(field, s); + +} + +static enum print_line_t +print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, + struct trace_entry *ent, int cpu) +{ + int i; + int ret; + unsigned long long duration = trace->rettime - trace->calltime; + + /* Pid */ + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Closing brace */ + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "}\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Overrun */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_comment(struct print_entry *trace, struct trace_seq *s, + struct trace_entry *ent, struct trace_iterator *iter) +{ + int i; + int ret; + + /* Pid */ + if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, iter->cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No time */ + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Indentation */ + if (trace->depth > 0) + for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* The comment */ + ret = trace_seq_printf(s, "/* %s", trace->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (ent->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + ret = trace_seq_printf(s, " */\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_GRAPH_ENT: { + struct ftrace_graph_ent_entry *field; + trace_assign_type(field, entry); + return print_graph_entry(field, s, iter, + iter->cpu); + } + case TRACE_GRAPH_RET: { + struct ftrace_graph_ret_entry *field; + trace_assign_type(field, entry); + return print_graph_return(&field->ret, s, entry, iter->cpu); + } + case TRACE_PRINT: { + struct print_entry *field; + trace_assign_type(field, entry); + return print_graph_comment(field, s, entry, iter); + } + default: + return TRACE_TYPE_UNHANDLED; + } +} + +static void print_graph_headers(struct seq_file *s) +{ + /* 1st line */ + seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, "CPU "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, "TASK/PID "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) + seq_printf(s, "OVERHEAD/"); + seq_printf(s, "DURATION FUNCTION CALLS\n"); + + /* 2nd line */ + seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, "| "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, "| | "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + seq_printf(s, "| "); + seq_printf(s, "| | | | |\n"); + } else + seq_printf(s, " | | | | |\n"); +} +static struct tracer graph_trace __read_mostly = { + .name = "function_graph", + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .print_header = print_graph_headers, + .flags = &tracer_flags, +}; + +static __init int init_graph_trace(void) +{ + return register_tracer(&graph_trace); +} + +device_initcall(init_graph_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9c74071c10e..7c2e326bbc8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ +/* + * save_tracer_enabled is used to save the state of the tracer_enabled + * variable when we disable it when we open a trace output file. + */ +static int save_tracer_enabled; + static void start_irqsoff_tracer(struct trace_array *tr) { register_ftrace_function(&trace_ops); - tracer_enabled = 1; + if (tracing_is_enabled()) { + tracer_enabled = 1; + save_tracer_enabled = 1; + } else { + tracer_enabled = 0; + save_tracer_enabled = 0; + } } static void stop_irqsoff_tracer(struct trace_array *tr) { tracer_enabled = 0; + save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); } @@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - - if (tr->ctrl) - start_irqsoff_tracer(tr); + start_irqsoff_tracer(tr); } static void irqsoff_tracer_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_irqsoff_tracer(tr); + stop_irqsoff_tracer(tr); } -static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +static void irqsoff_tracer_start(struct trace_array *tr) { - if (tr->ctrl) - start_irqsoff_tracer(tr); - else - stop_irqsoff_tracer(tr); + tracer_enabled = 1; + save_tracer_enabled = 1; +} + +static void irqsoff_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; + save_tracer_enabled = 0; } static void irqsoff_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_irqsoff_tracer(iter->tr); + tracer_enabled = 0; } static void irqsoff_tracer_close(struct trace_iterator *iter) { - if (iter->tr->ctrl) - start_irqsoff_tracer(iter->tr); + /* restart tracing */ + tracer_enabled = save_tracer_enabled; } #ifdef CONFIG_IRQSOFF_TRACER -static void irqsoff_tracer_init(struct trace_array *tr) +static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", .init = irqsoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly = #endif #ifdef CONFIG_PREEMPT_TRACER -static void preemptoff_tracer_init(struct trace_array *tr) +static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer preemptoff_tracer __read_mostly = @@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly = .name = "preemptoff", .init = preemptoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly = #if defined(CONFIG_IRQSOFF_TRACER) && \ defined(CONFIG_PREEMPT_TRACER) -static void preemptirqsoff_tracer_init(struct trace_array *tr) +static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer preemptirqsoff_tracer __read_mostly = @@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .name = "preemptirqsoff", .init = preemptirqsoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index e62cbf78eab..2fb6da6523b 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -32,34 +32,29 @@ static void mmio_reset_data(struct trace_array *tr) tracing_reset(tr, cpu); } -static void mmio_trace_init(struct trace_array *tr) +static int mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } + + mmio_reset_data(tr); + enable_mmiotrace(); + return 0; } static void mmio_trace_reset(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) - disable_mmiotrace(); + + disable_mmiotrace(); mmio_reset_data(tr); mmio_trace_array = NULL; } -static void mmio_trace_ctrl_update(struct trace_array *tr) +static void mmio_trace_start(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } else { - disable_mmiotrace(); - } + mmio_reset_data(tr); } static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) @@ -296,10 +291,10 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .start = mmio_trace_start, .pipe_open = mmio_pipe_open, .close = mmio_close, .read = mmio_read, - .ctrl_update = mmio_trace_ctrl_update, .print_line = mmio_print_line, }; @@ -371,5 +366,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map) int mmio_trace_printk(const char *fmt, va_list args) { - return trace_vprintk(0, fmt, args); + return trace_vprintk(0, -1, fmt, args); } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 4592b486251..b9767acd30a 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -12,6 +12,27 @@ #include "trace.h" +/* Our two options */ +enum { + TRACE_NOP_OPT_ACCEPT = 0x1, + TRACE_NOP_OPT_REFUSE = 0x2 +}; + +/* Options for the tracer (see trace_options file) */ +static struct tracer_opt nop_opts[] = { + /* Option that will be accepted by set_flag callback */ + { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) }, + /* Option that will be refused by set_flag callback */ + { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) }, + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags nop_flags = { + /* You can check your flags value here when you want. */ + .val = 0, /* By default: all flags disabled */ + .opts = nop_opts +}; + static struct trace_array *ctx_trace; static void start_nop_trace(struct trace_array *tr) @@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr) /* Nothing to do! */ } -static void nop_trace_init(struct trace_array *tr) +static int nop_trace_init(struct trace_array *tr) { int cpu; ctx_trace = tr; @@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) tracing_reset(tr, cpu); - if (tr->ctrl) - start_nop_trace(tr); + start_nop_trace(tr); + return 0; } static void nop_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_nop_trace(tr); + stop_nop_trace(tr); } -static void nop_trace_ctrl_update(struct trace_array *tr) +/* It only serves as a signal handler and a callback to + * accept or refuse tthe setting of a flag. + * If you don't implement it, then the flag setting will be + * automatically accepted. + */ +static int nop_set_flag(u32 old_flags, u32 bit, int set) { - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_nop_trace(tr); - else - stop_nop_trace(tr); + /* + * Note that you don't need to update nop_flags.val yourself. + * The tracing Api will do it automatically if you return 0 + */ + if (bit == TRACE_NOP_OPT_ACCEPT) { + printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept." + " Now cat trace_options to see the result\n", + set); + return 0; + } + + if (bit == TRACE_NOP_OPT_REFUSE) { + printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." + "Now cat trace_options to see the result\n", + set); + return -EINVAL; + } + + return 0; } + struct tracer nop_trace __read_mostly = { .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .ctrl_update = nop_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif + .flags = &nop_flags, + .set_flag = nop_set_flag }; diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c new file mode 100644 index 00000000000..a7172a352f6 --- /dev/null +++ b/kernel/trace/trace_power.c @@ -0,0 +1,179 @@ +/* + * ring buffer based C-state tracer + * + * Arjan van de Ven <arjan@linux.intel.com> + * Copyright (C) 2008 Intel Corporation + * + * Much is borrowed from trace_boot.c which is + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * + */ + +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/kallsyms.h> +#include <linux/module.h> + +#include "trace.h" + +static struct trace_array *power_trace; +static int __read_mostly trace_power_enabled; + + +static void start_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 1; +} + +static void stop_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 0; +} + + +static int power_trace_init(struct trace_array *tr) +{ + int cpu; + power_trace = tr; + + trace_power_enabled = 1; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + return 0; +} + +static enum print_line_t power_print_line(struct trace_iterator *iter) +{ + int ret = 0; + struct trace_entry *entry = iter->ent; + struct trace_power *field ; + struct power_trace *it; + struct trace_seq *s = &iter->seq; + struct timespec stamp; + struct timespec duration; + + trace_assign_type(field, entry); + it = &field->state_data; + stamp = ktime_to_timespec(it->stamp); + duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); + + if (entry->type == TRACE_POWER) { + if (it->type == POWER_CSTATE) + ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", + stamp.tv_sec, + stamp.tv_nsec, + it->state, iter->cpu, + duration.tv_sec, + duration.tv_nsec); + if (it->type == POWER_PSTATE) + ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", + stamp.tv_sec, + stamp.tv_nsec, + it->state, iter->cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer power_tracer __read_mostly = +{ + .name = "power", + .init = power_trace_init, + .start = start_power_trace, + .stop = stop_power_trace, + .reset = stop_power_trace, + .print_line = power_print_line, +}; + +static int init_power_trace(void) +{ + return register_tracer(&power_tracer); +} +device_initcall(init_power_trace); + +void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int level) +{ + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); +} +EXPORT_SYMBOL_GPL(trace_power_start); + + +void trace_power_end(struct power_trace *it) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + preempt_disable(); + it->end = ktime_get(); + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_POWER; + entry->state_data = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} +EXPORT_SYMBOL_GPL(trace_power_end); + +void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int level) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); + preempt_disable(); + it->end = it->stamp; + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_POWER; + entry->state_data = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} +EXPORT_SYMBOL_GPL(trace_power_mark); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index b8f56beb1a6..863390557b4 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,7 +16,8 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; -static atomic_t sched_ref; +static int sched_ref; +static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, @@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!atomic_read(&sched_ref)) + if (!sched_ref) return; tracing_record_cmdline(prev); @@ -123,20 +124,18 @@ static void tracing_sched_unregister(void) static void tracing_start_sched_switch(void) { - long ref; - - ref = atomic_inc_return(&sched_ref); - if (ref == 1) + mutex_lock(&sched_register_mutex); + if (!(sched_ref++)) tracing_sched_register(); + mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(void) { - long ref; - - ref = atomic_dec_and_test(&sched_ref); - if (ref) + mutex_lock(&sched_register_mutex); + if (!(--sched_ref)) tracing_sched_unregister(); + mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) @@ -149,40 +148,86 @@ void tracing_stop_cmdline_record(void) tracing_stop_sched_switch(); } +/** + * tracing_start_sched_switch_record - start tracing context switches + * + * Turns on context switch tracing for a tracer. + */ +void tracing_start_sched_switch_record(void) +{ + if (unlikely(!ctx_trace)) { + WARN_ON(1); + return; + } + + tracing_start_sched_switch(); + + mutex_lock(&sched_register_mutex); + tracer_enabled++; + mutex_unlock(&sched_register_mutex); +} + +/** + * tracing_stop_sched_switch_record - start tracing context switches + * + * Turns off context switch tracing for a tracer. + */ +void tracing_stop_sched_switch_record(void) +{ + mutex_lock(&sched_register_mutex); + tracer_enabled--; + WARN_ON(tracer_enabled < 0); + mutex_unlock(&sched_register_mutex); + + tracing_stop_sched_switch(); +} + +/** + * tracing_sched_switch_assign_trace - assign a trace array for ctx switch + * @tr: trace array pointer to assign + * + * Some tracers might want to record the context switches in their + * trace. This function lets those tracers assign the trace array + * to use. + */ +void tracing_sched_switch_assign_trace(struct trace_array *tr) +{ + ctx_trace = tr; +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); - tracing_start_cmdline_record(); - tracer_enabled = 1; + tracing_start_sched_switch_record(); } static void stop_sched_trace(struct trace_array *tr) { - tracer_enabled = 0; - tracing_stop_cmdline_record(); + tracing_stop_sched_switch_record(); } -static void sched_switch_trace_init(struct trace_array *tr) +static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; - - if (tr->ctrl) - start_sched_trace(tr); + start_sched_trace(tr); + return 0; } static void sched_switch_trace_reset(struct trace_array *tr) { - if (tr->ctrl) + if (sched_ref) stop_sched_trace(tr); } -static void sched_switch_trace_ctrl_update(struct trace_array *tr) +static void sched_switch_trace_start(struct trace_array *tr) { - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_sched_trace(tr); - else - stop_sched_trace(tr); + sched_switch_reset(tr); + tracing_start_sched_switch(); +} + +static void sched_switch_trace_stop(struct trace_array *tr) +{ + tracing_stop_sched_switch(); } static struct tracer sched_switch_trace __read_mostly = @@ -190,7 +235,8 @@ static struct tracer sched_switch_trace __read_mostly = .name = "sched_switch", .init = sched_switch_trace_init, .reset = sched_switch_trace_reset, - .ctrl_update = sched_switch_trace_ctrl_update, + .start = sched_switch_trace_start, + .stop = sched_switch_trace_stop, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sched_switch, #endif @@ -198,14 +244,6 @@ static struct tracer sched_switch_trace __read_mostly = __init static int init_sched_switch_trace(void) { - int ret = 0; - - if (atomic_read(&sched_ref)) - ret = tracing_sched_register(); - if (ret) { - pr_info("error registering scheduler trace\n"); - return ret; - } return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3ae93f16b56..0067b49746c 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); - /* - * To prevent recursion from the scheduler, if the - * resched flag was set before we entered, then - * don't reschedule. - */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = @@ -271,6 +262,12 @@ out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } +/* + * save_tracer_enabled is used to save the state of the tracer_enabled + * variable when we disable it when we open a trace output file. + */ +static int save_tracer_enabled; + static void start_wakeup_tracer(struct trace_array *tr) { int ret; @@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr) register_ftrace_function(&trace_ops); - tracer_enabled = 1; + if (tracing_is_enabled()) { + tracer_enabled = 1; + save_tracer_enabled = 1; + } else { + tracer_enabled = 0; + save_tracer_enabled = 0; + } return; fail_deprobe_wake_new: @@ -321,49 +324,53 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); unregister_trace_sched_wakeup(probe_wakeup); } -static void wakeup_tracer_init(struct trace_array *tr) +static int wakeup_tracer_init(struct trace_array *tr) { wakeup_trace = tr; - - if (tr->ctrl) - start_wakeup_tracer(tr); + start_wakeup_tracer(tr); + return 0; } static void wakeup_tracer_reset(struct trace_array *tr) { - if (tr->ctrl) { - stop_wakeup_tracer(tr); - /* make sure we put back any tasks we are tracing */ - wakeup_reset(tr); - } + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); +} + +static void wakeup_tracer_start(struct trace_array *tr) +{ + wakeup_reset(tr); + tracer_enabled = 1; + save_tracer_enabled = 1; } -static void wakeup_tracer_ctrl_update(struct trace_array *tr) +static void wakeup_tracer_stop(struct trace_array *tr) { - if (tr->ctrl) - start_wakeup_tracer(tr); - else - stop_wakeup_tracer(tr); + tracer_enabled = 0; + save_tracer_enabled = 0; } static void wakeup_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_wakeup_tracer(iter->tr); + tracer_enabled = 0; } static void wakeup_tracer_close(struct trace_iterator *iter) { /* forget about any processes we were recording */ - if (iter->tr->ctrl) - start_wakeup_tracer(iter->tr); + if (save_tracer_enabled) { + wakeup_reset(iter->tr); + tracer_enabled = 1; + } } static struct tracer wakeup_tracer __read_mostly = @@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly = .name = "wakeup", .init = wakeup_tracer_init, .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, .open = wakeup_tracer_open, .close = wakeup_tracer_close, - .ctrl_update = wakeup_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 90bc752a758..88c8eb70f54 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_STACK: case TRACE_PRINT: case TRACE_SPECIAL: + case TRACE_BRANCH: return 1; } return 0; @@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) int cpu, ret = 0; /* Don't allow flipping of max traces now */ - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&ftrace_max_lock); cnt = ring_buffer_entries(tr->buffer); @@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) break; } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); if (count) *count = cnt; @@ -70,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) return ret; } +static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) +{ + printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n", + trace->name, init_ret); +} #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -110,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } /* Sleep for a 1/10 of a second */ msleep(100); @@ -134,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); ftrace_enabled = 0; /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); /* we should only have one item */ if (!ret && count != 1) { @@ -148,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ret = -1; goto out; } + out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; @@ -180,18 +190,22 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 1; tracer_enabled = 1; - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); ftrace_enabled = 0; /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -223,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; /* disable interrupts for a bit */ @@ -232,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) udelay(100); local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) ret = trace_test_buffer(&max_tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -259,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) unsigned long count; int ret; + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; /* disable preemption for a bit */ @@ -269,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) udelay(100); preempt_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) ret = trace_test_buffer(&max_tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -296,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * unsigned long count; int ret; + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } /* reset the max latency */ tracing_max_latency = 0; @@ -312,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); - if (ret) + if (ret) { + tracing_start(); goto out; + } ret = trace_test_buffer(&max_tr, &count); - if (ret) + if (ret) { + tracing_start(); goto out; + } if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; + tracing_start(); goto out; } /* do the test by disabling interrupts first this time */ tracing_max_latency = 0; - tr->ctrl = 1; - trace->ctrl_update(tr); + tracing_start(); preempt_disable(); local_irq_disable(); udelay(100); @@ -341,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (ret) @@ -358,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * out: trace->reset(tr); + tracing_start(); tracing_max_latency = save_max; return ret; @@ -423,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wait_for_completion(&isrt); /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; @@ -448,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) @@ -457,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) trace->reset(tr); + tracing_start(); tracing_max_latency = save_max; @@ -480,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr int ret; /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -508,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return 0; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); return ret; } #endif /* CONFIG_SYSPROF_TRACER */ + +#ifdef CONFIG_BRANCH_TRACER +int +trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + return ret; +} +#endif /* CONFIG_BRANCH_TRACER */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3bdb44bde4b..0b863f2cbc8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -48,7 +48,7 @@ static inline void check_stack(void) if (!object_is_on_stack(&this_size)) return; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); /* a race could have already updated it */ @@ -78,6 +78,7 @@ static inline void check_stack(void) * on a new max, so it is far from a fast path. */ while (i < max_stack_trace.nr_entries) { + int found = 0; stack_dump_index[i] = this_size; p = start; @@ -86,17 +87,19 @@ static inline void check_stack(void) if (*p == stack_dump_trace[i]) { this_size = stack_dump_index[i++] = (top - p) * sizeof(unsigned long); + found = 1; /* Start the search from here */ start = p + 1; } } - i++; + if (!found) + i++; } out: __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static void @@ -107,8 +110,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -120,10 +122,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = @@ -166,11 +165,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); *ptr = val; __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); return count; } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 9587d3bcba5..54960edb96d 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -261,27 +261,17 @@ static void stop_stack_trace(struct trace_array *tr) mutex_unlock(&sample_timer_lock); } -static void stack_trace_init(struct trace_array *tr) +static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; - if (tr->ctrl) - start_stack_trace(tr); + start_stack_trace(tr); + return 0; } static void stack_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_stack_trace(tr); -} - -static void stack_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_stack_trace(tr); - else - stop_stack_trace(tr); + stop_stack_trace(tr); } static struct tracer stack_trace __read_mostly = @@ -289,7 +279,6 @@ static struct tracer stack_trace __read_mostly = .name = "sysprof", .init = stack_trace_init, .reset = stack_trace_reset, - .ctrl_update = stack_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sysprof, #endif diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index af8c8566488..79602740bbb 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex); */ #define TRACEPOINT_HASH_BITS 6 #define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : @@ -54,40 +55,43 @@ struct tracepoint_entry { struct hlist_node hlist; void **funcs; int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - unsigned char rcu_pending:1; char name[0]; }; -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; +struct tp_probes { + union { + struct rcu_head rcu; + struct list_head list; + } u; + void *probes[0]; +}; -static void free_old_closure(struct rcu_head *head) +static inline void *allocate_probes(int count) { - struct tracepoint_entry *entry = container_of(head, - struct tracepoint_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; + struct tp_probes *p = kmalloc(count * sizeof(void *) + + sizeof(struct tp_probes), GFP_KERNEL); + return p == NULL ? NULL : p->probes; } -static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +static void rcu_free_old_probes(struct rcu_head *head) { - if (!old) - return; - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); + kfree(container_of(head, struct tp_probes, u.rcu)); +} + +static inline void release_probes(void *old) +{ + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); + } } static void debug_print_probes(struct tracepoint_entry *entry) { int i; - if (!tracepoint_debug) + if (!tracepoint_debug || !entry->funcs) return; for (i = 0; entry->funcs[i]; i++) @@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); if (old) memcpy(new, old, nr_probes * sizeof(void *)); new[nr_probes] = probe; + new[nr_probes + 1] = NULL; entry->refcount = nr_probes + 1; entry->funcs = new; debug_print_probes(entry); @@ -132,7 +137,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; if (!old) - return NULL; + return ERR_PTR(-ENOENT); debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ @@ -151,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) int j = 0; /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes - nr_del + 1); if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i]; i++) if ((probe && old[i] != probe)) new[j++] = old[i]; + new[nr_probes - nr_del] = NULL; entry->refcount = nr_probes - nr_del; entry->funcs = new; } @@ -215,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name) memcpy(&e->name[0], name, name_len); e->funcs = NULL; e->refcount = 0; - e->rcu_pending = 0; hlist_add_head(&e->hlist, head); return e; } @@ -224,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name) * Remove the tracepoint from the tracepoint hash table. Must be called with * mutex_lock held. */ -static int remove_tracepoint(const char *name) +static inline void remove_tracepoint(struct tracepoint_entry *e) { - struct hlist_head *head; - struct hlist_node *node; - struct tracepoint_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->refcount) - return -EBUSY; hlist_del(&e->hlist); - /* Make sure the call_rcu_sched has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); kfree(e); - return 0; } /* @@ -280,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry, static void disable_tracepoint(struct tracepoint *elem) { elem->state = 0; + rcu_assign_pointer(elem->funcs, NULL); } /** @@ -320,6 +303,23 @@ static void tracepoint_update_probes(void) module_update_tracepoints(); } +static void *tracepoint_add_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) + return entry; + } + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old) && !entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_register - Connect a probe to a tracepoint * @name: tracepoint name @@ -330,44 +330,36 @@ static void tracepoint_update_probes(void) */ int tracepoint_probe_register(const char *name, void *probe) { - struct tracepoint_entry *entry; - int ret = 0; void *old; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) { - ret = PTR_ERR(entry); - goto end; - } - } - /* - * If we detect that a call_rcu_sched is pending for this tracepoint, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); - old = tracepoint_entry_add_probe(entry, probe); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } + old = tracepoint_add_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ - mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - WARN_ON(!entry); - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); -end: - mutex_unlock(&tracepoints_mutex); - return ret; + release_probes(old); + return 0; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); +static void *tracepoint_remove_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) + return ERR_PTR(-ENOENT); + old = tracepoint_entry_remove_probe(entry, probe); + if (IS_ERR(old)) + return old; + if (!entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @name: tracepoint name @@ -380,38 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); */ int tracepoint_probe_unregister(const char *name, void *probe) { - struct tracepoint_entry *entry; void *old; - int ret = -ENOENT; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - old = tracepoint_entry_remove_probe(entry, probe); - if (!old) { - printk(KERN_WARNING "Warning: Trying to unregister a probe" - "that doesn't exist\n"); - goto end; - } + old = tracepoint_remove_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ + release_probes(old); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); + +static LIST_HEAD(old_probes); +static int need_update; + +static void tracepoint_add_old_probes(void *old) +{ + need_update = 1; + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + list_add(&tp_probes->u.list, &old_probes); + } +} + +/** + * tracepoint_probe_register_noupdate - register a probe but not connect + * @name: tracepoint name + * @probe: probe handler + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_register_noupdate(const char *name, void *probe) +{ + void *old; + mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); - remove_tracepoint(name); /* Ignore busy error message */ - ret = 0; -end: + old = tracepoint_add_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); mutex_unlock(&tracepoints_mutex); - return ret; + return 0; } -EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); + +/** + * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect + * @name: tracepoint name + * @probe: probe function pointer + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_unregister_noupdate(const char *name, void *probe) +{ + void *old; + + mutex_lock(&tracepoints_mutex); + old = tracepoint_remove_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); + mutex_unlock(&tracepoints_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); + +/** + * tracepoint_probe_update_all - update tracepoints + */ +void tracepoint_probe_update_all(void) +{ + LIST_HEAD(release_probes); + struct tp_probes *pos, *next; + + mutex_lock(&tracepoints_mutex); + if (!need_update) { + mutex_unlock(&tracepoints_mutex); + return; + } + if (!list_empty(&old_probes)) + list_replace_init(&old_probes, &release_probes); + need_update = 0; + mutex_unlock(&tracepoints_mutex); + + tracepoint_update_probes(); + list_for_each_entry_safe(pos, next, &release_probes, u.list) { + list_del(&pos->u.list); + call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); + } +} +EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); /** * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. @@ -483,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) iter->tracepoint = NULL; } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); + +#ifdef CONFIG_MODULES + +int tracepoint_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + break; + case MODULE_STATE_GOING: + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + break; + } + return 0; +} + +struct notifier_block tracepoint_module_nb = { + .notifier_call = tracepoint_module_notify, + .priority = 0, +}; + +static int init_tracepoints(void) +{ + return register_module_notifier(&tracepoint_module_nb); +} +__initcall(init_tracepoints); + +#endif /* CONFIG_MODULES */ diff --git a/kernel/user.c b/kernel/user.c index 39d6159fae4..cec2224bc9f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up) if (IS_ERR(up->tg)) rc = -ENOMEM; + set_tg_uid(up); + return rc; } diff --git a/lib/Kconfig b/lib/Kconfig index 85cf7ea978a..7823f8342ab 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -157,4 +157,11 @@ config CHECK_SIGNATURE config HAVE_LMB boolean +config CPUMASK_OFFSTACK + bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + help + Use dynamic allocation for cpumask_var_t, instead of putting + them on the stack. This is a bit more expensive, but avoids + stack overflow. + endmenu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b0f239e443b..1e3fd3e3436 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -545,6 +545,16 @@ config DEBUG_SG If unsure, say N. +config DEBUG_NOTIFIERS + bool "Debug notifier call chains" + depends on DEBUG_KERNEL + help + Enable this to turn on sanity checking for notifier call chains. + This is most useful for kernel developers to make sure that + modules properly unregister themselves from notifier chains. + This is a relatively cheap check but if you care about maximum + performance, say N. + config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && \ diff --git a/mm/bounce.c b/mm/bounce.c index 06722c40305..bf0cf7c8387 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,6 +14,7 @@ #include <linux/hash.h> #include <linux/highmem.h> #include <linux/blktrace_api.h> +#include <trace/block.h> #include <asm/tlbflush.h> #define POOL_SIZE 64 @@ -21,6 +22,8 @@ static mempool_t *page_pool, *isa_page_pool; +DEFINE_TRACE(block_bio_bounce); + #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { @@ -222,7 +225,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (!bio) return; - blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + trace_block_bio_bounce(q, *bio_orig); /* * at least one page was bounced, fill in possible non-highmem diff --git a/mm/memory.c b/mm/memory.c index 164951c4730..fc031d68327 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3049,3 +3049,18 @@ void print_vma_addr(char *prefix, unsigned long ip) } up_read(¤t->mm->mmap_sem); } + +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void) +{ + might_sleep(); + /* + * it would be nicer only to annotate paths which are not under + * pagefault_disable, however that requires a larger audit and + * providing helpers like get_user_atomic. + */ + if (!in_atomic() && current->mm) + might_lock_read(¤t->mm->mmap_sem); +} +EXPORT_SYMBOL(might_fault); +#endif diff --git a/mm/slub.c b/mm/slub.c index a2cd47d89e0..8e516e29f98 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3626,7 +3626,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len < PAGE_SIZE - 60) { len += sprintf(buf + len, " cpus="); len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->cpus); + &l->cpus); } if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h index 0216b55bd64..01724e04c55 100644 --- a/samples/tracepoints/tp-samples-trace.h +++ b/samples/tracepoints/tp-samples-trace.h @@ -4,10 +4,10 @@ #include <linux/proc_fs.h> /* for struct inode and struct file */ #include <linux/tracepoint.h> -DEFINE_TRACE(subsys_event, +DECLARE_TRACE(subsys_event, TPPROTO(struct inode *inode, struct file *file), TPARGS(inode, file)); -DEFINE_TRACE(subsys_eventb, +DECLARE_TRACE(subsys_eventb, TPPROTO(void), TPARGS()); #endif diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c index 55abfdda4bd..e3a964889dc 100644 --- a/samples/tracepoints/tracepoint-probe-sample.c +++ b/samples/tracepoints/tracepoint-probe-sample.c @@ -46,6 +46,7 @@ void __exit tp_sample_trace_exit(void) { unregister_trace_subsys_eventb(probe_subsys_eventb); unregister_trace_subsys_event(probe_subsys_event); + tracepoint_synchronize_unregister(); } module_exit(tp_sample_trace_exit); diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c index 5e9fcf4afff..685a5acb456 100644 --- a/samples/tracepoints/tracepoint-probe-sample2.c +++ b/samples/tracepoints/tracepoint-probe-sample2.c @@ -33,6 +33,7 @@ module_init(tp_sample_trace_init); void __exit tp_sample_trace_exit(void) { unregister_trace_subsys_event(probe_subsys_event); + tracepoint_synchronize_unregister(); } module_exit(tp_sample_trace_exit); diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c index 4ae4b7fcc04..00d169792a3 100644 --- a/samples/tracepoints/tracepoint-sample.c +++ b/samples/tracepoints/tracepoint-sample.c @@ -13,6 +13,9 @@ #include <linux/proc_fs.h> #include "tp-samples-trace.h" +DEFINE_TRACE(subsys_event); +DEFINE_TRACE(subsys_eventb); + struct proc_dir_entry *pentry_example; static int my_open(struct inode *inode, struct file *file) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 468fbc9016c..7a176773af8 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -198,16 +198,10 @@ cmd_modversions = \ fi; endif -ifdef CONFIG_64BIT -arch_bits = 64 -else -arch_bits = 32 -endif - ifdef CONFIG_FTRACE_MCOUNT_RECORD -cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl \ - "$(ARCH)" "$(arch_bits)" "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" \ - "$(NM)" "$(RM)" "$(MV)" "$(@)"; +cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ + "$(if $(CONFIG_64BIT),64,32)" \ + "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)"; endif define rule_cc_o_c diff --git a/scripts/bootgraph.pl b/scripts/bootgraph.pl index d2c61efc216..f0af9aa9b24 100644 --- a/scripts/bootgraph.pl +++ b/scripts/bootgraph.pl @@ -78,11 +78,13 @@ while (<>) { } if ($count == 0) { - print "No data found in the dmesg. Make sure that 'printk.time=1' and\n"; - print "'initcall_debug' are passed on the kernel command line.\n\n"; - print "Usage: \n"; - print " dmesg | perl scripts/bootgraph.pl > output.svg\n\n"; - exit; + print STDERR <<END; +No data found in the dmesg. Make sure that 'printk.time=1' and +'initcall_debug' are passed on the kernel command line. +Usage: + dmesg | perl scripts/bootgraph.pl > output.svg +END + exit 1; } print "<?xml version=\"1.0\" standalone=\"no\"?> \n"; @@ -109,8 +111,8 @@ my $stylecounter = 0; my %rows; my $rowscount = 1; my @initcalls = sort { $start{$a} <=> $start{$b} } keys(%start); -my $key; -foreach $key (@initcalls) { + +foreach my $key (@initcalls) { my $duration = $end{$key} - $start{$key}; if ($duration >= $threshold) { diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 6b9fe3eb836..0b1dc9f9bb0 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -112,6 +112,8 @@ my ($arch, $bits, $objdump, $objcopy, $cc, # Acceptable sections to record. my %text_sections = ( ".text" => 1, + ".sched.text" => 1, + ".spinlock.text" => 1, ); $objdump = "objdump" if ((length $objdump) == 0); @@ -130,10 +132,13 @@ my %weak; # List of weak functions my %convert; # List of local functions used that needs conversion my $type; +my $nm_regex; # Find the local functions (return function) my $section_regex; # Find the start of a section my $function_regex; # Find the name of a function # (return offset and func name) my $mcount_regex; # Find the call site to mcount (return offset) +my $alignment; # The .align value to use for $mcount_section +my $section_type; # Section header plus possible alignment command if ($arch eq "x86") { if ($bits == 64) { @@ -143,11 +148,21 @@ if ($arch eq "x86") { } } +# +# We base the defaults off of i386, the other archs may +# feel free to change them in the below if statements. +# +$nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; +$section_regex = "Disassembly of section\\s+(\\S+):"; +$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; +$mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; +$section_type = '@progbits'; +$type = ".long"; + if ($arch eq "x86_64") { - $section_regex = "Disassembly of section\\s+(\\S+):"; - $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; $type = ".quad"; + $alignment = 8; # force flags for this arch $ld .= " -m elf_x86_64"; @@ -156,10 +171,7 @@ if ($arch eq "x86_64") { $cc .= " -m64"; } elsif ($arch eq "i386") { - $section_regex = "Disassembly of section\\s+(\\S+):"; - $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; - $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; - $type = ".long"; + $alignment = 4; # force flags for this arch $ld .= " -m elf_i386"; @@ -167,6 +179,27 @@ if ($arch eq "x86_64") { $objcopy .= " -O elf32-i386"; $cc .= " -m32"; +} elsif ($arch eq "sh") { + $alignment = 2; + + # force flags for this arch + $ld .= " -m shlelf_linux"; + $objcopy .= " -O elf32-sh-linux"; + $cc .= " -m32"; + +} elsif ($arch eq "powerpc") { + $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)"; + $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$"; + + if ($bits == 64) { + $type = ".quad"; + } + +} elsif ($arch eq "arm") { + $alignment = 2; + $section_type = '%progbits'; + } else { die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; } @@ -236,7 +269,7 @@ if (!$found_version) { # open (IN, "$nm $inputfile|") || die "error running $nm"; while (<IN>) { - if (/^[0-9a-fA-F]+\s+t\s+(\S+)/) { + if (/$nm_regex/) { $locals{$1} = 1; } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) { $weak{$2} = $1; @@ -287,7 +320,8 @@ sub update_funcs if (!$opened) { open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; $opened = 1; - print FILE "\t.section $mcount_section,\"a\",\@progbits\n"; + print FILE "\t.section $mcount_section,\"a\",$section_type\n"; + print FILE "\t.align $alignment\n" if (defined($alignment)); } printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset; } diff --git a/scripts/trace/power.pl b/scripts/trace/power.pl new file mode 100644 index 00000000000..4f729b3501e --- /dev/null +++ b/scripts/trace/power.pl @@ -0,0 +1,108 @@ +#!/usr/bin/perl + +# Copyright 2008, Intel Corporation +# +# This file is part of the Linux kernel +# +# This program file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program in a file named COPYING; if not, write to the +# Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, +# Boston, MA 02110-1301 USA +# +# Authors: +# Arjan van de Ven <arjan@linux.intel.com> + + +# +# This script turns a cstate ftrace output into a SVG graphic that shows +# historic C-state information +# +# +# cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg +# + +my @styles; +my $base = 0; + +my @pstate_last; +my @pstate_level; + +$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; + + +print "<?xml version=\"1.0\" standalone=\"no\"?> \n"; +print "<svg width=\"10000\" height=\"100%\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n"; + +my $scale = 30000.0; +while (<>) { + my $line = $_; + if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) { + if ($base == 0) { + $base = $1; + } + my $time = $1 - $base; + $time = $time * $scale; + my $C = $2; + my $cpu = $3; + my $y = 400 * $cpu; + my $duration = $4 * $scale; + my $msec = int($4 * 100000)/100.0; + my $height = $C * 20; + $style = $styles[$C]; + + $y = $y + 140 - $height; + + $x2 = $time + 4; + $y2 = $y + 4; + + + print "<rect x=\"$time\" width=\"$duration\" y=\"$y\" height=\"$height\" style=\"$style\"/>\n"; + print "<text transform=\"translate($x2,$y2) rotate(90)\">C$C $msec</text>\n"; + } + if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) { + my $time = $1 - $base; + my $state = $2; + my $cpu = $3; + + if (defined($pstate_last[$cpu])) { + my $from = $pstate_last[$cpu]; + my $oldstate = $pstate_state[$cpu]; + my $duration = ($time-$from) * $scale; + + $from = $from * $scale; + my $to = $from + $duration; + my $height = 140 - ($oldstate * (140/8)); + + my $y = 400 * $cpu + 200 + $height; + my $y2 = $y+4; + my $style = $styles[8]; + + print "<rect x=\"$from\" y=\"$y\" width=\"$duration\" height=\"5\" style=\"$style\"/>\n"; + print "<text transform=\"translate($from,$y2)\">P$oldstate (cpu $cpu)</text>\n"; + }; + + $pstate_last[$cpu] = $time; + $pstate_state[$cpu] = $state; + } +} + + +print "</svg>\n"; diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py new file mode 100644 index 00000000000..902f9a99262 --- /dev/null +++ b/scripts/tracing/draw_functrace.py @@ -0,0 +1,130 @@ +#!/usr/bin/python + +""" +Copyright 2008 (c) Frederic Weisbecker <fweisbec@gmail.com> +Licensed under the terms of the GNU GPL License version 2 + +This script parses a trace provided by the function tracer in +kernel/trace/trace_functions.c +The resulted trace is processed into a tree to produce a more human +view of the call stack by drawing textual but hierarchical tree of +calls. Only the functions's names and the the call time are provided. + +Usage: + Be sure that you have CONFIG_FUNCTION_TRACER + # mkdir /debugfs + # mount -t debug debug /debug + # echo function > /debug/tracing/current_tracer + $ cat /debug/tracing/trace_pipe > ~/raw_trace_func + Wait some times but not too much, the script is a bit slow. + Break the pipe (Ctrl + Z) + $ scripts/draw_functrace.py < raw_trace_func > draw_functrace + Then you have your drawn trace in draw_functrace +""" + + +import sys, re + +class CallTree: + """ This class provides a tree representation of the functions + call stack. If a function has no parent in the kernel (interrupt, + syscall, kernel thread...) then it is attached to a virtual parent + called ROOT. + """ + ROOT = None + + def __init__(self, func, time = None, parent = None): + self._func = func + self._time = time + if parent is None: + self._parent = CallTree.ROOT + else: + self._parent = parent + self._children = [] + + def calls(self, func, calltime): + """ If a function calls another one, call this method to insert it + into the tree at the appropriate place. + @return: A reference to the newly created child node. + """ + child = CallTree(func, calltime, self) + self._children.append(child) + return child + + def getParent(self, func): + """ Retrieve the last parent of the current node that + has the name given by func. If this function is not + on a parent, then create it as new child of root + @return: A reference to the parent. + """ + tree = self + while tree != CallTree.ROOT and tree._func != func: + tree = tree._parent + if tree == CallTree.ROOT: + child = CallTree.ROOT.calls(func, None) + return child + return tree + + def __repr__(self): + return self.__toString("", True) + + def __toString(self, branch, lastChild): + if self._time is not None: + s = "%s----%s (%s)\n" % (branch, self._func, self._time) + else: + s = "%s----%s\n" % (branch, self._func) + + i = 0 + if lastChild: + branch = branch[:-1] + " " + while i < len(self._children): + if i != len(self._children) - 1: + s += "%s" % self._children[i].__toString(branch +\ + " |", False) + else: + s += "%s" % self._children[i].__toString(branch +\ + " |", True) + i += 1 + return s + +class BrokenLineException(Exception): + """If the last line is not complete because of the pipe breakage, + we want to stop the processing and ignore this line. + """ + pass + +class CommentLineException(Exception): + """ If the line is a comment (as in the beginning of the trace file), + just ignore it. + """ + pass + + +def parseLine(line): + line = line.strip() + if line.startswith("#"): + raise CommentLineException + m = re.match("[^]]+?\\] +([0-9.]+): (\\w+) <-(\\w+)", line) + if m is None: + raise BrokenLineException + return (m.group(1), m.group(2), m.group(3)) + + +def main(): + CallTree.ROOT = CallTree("Root (Nowhere)", None, None) + tree = CallTree.ROOT + + for line in sys.stdin: + try: + calltime, callee, caller = parseLine(line) + except BrokenLineException: + break + except CommentLineException: + continue + tree = tree.getParent(caller) + tree = tree.calls(callee, calltime) + + print CallTree.ROOT + +if __name__ == "__main__": + main() |