diff options
62 files changed, 4895 insertions, 1454 deletions
diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace new file mode 100644 index 00000000000..5e6a92a02d8 --- /dev/null +++ b/Documentation/ABI/testing/debugfs-kmemtrace @@ -0,0 +1,71 @@ +What: /sys/kernel/debug/kmemtrace/ +Date: July 2008 +Contact: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> +Description: + +In kmemtrace-enabled kernels, the following files are created: + +/sys/kernel/debug/kmemtrace/ + cpu<n> (0400) Per-CPU tracing data, see below. (binary) + total_overruns (0400) Total number of bytes which were dropped from + cpu<n> files because of full buffer condition, + non-binary. (text) + abi_version (0400) Kernel's kmemtrace ABI version. (text) + +Each per-CPU file should be read according to the relay interface. That is, +the reader should set affinity to that specific CPU and, as currently done by +the userspace application (though there are other methods), use poll() with +an infinite timeout before every read(). Otherwise, erroneous data may be +read. The binary data has the following _core_ format: + + Event ID (1 byte) Unsigned integer, one of: + 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC) + 1 - represents a freeing of previously allocated memory + (KMEMTRACE_EVENT_FREE) + Type ID (1 byte) Unsigned integer, one of: + 0 - this is a kmalloc() / kfree() + 1 - this is a kmem_cache_alloc() / kmem_cache_free() + 2 - this is a __get_free_pages() et al. + Event size (2 bytes) Unsigned integer representing the + size of this event. Used to extend + kmemtrace. Discard the bytes you + don't know about. + Sequence number (4 bytes) Signed integer used to reorder data + logged on SMP machines. Wraparound + must be taken into account, although + it is unlikely. + Caller address (8 bytes) Return address to the caller. + Pointer to mem (8 bytes) Pointer to target memory area. Can be + NULL, but not all such calls might be + recorded. + +In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow: + + Requested bytes (8 bytes) Total number of requested bytes, + unsigned, must not be zero. + Allocated bytes (8 bytes) Total number of actually allocated + bytes, unsigned, must not be lower + than requested bytes. + Requested flags (4 bytes) GFP flags supplied by the caller. + Target CPU (4 bytes) Signed integer, valid for event id 1. + If equal to -1, target CPU is the same + as origin CPU, but the reverse might + not be true. + +The data is made available in the same endianness the machine has. + +Other event ids and type ids may be defined and added. Other fields may be +added by increasing event size, but see below for details. +Every modification to the ABI, including new id definitions, are followed +by bumping the ABI version by one. + +Adding new data to the packet (features) is done at the end of the mandatory +data: + Feature size (2 byte) + Feature ID (1 byte) + Feature data (Feature size - 3 bytes) + + +Users: + kmemtrace-user - git://repo.or.cz/kmemtrace-user.git + diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 803b1318b13..758fb42a1b6 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -165,6 +165,8 @@ Here is the list of current tracers that may be configured. nop - This is not a tracer. To remove all tracers from tracing simply echo "nop" into current_tracer. + hw-branch-tracer - traces branches on all cpu's in a circular buffer. + Examples of using the tracer ---------------------------- @@ -1152,6 +1154,78 @@ int main (int argc, char **argv) return 0; } + +hw-branch-tracer (x86 only) +--------------------------- + +This tracer uses the x86 last branch tracing hardware feature to +collect a branch trace on all cpus with relatively low overhead. + +The tracer uses a fixed-size circular buffer per cpu and only +traces ring 0 branches. The trace file dumps that buffer in the +following format: + +# tracer: hw-branch-tracer +# +# CPU# TO <- FROM + 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6 + 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a + 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf + 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf + 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a + 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf + + +The tracer may be used to dump the trace for the oops'ing cpu on a +kernel oops into the system log. To enable this, ftrace_dump_on_oops +must be set. To set ftrace_dump_on_oops, one can either use the sysctl +function or set it via the proc system interface. + + sysctl kernel.ftrace_dump_on_oops=1 + +or + + echo 1 > /proc/sys/kernel/ftrace_dump_on_oops + + +Here's an example of such a dump after a null pointer dereference in a +kernel module: + +[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 +[57848.106019] IP: [<ffffffffa0000006>] open+0x6/0x14 [oops] +[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0 +[57848.106019] Oops: 0002 [#1] SMP +[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus +[57848.106019] Dumping ftrace buffer: +[57848.106019] --------------------------------- +[...] +[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24 +[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165 +[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165 +[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165 +[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165 +[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops] +[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30 +[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b +[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31 +[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1 +[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30 +[...] +[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2 +[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881 +[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881 +[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96 +[...] +[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3 +[57848.106019] --------------------------------- +[57848.106019] CPU 0 +[57848.106019] Modules linked in: oops +[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23 +[57848.106019] RIP: 0010:[<ffffffffa0000006>] [<ffffffffa0000006>] open+0x6/0x14 [oops] +[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246 +[...] + + dynamic ftrace -------------- diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index d8362cf9909..6390ffb520f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -49,6 +49,7 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. + KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1043,6 +1044,15 @@ and is between 256 and 4096 characters. It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. + kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } + Controls whether kmemtrace is enabled + at boot-time. + + kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of + subbufs kmemtrace's relay channel has. Set this + higher than default (KMEMTRACE_N_SUBBUFS in code) if + you experience buffer overruns. + movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter is similar to kernelcore except it specifies the amount of memory used for migratable allocations. diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 9e592c718af..535aeb936db 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -113,6 +113,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'x' - Used by xmon interface on ppc/powerpc platforms. +'z' - Dump the ftrace buffer + '0'-'9' - Sets the console log level, controlling which kernel messages will be printed to your console. ('0', for example would make it so that only emergency messages like PANICs or OOPSes would diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt index cde23b4a12a..5731c67abc5 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/tracers/mmiotrace.txt @@ -78,12 +78,10 @@ to view your kernel log and look for "mmiotrace has lost events" warning. If events were lost, the trace is incomplete. You should enlarge the buffers and try again. Buffers are enlarged by first seeing how large the current buffers are: -$ cat /debug/tracing/trace_entries +$ cat /debug/tracing/buffer_size_kb gives you a number. Approximately double this number and write it back, for instance: -$ echo 0 > /debug/tracing/tracing_enabled -$ echo 128000 > /debug/tracing/trace_entries -$ echo 1 > /debug/tracing/tracing_enabled +$ echo 128000 > /debug/tracing/buffer_size_kb Then start again from the top. If you are doing a trace for a driver project, e.g. Nouveau, you should also diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt new file mode 100644 index 00000000000..a956d9b7f94 --- /dev/null +++ b/Documentation/vm/kmemtrace.txt @@ -0,0 +1,126 @@ + kmemtrace - Kernel Memory Tracer + + by Eduard - Gabriel Munteanu + <eduard.munteanu@linux360.ro> + +I. Introduction +=============== + +kmemtrace helps kernel developers figure out two things: +1) how different allocators (SLAB, SLUB etc.) perform +2) how kernel code allocates memory and how much + +To do this, we trace every allocation and export information to the userspace +through the relay interface. We export things such as the number of requested +bytes, the number of bytes actually allocated (i.e. including internal +fragmentation), whether this is a slab allocation or a plain kmalloc() and so +on. + +The actual analysis is performed by a userspace tool (see section III for +details on where to get it from). It logs the data exported by the kernel, +processes it and (as of writing this) can provide the following information: +- the total amount of memory allocated and fragmentation per call-site +- the amount of memory allocated and fragmentation per allocation +- total memory allocated and fragmentation in the collected dataset +- number of cross-CPU allocation and frees (makes sense in NUMA environments) + +Moreover, it can potentially find inconsistent and erroneous behavior in +kernel code, such as using slab free functions on kmalloc'ed memory or +allocating less memory than requested (but not truly failed allocations). + +kmemtrace also makes provisions for tracing on some arch and analysing the +data on another. + +II. Design and goals +==================== + +kmemtrace was designed to handle rather large amounts of data. Thus, it uses +the relay interface to export whatever is logged to userspace, which then +stores it. Analysis and reporting is done asynchronously, that is, after the +data is collected and stored. By design, it allows one to log and analyse +on different machines and different arches. + +As of writing this, the ABI is not considered stable, though it might not +change much. However, no guarantees are made about compatibility yet. When +deemed stable, the ABI should still allow easy extension while maintaining +backward compatibility. This is described further in Documentation/ABI. + +Summary of design goals: + - allow logging and analysis to be done across different machines + - be fast and anticipate usage in high-load environments (*) + - be reasonably extensible + - make it possible for GNU/Linux distributions to have kmemtrace + included in their repositories + +(*) - one of the reasons Pekka Enberg's original userspace data analysis + tool's code was rewritten from Perl to C (although this is more than a + simple conversion) + + +III. Quick usage guide +====================== + +1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable +CONFIG_KMEMTRACE). + +2) Get the userspace tool and build it: +$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository +$ cd kmemtrace-user/ +$ ./autogen.sh +$ ./configure +$ make + +3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the +'single' runlevel (so that relay buffers don't fill up easily), and run +kmemtrace: +# '$' does not mean user, but root here. +$ mount -t debugfs none /sys/kernel/debug +$ mount -t proc none /proc +$ cd path/to/kmemtrace-user/ +$ ./kmemtraced +Wait a bit, then stop it with CTRL+C. +$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't + # overrun, should + # be zero. +$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to + check its correctness] +$ ./kmemtrace-report + +Now you should have a nice and short summary of how the allocator performs. + +IV. FAQ and known issues +======================== + +Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix +this? Should I worry? +A: If it's non-zero, this affects kmemtrace's accuracy, depending on how +large the number is. You can fix it by supplying a higher +'kmemtrace.subbufs=N' kernel parameter. +--- + +Q: kmemtrace_check reports errors, how do I fix this? Should I worry? +A: This is a bug and should be reported. It can occur for a variety of +reasons: + - possible bugs in relay code + - possible misuse of relay by kmemtrace + - timestamps being collected unorderly +Or you may fix it yourself and send us a patch. +--- + +Q: kmemtrace_report shows many errors, how do I fix this? Should I worry? +A: This is a known issue and I'm working on it. These might be true errors +in kernel code, which may have inconsistent behavior (e.g. allocating memory +with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed +out this behavior may work with SLAB, but may fail with other allocators. + +It may also be due to lack of tracing in some unusual allocator functions. + +We don't want bug reports regarding this issue yet. +--- + +V. See also +=========== + +Documentation/kernel-parameters.txt +Documentation/ABI/testing/debugfs-kmemtrace + diff --git a/MAINTAINERS b/MAINTAINERS index 5bff376d297..829a697f123 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2606,6 +2606,12 @@ M: jason.wessel@windriver.com L: kgdb-bugreport@lists.sourceforge.net S: Maintained +KMEMTRACE +P: Eduard - Gabriel Munteanu +M: eduard.munteanu@linux360.ro +L: linux-kernel@vger.kernel.org +S: Maintained + KPROBES P: Ananth N Mavinakayanahalli M: ananth@in.ibm.com diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 6183aeccecf..8b6a8a554af 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -22,6 +22,9 @@ config IA64 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_DYNAMIC_FTRACE if (!ITANIUM) + select HAVE_FUNCTION_TRACER select HAVE_DMA_ATTRS select HAVE_KVM select HAVE_ARCH_TRACEHOOK diff --git a/arch/ia64/include/asm/ftrace.h b/arch/ia64/include/asm/ftrace.h new file mode 100644 index 00000000000..d20db3c2a65 --- /dev/null +++ b/arch/ia64/include/asm/ftrace.h @@ -0,0 +1,28 @@ +#ifndef _ASM_IA64_FTRACE_H +#define _ASM_IA64_FTRACE_H + +#ifdef CONFIG_FUNCTION_TRACER +#define MCOUNT_INSN_SIZE 32 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0); +#define mcount _mcount + +#include <asm/kprobes.h> +/* In IA64, MCOUNT_ADDR is set in link time, so it's not a constant at compile time */ +#define MCOUNT_ADDR (((struct fnptr *)mcount)->ip) +#define FTRACE_ADDR (((struct fnptr *)ftrace_caller)->ip) + +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* second bundle, insn 2 */ + return addr - 0x12; +} + +struct dyn_arch_ftrace { +}; +#endif + +#endif /* CONFIG_FUNCTION_TRACER */ + +#endif /* _ASM_IA64_FTRACE_H */ diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index c381ea95489..ab6e7ec0bba 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -2,6 +2,10 @@ # Makefile for the linux kernel. # +ifdef CONFIG_DYNAMIC_FTRACE +CFLAGS_REMOVE_ftrace.o = -pg +endif + extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ @@ -28,6 +32,7 @@ obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index e5341e2c117..7e3382b06d5 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -47,6 +47,7 @@ #include <asm/processor.h> #include <asm/thread_info.h> #include <asm/unistd.h> +#include <asm/ftrace.h> #include "minstate.h" @@ -1404,6 +1405,105 @@ GLOBAL_ENTRY(unw_init_running) br.ret.sptk.many rp END(unw_init_running) +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +GLOBAL_ENTRY(_mcount) + br ftrace_stub +END(_mcount) + +.here: + br.ret.sptk.many b0 + +GLOBAL_ENTRY(ftrace_caller) + alloc out0 = ar.pfs, 8, 0, 4, 0 + mov out3 = r0 + ;; + mov out2 = b0 + add r3 = 0x20, r3 + mov out1 = r1; + br.call.sptk.many b0 = ftrace_patch_gp + //this might be called from module, so we must patch gp +ftrace_patch_gp: + movl gp=__gp + mov b0 = r3 + ;; +.global ftrace_call; +ftrace_call: +{ + .mlx + nop.m 0x0 + movl r3 = .here;; +} + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(ftrace_caller) + +#else +GLOBAL_ENTRY(_mcount) + movl r2 = ftrace_stub + movl r3 = ftrace_trace_function;; + ld8 r3 = [r3];; + ld8 r3 = [r3];; + cmp.eq p7,p0 = r2, r3 +(p7) br.sptk.many ftrace_stub + ;; + + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(_mcount) +#endif + +GLOBAL_ENTRY(ftrace_stub) + mov r3 = b0 + movl r2 = _mcount_ret_helper + ;; + mov b6 = r2 + mov b7 = r3 + br.ret.sptk.many b6 + +_mcount_ret_helper: + mov b0 = r42 + mov r1 = r41 + mov ar.pfs = r40 + br b7 +END(ftrace_stub) + +#endif /* CONFIG_FUNCTION_TRACER */ + .rodata .align 8 .globl sys_call_table diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c new file mode 100644 index 00000000000..7fc8c961b1f --- /dev/null +++ b/arch/ia64/kernel/ftrace.c @@ -0,0 +1,206 @@ +/* + * Dynamic function tracing support. + * + * Copyright (C) 2008 Shaohua Li <shaohua.li@intel.com> + * + * For licencing details, see COPYING. + * + * Defines low-level handling of mcount calls when the kernel + * is compiled with the -pg flag. When using dynamic ftrace, the + * mcount call-sites get patched lazily with NOP till they are + * enabled. All code mutation routines here take effect atomically. + */ + +#include <linux/uaccess.h> +#include <linux/ftrace.h> + +#include <asm/cacheflush.h> +#include <asm/patch.h> + +/* In IA64, each function will be added below two bundles with -pg option */ +static unsigned char __attribute__((aligned(8))) +ftrace_orig_code[MCOUNT_INSN_SIZE] = { + 0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */ + 0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */ + 0x05, 0x00, 0xc4, 0x00, /* mov r42=b0 */ + 0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */ + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */ + 0x08, 0x00, 0x00, 0x50 /* br.call.sptk.many b0 = _mcount;; */ +}; + +struct ftrace_orig_insn { + u64 dummy1, dummy2, dummy3; + u64 dummy4:64-41+13; + u64 imm20:20; + u64 dummy5:3; + u64 sign:1; + u64 dummy6:4; +}; + +/* mcount stub will be converted below for nop */ +static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */ + 0x00, 0x00, 0x04, 0x00 +}; + +static unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop_code; +} + +/* + * mcount stub will be converted below for call + * Note: Just the last instruction is changed against nop + * */ +static unsigned char __attribute__((aligned(8))) +ftrace_call_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/ + 0xf8, 0xff, 0xff, 0xc8 +}; + +struct ftrace_call_insn { + u64 dummy1, dummy2; + u64 dummy3:48; + u64 imm39_l:16; + u64 imm39_h:23; + u64 dummy4:13; + u64 imm20:20; + u64 dummy5:3; + u64 i:1; + u64 dummy6:4; +}; + +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + struct ftrace_call_insn *code = (void *)ftrace_call_code; + unsigned long offset = addr - (ip + 0x10); + + code->imm39_l = offset >> 24; + code->imm39_h = offset >> 40; + code->imm20 = offset >> 4; + code->i = offset >> 63; + return ftrace_call_code; +} + +static int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code, int do_check) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. We do this by using the + * probe_kernel_* functions. + * + * No real locking needed, this code is run through + * kstop_machine, or before SMP starts. + */ + + if (!do_check) + goto skip_check; + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + +skip_check: + /* replace the text with the new text */ + if (probe_kernel_write(((void *)ip), new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + flush_icache_range(ip, ip + MCOUNT_INSN_SIZE); + + return 0; +} + +static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE]; + unsigned long ip = rec->ip; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + if (rec->flags & FTRACE_FL_CONVERTED) { + struct ftrace_call_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_call_code; + tmp_call = (void *)replaced; + call_insn->imm39_l = tmp_call->imm39_l; + call_insn->imm39_h = tmp_call->imm39_h; + call_insn->imm20 = tmp_call->imm20; + call_insn->i = tmp_call->i; + if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } else { + struct ftrace_orig_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_orig_code; + tmp_call = (void *)replaced; + call_insn->sign = tmp_call->sign; + call_insn->imm20 = tmp_call->imm20; + if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + int ret; + char *new; + + ret = ftrace_make_nop_check(rec, addr); + if (ret) + return ret; + new = ftrace_nop_replace(); + return ftrace_modify_code(rec->ip, NULL, new, 0); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned char *old, *new; + + old= ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + return ftrace_modify_code(ip, old, new, 1); +} + +/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */ +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip; + unsigned long addr = ((struct fnptr *)ftrace_call)->ip; + + if (func == ftrace_stub) + return 0; + ip = ((struct fnptr *)func)->ip; + + ia64_patch_imm64(addr + 2, ip); + + flush_icache_range(addr, addr + 16); + return 0; +} + +/* run from kstop_machine */ +int __init ftrace_dyn_arch_init(void *data) +{ + *(unsigned long *)data = 0; + + return 0; +} diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 6da1f20d737..2d311864e35 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -112,3 +112,9 @@ EXPORT_SYMBOL_GPL(esi_call_phys); #endif extern char ia64_ivt[]; EXPORT_SYMBOL(ia64_ivt); + +#include <asm/ftrace.h> +#ifdef CONFIG_FUNCTION_TRACER +/* mcount is defined in assembly */ +EXPORT_SYMBOL(_mcount); +#endif diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 10d6cc3fd05..e1983fa025d 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -174,28 +174,8 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE - bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && PCI - select TRACING - help - Mmiotrace traces Memory Mapped I/O access and is meant for - debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. Tracing is disabled by - default and can be enabled at run-time. - - See Documentation/tracers/mmiotrace.txt. - If you are not helping to develop drivers, say N. - -config MMIOTRACE_TEST - tristate "Test module for mmiotrace" - depends on MMIOTRACE && m - help - This is a dumb module for testing mmiotrace. It is very dangerous - as it will write garbage to IO memory starting at a given address. - However, it should be safe to use on e.g. unused portion of VRAM. - - Say N, unless you absolutely know what you are doing. +config HAVE_MMIOTRACE_SUPPORT + def_bool y # # IO delay types: diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6b1f6f6f866..077c9ea655f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -14,6 +14,7 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/sysfs.h> +#include <linux/ftrace.h> #include <asm/stacktrace.h> @@ -195,6 +196,11 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; + /* notify the hw-branch tracer so it may disable tracing and + add the last trace to the trace buffer - + the earlier this happens, the more useful the trace. */ + trace_hw_branch_oops(); + oops_enter(); /* racy, but better than risking deadlock. */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1b43086b097..4d33224c055 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -133,15 +133,14 @@ void ftrace_nmi_exit(void) static void wait_for_nmi(void) { - int waited = 0; + if (!atomic_read(&in_nmi)) + return; - while (atomic_read(&in_nmi)) { - waited = 1; + do { cpu_relax(); - } + } while(atomic_read(&in_nmi)); - if (waited) - nmi_wait_count++; + nmi_wait_count++; } static int diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b81125f0bde..c7da3683f4c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -55,7 +55,8 @@ config KVM_AMD config KVM_TRACE bool "KVM trace support" - depends on KVM && MARKERS && SYSFS + depends on KVM && SYSFS + select MARKERS select RELAY select DEBUG_FS default n diff --git a/block/Kconfig b/block/Kconfig index 0cbb3b88b59..7cdaa1d7225 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -50,6 +50,8 @@ config BLK_DEV_IO_TRACE select RELAY select DEBUG_FS select TRACEPOINTS + select TRACING + select STACKTRACE help Say Y here if you want to be able to trace the block layer actions on a given queue. Tracing allows you to see any traffic happening @@ -58,6 +60,12 @@ config BLK_DEV_IO_TRACE git://git.kernel.dk/blktrace.git + Tracing also is possible using the ftrace interface, e.g.: + + echo 1 > /sys/block/sda/sda1/trace/enable + echo blk > /sys/kernel/debug/tracing/current_tracer + cat /sys/kernel/debug/tracing/trace_pipe + If unsure, say N. config BLK_DEV_BSG diff --git a/block/blktrace.c b/block/blktrace.c index 39cc3bfe56e..c7698d1617a 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -24,10 +24,28 @@ #include <linux/debugfs.h> #include <linux/time.h> #include <trace/block.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> +#include <../kernel/trace/trace_output.h> static unsigned int blktrace_seq __read_mostly = 1; +static struct trace_array *blk_tr; +static int __read_mostly blk_tracer_enabled; + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + /* Global reference count of probes */ static DEFINE_MUTEX(blk_probe_mutex); static atomic_t blk_probes_ref = ATOMIC_INIT(0); @@ -43,6 +61,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, { struct blk_io_trace *t; + if (!bt->rchan) + return; + t = relay_reserve(bt->rchan, sizeof(*t) + len); if (t) { const int cpu = smp_processor_id(); @@ -90,6 +111,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) unsigned long flags; char *buf; + if (blk_tr) { + va_start(args, fmt); + ftrace_vprintk(fmt, args); + va_end(args); + return; + } + + if (!bt->msg_data) + return; + local_irq_save(flags); buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); @@ -117,11 +148,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, /* * Data direction bit lookup */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; +static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) ) +#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) /* * The worker for the various blk_add_trace*() types. Fills out a @@ -131,13 +163,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int rw, u32 what, int error, int pdu_len, void *pdu_data) { struct task_struct *tsk = current; + struct ring_buffer_event *event = NULL; struct blk_io_trace *t; unsigned long flags; unsigned long *sequence; pid_t pid; - int cpu; + int cpu, pc = 0; - if (unlikely(bt->trace_state != Blktrace_running)) + if (unlikely(bt->trace_state != Blktrace_running || + !blk_tracer_enabled)) return; what |= ddir_act[rw & WRITE]; @@ -150,6 +184,24 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid = tsk->pid; if (unlikely(act_log_check(bt, what, sector, pid))) return; + cpu = raw_smp_processor_id(); + + if (blk_tr) { + struct trace_entry *ent; + tracing_record_cmdline(current); + + event = ring_buffer_lock_reserve(blk_tr->buffer, + sizeof(*t) + pdu_len, &flags); + if (!event) + return; + + ent = ring_buffer_event_data(event); + t = (struct blk_io_trace *)ent; + pc = preempt_count(); + tracing_generic_entry_update(ent, 0, pc); + ent->type = TRACE_BLK; + goto record_it; + } /* * A word about the locking here - we disable interrupts to reserve @@ -163,23 +215,40 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { - cpu = smp_processor_id(); sequence = per_cpu_ptr(bt->sequence, cpu); t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); +record_it: + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + t->sector = sector; t->bytes = bytes; t->action = what; - t->pid = pid; t->device = bt->dev; - t->cpu = cpu; t->error = error; t->pdu_len = pdu_len; if (pdu_len) memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + + if (blk_tr) { + ring_buffer_unlock_commit(blk_tr->buffer, event, flags); + if (pid != 0 && + !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && + (trace_flags & TRACE_ITER_STACKTRACE) != 0) + __trace_stack(blk_tr, NULL, flags, 5, pc); + trace_wake_up(); + return; + } } local_irq_restore(flags); @@ -385,7 +454,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, atomic_set(&bt->dropped, 0); ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, + &blk_dropped_fops); if (!bt->dropped_file) goto err; @@ -467,10 +537,10 @@ EXPORT_SYMBOL_GPL(blk_trace_setup); int blk_trace_startstop(struct request_queue *q, int start) { - struct blk_trace *bt; int ret; + struct blk_trace *bt = q->blk_trace; - if ((bt = q->blk_trace) == NULL) + if (bt == NULL) return -EINVAL; /* @@ -606,12 +676,14 @@ static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) blk_add_trace_rq(q, rq, BLK_TA_ISSUE); } -static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_requeue(struct request_queue *q, + struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_complete(struct request_queue *q, + struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -648,12 +720,14 @@ static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); } -static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_backmerge(struct request_queue *q, + struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); } -static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_frontmerge(struct request_queue *q, + struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); } @@ -663,7 +737,8 @@ static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) blk_add_trace_bio(q, bio, BLK_TA_QUEUE); } -static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw) +static void blk_add_trace_getrq(struct request_queue *q, + struct bio *bio, int rw) { if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ); @@ -676,7 +751,8 @@ static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw } -static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw) +static void blk_add_trace_sleeprq(struct request_queue *q, + struct bio *bio, int rw) { if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); @@ -684,7 +760,8 @@ static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + 0, 0, NULL); } } @@ -858,3 +935,617 @@ static void blk_unregister_tracepoints(void) tracepoint_synchronize_unregister(); } + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + + if (t->action & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (t->action & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) + rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent) + 1; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static __u64 get_pdu_int(const struct trace_entry *ent) +{ + const __u64 *val = pdu_start(ent); + return be64_to_cpu(*val); +} + +static void get_pdu_remap(const struct trace_entry *ent, + struct blk_io_trace_remap *r) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent); + __u64 sector = __r->sector; + + r->device = be32_to_cpu(__r->device); + r->device_from = be32_to_cpu(__r->device_from); + r->sector = be64_to_cpu(sector); +} + +static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +{ + char rwbs[6]; + unsigned long long ts = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned secs = (unsigned long)ts; + const struct trace_entry *ent = iter->ent; + const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + + fill_rwbs(rwbs, t); + + return trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, usec_rem, ent->pid, act, rwbs); +} + +static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, + const char *act) +{ + char rwbs[6]; + fill_rwbs(rwbs, t); + return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *cmd = trace_find_cmdline(ent->pid); + + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); +} + +static int blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) +{ + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); +} + +static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +{ + struct blk_io_trace_remap r = { .device = 0, }; + + get_pdu_remap(ent, &r); + return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), + t_sec(ent), MAJOR(r.device), MINOR(r.device), + (unsigned long long)r.sector); +} + +static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); +} + +static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), + get_pdu_int(ent)); +} + +static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), trace_find_cmdline(ent->pid)); +} + +/* + * struct tracer operations + */ + +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} + +static void blk_tracer_start(struct trace_array *tr) +{ + tracing_reset_online_cpus(tr); + + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + if (blk_register_tracepoints()) + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; +} + +static int blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled++; + mutex_unlock(&blk_probe_mutex); + return 0; +} + +static void blk_tracer_stop(struct trace_array *tr) +{ + trace_flags |= TRACE_ITER_CONTEXT_INFO; + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void blk_tracer_reset(struct trace_array *tr) +{ + if (!atomic_read(&blk_probes_ref)) + return; + + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled--; + WARN_ON(blk_tracer_enabled < 0); + mutex_unlock(&blk_probe_mutex); + + blk_tracer_stop(tr); +} + +static struct { + const char *act[2]; + int (*print)(struct trace_seq *s, const struct trace_entry *ent); +} what2act[] __read_mostly = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, +}; + +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) +{ + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + int ret; + + if (!trace_print_context(iter)) + return TRACE_TYPE_PARTIAL_LINE; + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(s, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(s, iter->ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace old = { + .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, + .time = ns2usecs(iter->ts), + }; + + if (!trace_seq_putmem(s, &old, offset)) + return 0; + return trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); +} + +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +{ + return blk_trace_synthesize_old_trace(iter) ? + TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + const struct blk_io_trace *t; + u16 what; + int ret; + + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + t = (const struct blk_io_trace *)iter->ent; + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_iter(iter, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(&iter->seq, iter->ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, +}; + +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .trace = blk_trace_event_print, + .latency_trace = blk_trace_event_print, + .raw = trace_nop_print, + .hex = trace_nop_print, + .binary = blk_trace_event_print_binary, +}; + +static int __init init_blk_tracer(void) +{ + if (!register_ftrace_event(&trace_blk_event)) { + pr_warning("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warning("Warning: could not register the block tracer\n"); + unregister_ftrace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); + +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (bt == NULL) + return -EINVAL; + + kfree(bt); + return 0; +} + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +{ + struct blk_trace *old_bt, *bt = NULL; + int ret; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->dev = dev; + bt->act_mask = (u16)-1; + bt->end_lba = -1ULL; + bt->trace_state = Blktrace_running; + + old_bt = xchg(&q->blk_trace, bt); + if (old_bt != NULL) { + (void)xchg(&q->blk_trace, old_bt); + kfree(bt); + ret = -EBUSY; + } + return 0; +err: + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_enable_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev != NULL) { + struct request_queue *q = bdev_get_queue(bdev); + + if (q != NULL) { + mutex_lock(&bdev->bd_mutex); + ret = sprintf(buf, "%u\n", !!q->blk_trace); + mutex_unlock(&bdev->bd_mutex); + } + + bdput(bdev); + } + + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + int value; + ssize_t ret = -ENXIO; + + if (count == 0 || sscanf(buf, "%d", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + mutex_unlock(&bdev->bd_mutex); + + if (ret == 0) + ret = count; +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) + +static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, + sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); + +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; + +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; + +static int blk_str2act_mask(const char *str) +{ + int mask = 0; + char *copy = kstrdup(str, GFP_KERNEL), *s; + + if (copy == NULL) + return -ENOMEM; + + s = strstrip(copy); + + while (1) { + char *sep = strchr(s, ','); + + if (sep != NULL) + *sep = '\0'; + + if (strcasecmp(s, "barrier") == 0) + mask |= BLK_TC_BARRIER; + else if (strcasecmp(s, "complete") == 0) + mask |= BLK_TC_COMPLETE; + else if (strcasecmp(s, "fs") == 0) + mask |= BLK_TC_FS; + else if (strcasecmp(s, "issue") == 0) + mask |= BLK_TC_ISSUE; + else if (strcasecmp(s, "pc") == 0) + mask |= BLK_TC_PC; + else if (strcasecmp(s, "queue") == 0) + mask |= BLK_TC_QUEUE; + else if (strcasecmp(s, "read") == 0) + mask |= BLK_TC_READ; + else if (strcasecmp(s, "requeue") == 0) + mask |= BLK_TC_REQUEUE; + else if (strcasecmp(s, "sync") == 0) + mask |= BLK_TC_SYNC; + else if (strcasecmp(s, "write") == 0) + mask |= BLK_TC_WRITE; + + if (sep == NULL) + break; + + s = sep + 1; + } + kfree(copy); + + return mask; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q; + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + mutex_lock(&bdev->bd_mutex); + if (q->blk_trace == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", q->blk_trace->pid); + else if (attr == &dev_attr_start_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + u64 value; + ssize_t ret = -ENXIO; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (sscanf(buf, "%llx", &value) != 1) { + /* Assume it is a list of trace category names */ + value = blk_str2act_mask(buf); + if (value < 0) + goto out; + } + } else if (sscanf(buf, "%llu", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + ret = 0; + if (q->blk_trace == NULL) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + q->blk_trace->act_mask = value; + else if (attr == &dev_attr_pid) + q->blk_trace->pid = value; + else if (attr == &dev_attr_start_lba) + q->blk_trace->start_lba = value; + else if (attr == &dev_attr_end_lba) + q->blk_trace->end_lba = value; + ret = count; + } + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 33a9351c896..30659ce9bcf 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -283,7 +283,7 @@ static void sysrq_ftrace_dump(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_ftrace_dump_op = { .handler = sysrq_ftrace_dump, - .help_msg = "dumpZ-ftrace-buffer", + .help_msg = "dump-ftrace-buffer(Z)", .action_msg = "Dump ftrace buffer", .enable_mask = SYSRQ_ENABLE_DUMP, }; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d720243f5f..8a17f7edcc7 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -19,6 +19,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/blktrace_api.h> #include "check.h" @@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = { static struct attribute_group *part_attr_groups[] = { &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif NULL }; diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 25379cba237..ed12e8fd8cf 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -143,6 +143,9 @@ struct blk_user_trace_setup { #ifdef __KERNEL__ #if defined(CONFIG_BLK_DEV_IO_TRACE) + +#include <linux/sysfs.h> + struct blk_trace { int trace_state; struct rchan *rchan; @@ -193,6 +196,8 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); +extern struct attribute_group blk_trace_attr_group; + #else /* !CONFIG_BLK_DEV_IO_TRACE */ #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) #define blk_trace_shutdown(q) do { } while (0) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7..7840e718c6c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -126,6 +126,10 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); + +#ifndef FTRACE_ADDR +#define FTRACE_ADDR ((unsigned long)ftrace_caller) +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern void ftrace_graph_caller(void); extern int ftrace_enable_ftrace_graph_caller(void); @@ -298,6 +302,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); extern int __ftrace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +extern int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); #else static inline void @@ -313,6 +320,11 @@ ftrace_printk(const char *fmt, ...) { return 0; } +static inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} static inline void ftrace_dump(void) { } #endif @@ -492,4 +504,17 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_HW_BRANCH_TRACER + +void trace_hw_branch(u64 from, u64 to); +void trace_hw_branch_oops(void); + +#else /* CONFIG_HW_BRANCH_TRACER */ + +static inline void trace_hw_branch(u64 from, u64 to) {} +static inline void trace_hw_branch_oops(void) {} + +#endif /* CONFIG_HW_BRANCH_TRACER */ + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a7c7638873..f3c23cf11ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long get_parent_ip(unsigned long addr); + struct seq_file; struct cfs_rq; struct task_group; diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 6ca6a7b66d7..f4523651fa4 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,6 +14,7 @@ #include <asm/page.h> /* kmalloc_sizes.h needs PAGE_SIZE */ #include <asm/cache.h> /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include <linux/compiler.h> +#include <trace/kmemtrace.h> /* Size description struct for general caches. */ struct cache_sizes { @@ -28,8 +29,26 @@ extern struct cache_sizes malloc_sizes[]; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); -static inline void *kmalloc(size_t size, gfp_t flags) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags); +extern size_t slab_buffer_size(struct kmem_cache *cachep); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) { + return kmem_cache_alloc(cachep, flags); +} +static inline size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return 0; +} +#endif + +static __always_inline void *kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *cachep; + void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -47,10 +66,17 @@ static inline void *kmalloc(size_t size, gfp_t flags) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, - flags); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_notrace(cachep, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, slab_buffer_size(cachep), flags); + + return ret; } return __kmalloc(size, flags); } @@ -59,8 +85,25 @@ found: extern void *__kmalloc_node(size_t size, gfp_t flags, int node); extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return kmem_cache_alloc_node(cachep, flags, nodeid); +} +#endif + +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + struct kmem_cache *cachep; + void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -78,11 +121,18 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc_node(malloc_sizes[i].cs_dmacachep, - flags, node); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc_node(malloc_sizes[i].cs_cachep, - flags, node); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, + ret, size, slab_buffer_size(cachep), + flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index 59a3fa476ab..0ec00b39d00 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -3,14 +3,15 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) { return kmem_cache_alloc_node(cachep, flags, -1); } void *__kmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc_node(size, flags, node); } @@ -23,12 +24,12 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) * kmalloc is the normal method of allocating memory * in the kernel. */ -static inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline void *kmalloc(size_t size, gfp_t flags) { return __kmalloc_node(size, flags, -1); } -static inline void *__kmalloc(size_t size, gfp_t flags) +static __always_inline void *__kmalloc(size_t size, gfp_t flags) { return kmalloc(size, flags); } diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aac..6b657f7dcb2 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include <linux/gfp.h> #include <linux/workqueue.h> #include <linux/kobject.h> +#include <trace/kmemtrace.h> enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -204,13 +205,31 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size) void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return kmem_cache_alloc(s, gfpflags); +} +#endif + static __always_inline void *kmalloc_large(size_t size, gfp_t flags) { - return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size)); + unsigned int order = get_order(size); + void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, PAGE_SIZE << order, flags); + + return ret; } static __always_inline void *kmalloc(size_t size, gfp_t flags) { + void *ret; + if (__builtin_constant_p(size)) { if (size > PAGE_SIZE) return kmalloc_large(size, flags); @@ -221,7 +240,13 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc(s, flags); + ret = kmem_cache_alloc_notrace(s, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags); + + return ret; } } return __kmalloc(size, flags); @@ -231,8 +256,24 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return kmem_cache_alloc_node(s, gfpflags, node); +} +#endif + static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + void *ret; + if (__builtin_constant_p(size) && size <= PAGE_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); @@ -240,7 +281,13 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node(s, flags, node); + ret = kmem_cache_alloc_node_notrace(s, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h new file mode 100644 index 00000000000..ad8b7857855 --- /dev/null +++ b/include/trace/kmemtrace.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include <linux/types.h> +#include <linux/marker.h> + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node); + +extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr); + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h new file mode 100644 index 00000000000..867829df457 --- /dev/null +++ b/include/trace/workqueue.h @@ -0,0 +1,25 @@ +#ifndef __TRACE_WORKQUEUE_H +#define __TRACE_WORKQUEUE_H + +#include <linux/tracepoint.h> +#include <linux/workqueue.h> +#include <linux/sched.h> + +DECLARE_TRACE(workqueue_insertion, + TPPROTO(struct task_struct *wq_thread, struct work_struct *work), + TPARGS(wq_thread, work)); + +DECLARE_TRACE(workqueue_execution, + TPPROTO(struct task_struct *wq_thread, struct work_struct *work), + TPARGS(wq_thread, work)); + +/* Trace the creation of one workqueue thread on a cpu */ +DECLARE_TRACE(workqueue_creation, + TPPROTO(struct task_struct *wq_thread, int cpu), + TPARGS(wq_thread, cpu)); + +DECLARE_TRACE(workqueue_destruction, + TPPROTO(struct task_struct *wq_thread), + TPARGS(wq_thread)); + +#endif /* __TRACE_WORKQUEUE_H */ diff --git a/init/main.c b/init/main.c index 844209453c0..db7974ff7a0 100644 --- a/init/main.c +++ b/init/main.c @@ -70,6 +70,7 @@ #include <asm/setup.h> #include <asm/sections.h> #include <asm/cacheflush.h> +#include <trace/kmemtrace.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> @@ -641,6 +642,7 @@ asmlinkage void __init start_kernel(void) enable_debug_pagealloc(); cpu_hotplug_init(); kmem_cache_init(); + kmemtrace_init(); debug_objects_mem_init(); idr_init_cache(); setup_per_cpu_pageset(); diff --git a/kernel/relay.c b/kernel/relay.c index 9d79b7854fa..edc0ba6d816 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -677,9 +677,7 @@ int relay_late_setup_files(struct rchan *chan, */ for_each_online_cpu(i) { if (unlikely(!chan->buf[i])) { - printk(KERN_ERR "relay_late_setup_files: CPU %u " - "has no buffer, it must have!\n", i); - BUG(); + WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); err = -EINVAL; break; } diff --git a/kernel/sched.c b/kernel/sched.c index 242d0d47a70..566c8c9e3a6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4409,10 +4409,7 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4422,6 +4419,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8..6edfc2c11d9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/rcupdate.h> +#include <linux/ftrace.h> #include <linux/smp.h> #include <linux/tick.h> @@ -79,13 +80,23 @@ static void __local_bh_disable(unsigned long ip) WARN_ON_ONCE(in_irq()); raw_local_irq_save(flags); - add_preempt_count(SOFTIRQ_OFFSET); + /* + * The preempt tracer hooks into add_preempt_count and will break + * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET + * is set and before current->softirq_enabled is cleared. + * We must manually increment preempt_count here and manually + * call the trace_preempt_off later. + */ + preempt_count() += SOFTIRQ_OFFSET; /* * Were softirqs turned off above: */ if (softirq_count() == SOFTIRQ_OFFSET) trace_softirqs_off(ip); raw_local_irq_restore(flags); + + if (preempt_count() == SOFTIRQ_OFFSET) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e2a4ff6fc3a..28f2644484d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,9 +164,8 @@ config BOOT_TRACER representation of the delays during initcalls - but the raw /debug/tracing/trace text output is readable too. - ( Note that tracing self tests can't be enabled if this tracer is - selected, because the self-tests are an initcall as well and that - would invalidate the boot trace. ) + You must pass in ftrace=initcall to the kernel command line + to enable this on bootup. config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" @@ -264,6 +263,38 @@ config HW_BRANCH_TRACER This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. +config KMEMTRACE + bool "Trace SLAB allocations" + select TRACING + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. + +config WORKQUEUE_TRACER + bool "Trace workqueues" + select TRACING + help + The workqueue tracer provides some statistical informations + about each cpu workqueue thread such as the number of the + works inserted and executed since their creation. It can help + to evaluate the amount of work each of them have to perform. + For example it can help a developer to decide whether he should + choose a per cpu workqueue instead of a singlethreaded one. + + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER @@ -294,7 +325,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER + depends on TRACING && DEBUG_KERNEL select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup @@ -302,4 +333,27 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + select TRACING + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/tracers/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + endmenu diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 349d5a93653..f76d48f3527 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,8 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o @@ -33,5 +35,7 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o +obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7dcf6e9f2b0..68610031780 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -264,14 +264,6 @@ static void ftrace_update_pid_func(void) # error Dynamic ftrace depends on MCOUNT_RECORD #endif -/* - * Since MCOUNT_ADDR may point to mcount itself, we do not want - * to get it confused by reading a reference in the code as we - * are parsing on objcopy output of text. Use a variable for - * it instead. - */ -static unsigned long mcount_addr = MCOUNT_ADDR; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -290,7 +282,7 @@ static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; - unsigned long index; + int index; struct dyn_ftrace records[]; }; @@ -464,7 +456,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; - ftrace_addr = (unsigned long)ftrace_caller; + ftrace_addr = (unsigned long)FTRACE_ADDR; ip = rec->ip; @@ -576,7 +568,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ip = rec->ip; - ret = ftrace_make_nop(mod, rec, mcount_addr); + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { ftrace_bug(ret, ip); rec->flags |= FTRACE_FL_FAILED; @@ -787,7 +779,7 @@ enum { struct ftrace_iterator { struct ftrace_page *pg; - unsigned idx; + int idx; unsigned flags; unsigned char buffer[FTRACE_BUFF_MAX+1]; unsigned buffer_idx; @@ -1737,9 +1729,12 @@ static void clear_ftrace_pid(struct pid *pid) { struct task_struct *p; + rcu_read_lock(); do_each_pid_task(pid, PIDTYPE_PID, p) { clear_tsk_trace_trace(p); } while_each_pid_task(pid, PIDTYPE_PID, p); + rcu_read_unlock(); + put_pid(pid); } @@ -1747,9 +1742,11 @@ static void set_ftrace_pid(struct pid *pid) { struct task_struct *p; + rcu_read_lock(); do_each_pid_task(pid, PIDTYPE_PID, p) { set_tsk_trace_trace(p); } while_each_pid_task(pid, PIDTYPE_PID, p); + rcu_read_unlock(); } static void clear_ftrace_pid_task(struct pid **pid) @@ -1903,7 +1900,7 @@ int register_ftrace_function(struct ftrace_ops *ops) } /** - * unregister_ftrace_function - unresgister a function for profiling. + * unregister_ftrace_function - unregister a function for profiling. * @ops - ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c new file mode 100644 index 00000000000..f04c0625f1c --- /dev/null +++ b/kernel/trace/kmemtrace.c @@ -0,0 +1,350 @@ +/* + * Memory allocator tracing + * + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + */ + +#include <linux/dcache.h> +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <trace/kmemtrace.h> + +#include "trace.h" +#include "trace_output.h" + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_KMEM_OPT_MINIMAL 0x1 + +static struct tracer_opt kmem_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, + { } +}; + +static struct tracer_flags kmem_tracer_flags = { + .val = 0, + .opts = kmem_opts +}; + + +static bool kmem_tracing_enabled __read_mostly; +static struct trace_array *kmemtrace_array; + +static int kmem_trace_init(struct trace_array *tr) +{ + int cpu; + kmemtrace_array = tr; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + + kmem_tracing_enabled = true; + + return 0; +} + +static void kmem_trace_reset(struct trace_array *tr) +{ + kmem_tracing_enabled = false; +} + +static void kmemtrace_headers(struct seq_file *s) +{ + /* Don't need headers for the original kmemtrace output */ + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return; + + seq_printf(s, "#\n"); + seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " + " POINTER NODE CALLER\n"); + seq_printf(s, "# FREE | | | | " + " | | | |\n"); + seq_printf(s, "# |\n\n"); +} + +/* + * The two following functions give the original output from kmemtrace, + * or something close to....perhaps they need some missing things + */ +static enum print_line_t +kmemtrace_print_alloc_original(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr, + (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, + (unsigned long) entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_original(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +/* The two other following provide a more minimalistic output */ +static enum print_line_t +kmemtrace_print_alloc_compress(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Alloc entry */ + ret = trace_seq_printf(s, " + "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? "); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Requested */ + ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Allocated */ + ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Flags + * TODO: would be better to see the name of the GFP flag names + */ + ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Node */ + ret = trace_seq_printf(s, "%4d ", entry->node); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_compress(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Free entry */ + ret = trace_seq_printf(s, " - "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? "); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip requested/allocated/flags */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip node */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_KMEM_ALLOC: { + struct kmemtrace_alloc_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_alloc_compress(iter, field); + else + return kmemtrace_print_alloc_original(iter, field); + } + + case TRACE_KMEM_FREE: { + struct kmemtrace_free_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_free_compress(iter, field); + else + return kmemtrace_print_free_original(iter, field); + } + + default: + return TRACE_TYPE_UNHANDLED; + } +} + +/* Trace allocations */ +void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct ring_buffer_event *event; + struct kmemtrace_alloc_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} +EXPORT_SYMBOL(kmemtrace_mark_alloc_node); + +void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + struct ring_buffer_event *event; + struct kmemtrace_free_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} +EXPORT_SYMBOL(kmemtrace_mark_free); + +static struct tracer kmem_tracer __read_mostly = { + .name = "kmemtrace", + .init = kmem_trace_init, + .reset = kmem_trace_reset, + .print_line = kmemtrace_print_line, + .print_header = kmemtrace_headers, + .flags = &kmem_tracer_flags +}; + +void kmemtrace_init(void) +{ + /* earliest opportunity to start kmem tracing */ +} + +static int __init init_kmem_tracer(void) +{ + return register_tracer(&kmem_tracer); +} + +device_initcall(init_kmem_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd38c5cfd8a..b36d7374cee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -123,8 +123,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); #define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) -#define RB_ALIGNMENT_SHIFT 2 -#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) +#define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 enum { @@ -133,7 +132,7 @@ enum { }; /* inline for ring buffer fast paths */ -static inline unsigned +static unsigned rb_event_length(struct ring_buffer_event *event) { unsigned length; @@ -151,7 +150,7 @@ rb_event_length(struct ring_buffer_event *event) case RINGBUF_TYPE_DATA: if (event->len) - length = event->len << RB_ALIGNMENT_SHIFT; + length = event->len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -179,7 +178,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ -static inline void * +static void * rb_event_data(struct ring_buffer_event *event) { BUG_ON(event->type != RINGBUF_TYPE_DATA); @@ -229,10 +228,9 @@ static void rb_init_page(struct buffer_data_page *bpage) * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. */ -static inline void free_buffer_page(struct buffer_page *bpage) +static void free_buffer_page(struct buffer_page *bpage) { - if (bpage->page) - free_page((unsigned long)bpage->page); + free_page((unsigned long)bpage->page); kfree(bpage); } @@ -811,7 +809,7 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static inline int +static int rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -825,7 +823,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static inline void +static void rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -850,7 +848,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, local_set(&cpu_buffer->commit_page->page->commit, index); } -static inline void +static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { /* @@ -896,7 +894,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; } -static inline void rb_inc_iter(struct ring_buffer_iter *iter) +static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -926,7 +924,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) * and with this, we can determine what to place into the * data field. */ -static inline void +static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { @@ -938,15 +936,11 @@ rb_update_event(struct ring_buffer_event *event, break; case RINGBUF_TYPE_TIME_EXTEND: - event->len = - (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); break; case RINGBUF_TYPE_TIME_STAMP: - event->len = - (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; case RINGBUF_TYPE_DATA: @@ -955,16 +949,14 @@ rb_update_event(struct ring_buffer_event *event, event->len = 0; event->array[0] = length; } else - event->len = - (length + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); } } -static inline unsigned rb_calculate_event_length(unsigned length) +static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -1438,7 +1430,7 @@ int ring_buffer_write(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = cpu_buffer->head_page; @@ -2277,9 +2269,24 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, if (buffer_a->pages != buffer_b->pages) return -EINVAL; + if (ring_buffer_flags != RB_BUFFERS_ON) + return -EAGAIN; + + if (atomic_read(&buffer_a->record_disabled)) + return -EAGAIN; + + if (atomic_read(&buffer_b->record_disabled)) + return -EAGAIN; + cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + if (atomic_read(&cpu_buffer_a->record_disabled)) + return -EAGAIN; + + if (atomic_read(&cpu_buffer_b->record_disabled)) + return -EAGAIN; + /* * We can't do a synchronize_sched here because this * function can be called in atomic context. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 17bb88d86ac..fd51cf0b94c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -37,6 +37,7 @@ #include <linux/irqflags.h> #include "trace.h" +#include "trace_output.h" #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) @@ -52,6 +53,11 @@ unsigned long __read_mostly tracing_thresh; */ static bool __read_mostly tracing_selftest_running; +/* + * If a tracer is running, we do not want to run SELFTEST. + */ +static bool __read_mostly tracing_selftest_disabled; + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -109,14 +115,19 @@ static cpumask_var_t __read_mostly tracing_buffer_mask; */ int ftrace_dump_on_oops; -static int tracing_set_tracer(char *buf); +static int tracing_set_tracer(const char *buf); + +#define BOOTUP_TRACER_SIZE 100 +static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; +static char *default_bootup_tracer; static int __init set_ftrace(char *str) { - tracing_set_tracer(str); + strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); + default_bootup_tracer = bootup_tracer_buf; return 1; } -__setup("ftrace", set_ftrace); +__setup("ftrace=", set_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { @@ -186,9 +197,6 @@ int tracing_is_enabled(void) return tracer_enabled; } -/* function tracing enabled */ -int ftrace_function_enabled; - /* * trace_buf_size is the size in bytes that is allocated * for a buffer. Note, the number of bytes is always rounded @@ -229,7 +237,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO; /** * trace_wake_up - wake up tasks waiting for trace input @@ -287,6 +295,7 @@ static const char *trace_options[] = { "userstacktrace", "sym-userobj", "printk-msg-only", + "context-info", NULL }; @@ -329,132 +338,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) -{ - int len = (PAGE_SIZE - 1) - s->len; - va_list ap; - int ret; - - if (!len) - return 0; - - va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); - va_end(ap); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) - return 0; - - s->len += ret; - - return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. - */ -static int -trace_seq_puts(struct trace_seq *s, const char *str) -{ - int len = strlen(str); - - if (len > ((PAGE_SIZE - 1) - s->len)) - return 0; - - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; -} - -static int -trace_seq_putc(struct trace_seq *s, unsigned char c) -{ - if (s->len >= (PAGE_SIZE - 1)) - return 0; - - s->buffer[s->len++] = c; - - return 1; -} - -static int -trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) -{ - if (len > ((PAGE_SIZE - 1) - s->len)) - return 0; - - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; -} - -#define MAX_MEMHEX_BYTES 8 -#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) - -static int -trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) -{ - unsigned char hex[HEX_CHARS]; - unsigned char *data = mem; - int i, j; - -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < len; i++) { -#else - for (i = len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - hex[j++] = ' '; - - return trace_seq_putmem(s, hex, j); -} - -static int -trace_seq_path(struct trace_seq *s, struct path *path) -{ - unsigned char *p; - - if (s->len >= (PAGE_SIZE - 1)) - return 0; - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; - } - - return 0; -} - static void trace_seq_reset(struct trace_seq *s) { @@ -543,7 +426,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) ftrace_enable_cpu(); - WARN_ON_ONCE(ret); + WARN_ON_ONCE(ret && ret != -EAGAIN); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); @@ -596,7 +479,7 @@ int register_tracer(struct tracer *type) type->flags->opts = dummy_tracer_opt; #ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest) { + if (type->selftest && !tracing_selftest_disabled) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; int i; @@ -638,8 +521,25 @@ int register_tracer(struct tracer *type) out: tracing_selftest_running = false; mutex_unlock(&trace_types_lock); - lock_kernel(); + if (!ret && default_bootup_tracer) { + if (!strncmp(default_bootup_tracer, type->name, + BOOTUP_TRACER_SIZE)) { + printk(KERN_INFO "Starting tracer '%s'\n", + type->name); + /* Do we want this tracer to start on bootup? */ + tracing_set_tracer(type->name); + default_bootup_tracer = NULL; + /* disable other selftests, since this will break it. */ + tracing_selftest_disabled = 1; +#ifdef CONFIG_FTRACE_STARTUP_TEST + printk(KERN_INFO "Disabling FTRACE selftests due" + " to running tracer '%s'\n", type->name); +#endif + } + } + + lock_kernel(); return ret; } @@ -738,13 +638,12 @@ void tracing_start(void) return; spin_lock_irqsave(&tracing_start_lock, flags); - if (--trace_stop_count) - goto out; - - if (trace_stop_count < 0) { - /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - trace_stop_count = 0; + if (--trace_stop_count) { + if (trace_stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + trace_stop_count = 0; + } goto out; } @@ -960,10 +859,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags, pc); } -static void ftrace_trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags, - int skip, int pc) +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) { #ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; @@ -971,9 +870,6 @@ static void ftrace_trace_stack(struct trace_array *tr, struct stack_trace trace; unsigned long irq_flags; - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); if (!event) @@ -994,12 +890,23 @@ static void ftrace_trace_stack(struct trace_array *tr, #endif } +static void ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(tr, data, flags, skip, pc); +} + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, - int skip) + int skip, int pc) { - ftrace_trace_stack(tr, data, flags, skip, preempt_count()); + __ftrace_trace_stack(tr, data, flags, skip, pc); } static void ftrace_trace_userstack(struct trace_array *tr, @@ -1163,65 +1070,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_TRACER -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu, resched; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - resched = ftrace_preempt_disable(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); -} - -static void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - trace_function(tr, data, ip, parent_ip, flags, pc); - } - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - #ifdef CONFIG_FUNCTION_GRAPH_TRACER int trace_graph_entry(struct ftrace_graph_ent *trace) { @@ -1279,31 +1127,6 @@ void trace_graph_return(struct ftrace_graph_ret *trace) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, -}; - -void tracing_start_function_trace(void) -{ - ftrace_function_enabled = 0; - - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - - register_ftrace_function(&trace_ops); - ftrace_function_enabled = 1; -} - -void tracing_stop_function_trace(void) -{ - ftrace_function_enabled = 0; - unregister_ftrace_function(&trace_ops); -} -#endif - enum trace_file_type { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, @@ -1376,8 +1199,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) } /* Find the next real entry, without updating the iterator itself */ -static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts) { return __find_next_entry(iter, ent_cpu, ent_ts); } @@ -1472,154 +1295,6 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } -#ifdef CONFIG_KRETPROBES -static inline const char *kretprobed(const char *name) -{ - static const char tramp_name[] = "kretprobe_trampoline"; - int size = sizeof(tramp_name); - - if (strncmp(tramp_name, name, size) == 0) - return "[unknown/kretprobe'd]"; - return name; -} -#else -static inline const char *kretprobed(const char *name) -{ - return name; -} -#endif /* CONFIG_KRETPROBES */ - -static int -seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - const char *name; - - kallsyms_lookup(address, NULL, NULL, NULL, str); - - name = kretprobed(str); - - return trace_seq_printf(s, fmt, name); -#endif - return 1; -} - -static int -seq_print_sym_offset(struct trace_seq *s, const char *fmt, - unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - const char *name; - - sprint_symbol(str, address); - name = kretprobed(str); - - return trace_seq_printf(s, fmt, name); -#endif - return 1; -} - -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - -int -seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) -{ - int ret; - - if (!ip) - return trace_seq_printf(s, "0"); - - if (sym_flags & TRACE_ITER_SYM_OFFSET) - ret = seq_print_sym_offset(s, "%s", ip); - else - ret = seq_print_sym_short(s, "%s", ip); - - if (!ret) - return 0; - - if (sym_flags & TRACE_ITER_SYM_ADDR) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags) -{ - struct file *file = NULL; - unsigned long vmstart = 0; - int ret = 1; - - if (mm) { - const struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, ip); - if (vma) { - file = vma->vm_file; - vmstart = vma->vm_start; - } - if (file) { - ret = trace_seq_path(s, &file->f_path); - if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); - } - up_read(&mm->mmap_sem); - } - if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -static int -seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) -{ - struct mm_struct *mm = NULL; - int ret = 1; - unsigned int i; - - if (trace_flags & TRACE_ITER_SYM_USEROBJ) { - struct task_struct *task; - /* - * we do the lookup on the thread group leader, - * since individual threads might have already quit! - */ - rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); - if (task) - mm = get_task_mm(task); - rcu_read_unlock(); - } - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - unsigned long ip = entry->caller[i]; - - if (ip == ULONG_MAX || !ret) - break; - if (i && ret) - ret = trace_seq_puts(s, " <- "); - if (!ip) { - if (ret) - ret = trace_seq_puts(s, "??"); - continue; - } - if (!ret) - break; - if (ret) - ret = seq_print_user_ip(s, mm, ip, sym_flags); - } - - if (mm) - mmput(mm); - return ret; -} - static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1704,103 +1379,6 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -static void -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) -{ - int hardirq, softirq; - char *comm; - - comm = trace_find_cmdline(entry->pid); - - trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "%3d", cpu); - trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); - - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; - softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (hardirq && softirq) { - trace_seq_putc(s, 'H'); - } else { - if (hardirq) { - trace_seq_putc(s, 'h'); - } else { - if (softirq) - trace_seq_putc(s, 's'); - else - trace_seq_putc(s, '.'); - } - } - - if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); - else - trace_seq_puts(s, "."); -} - -unsigned long preempt_mark_thresh = 100; - -static void -lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, - unsigned long rel_usecs) -{ - trace_seq_printf(s, " %4lldus", abs_usecs); - if (rel_usecs > preempt_mark_thresh) - trace_seq_puts(s, "!: "); - else if (rel_usecs > 1) - trace_seq_puts(s, "+: "); - else - trace_seq_puts(s, " : "); -} - -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - -/* - * The message is supposed to contain an ending newline. - * If the printing stops prematurely, try to add a newline of our own. - */ -void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) -{ - struct trace_entry *ent; - struct trace_field_cont *cont; - bool ok = true; - - ent = peek_next_entry(iter, iter->cpu, NULL); - if (!ent || ent->type != TRACE_CONT) { - trace_seq_putc(s, '\n'); - return; - } - - do { - cont = (struct trace_field_cont *)ent; - if (ok) - ok = (trace_seq_printf(s, "%s", cont->buf) > 0); - - ftrace_disable_cpu(); - - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - else - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); - - ftrace_enable_cpu(); - - ent = peek_next_entry(iter, iter->cpu, NULL); - } while (ent && ent->type == TRACE_CONT); - - if (!ok) - trace_seq_putc(s, '\n'); -} - static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1818,138 +1396,31 @@ static void test_cpu_buff_start(struct trace_iterator *iter) trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } -static enum print_line_t -print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) +static enum print_line_t print_lat_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *next_entry; - unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + struct trace_event *event; struct trace_entry *entry = iter->ent; - unsigned long abs_usecs; - unsigned long rel_usecs; - u64 next_ts; - char *comm; - int S, T; - int i; - - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; test_cpu_buff_start(iter); - next_entry = find_next_entry(iter, NULL, &next_ts); - if (!next_entry) - next_ts = iter->ts; - rel_usecs = ns2usecs(next_ts - iter->ts); - abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); - - if (verbose) { - comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]" - " %ld.%03ldms (+%ld.%03ldms): ", - comm, - entry->pid, cpu, entry->flags, - entry->preempt_count, trace_idx, - ns2usecs(iter->ts), - abs_usecs/1000, - abs_usecs % 1000, rel_usecs/1000, - rel_usecs % 1000); - } else { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); - } - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_puts(s, " ("); - seq_print_ip_sym(s, field->parent_ip, sym_flags); - trace_seq_puts(s, ")\n"); - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); - comm = trace_find_cmdline(field->next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, entry->type == TRACE_CTX ? "==>" : " +", - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm); - break; - } - case TRACE_SPECIAL: { - struct special_entry *field; - - trace_assign_type(field, entry); - - trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - break; - } - case TRACE_STACK: { - struct stack_entry *field; - - trace_assign_type(field, entry); + event = ftrace_find_event(entry->type); - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) - trace_seq_puts(s, " <= "); - seq_print_ip_sym(s, field->caller[i], sym_flags); - } - trace_seq_puts(s, "\n"); - break; - } - case TRACE_PRINT: { - struct print_entry *field; - - trace_assign_type(field, entry); - - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_printf(s, ": %s", field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - break; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (!trace_print_lat_context(iter)) + goto partial; } - case TRACE_BRANCH: { - struct trace_branch *field; - trace_assign_type(field, entry); + if (event && event->latency_trace) + return event->latency_trace(iter, sym_flags); - trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? " ok " : " MISS ", - field->func, - field->file, - field->line); - break; - } - case TRACE_USER_STACK: { - struct userstack_entry *field; - - trace_assign_type(field, entry); + if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) + goto partial; - seq_print_userip_objs(field, s, sym_flags); - trace_seq_putc(s, '\n'); - break; - } - default: - trace_seq_printf(s, "Unknown type %d\n", entry->type); - } return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } static enum print_line_t print_trace_fmt(struct trace_iterator *iter) @@ -1957,313 +1428,78 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; - unsigned long usec_rem; - unsigned long long t; - unsigned long secs; - char *comm; - int ret; - int S, T; - int i; + struct trace_event *event; entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - test_cpu_buff_start(iter); - comm = trace_find_cmdline(iter->ent->pid); - - t = ns2usecs(iter->ts); - usec_rem = do_div(t, 1000000ULL); - secs = (unsigned long)t; - - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "[%03d] ", iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + event = ftrace_find_event(entry->type); - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - ret = seq_print_ip_sym(s, field->ip, sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - if ((sym_flags & TRACE_ITER_PRINT_PARENT) && - field->parent_ip) { - ret = trace_seq_printf(s, " <-"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = seq_print_ip_sym(s, - field->parent_ip, - sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_printf(s, "\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); - ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", - field->prev_pid, - field->prev_prio, - S, - entry->type == TRACE_CTX ? "==>" : " +", - field->next_cpu, - field->next_pid, - field->next_prio, - T); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (!trace_print_context(iter)) + goto partial; } - case TRACE_SPECIAL: { - struct special_entry *field; - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_STACK: { - struct stack_entry *field; - - trace_assign_type(field, entry); - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) { - ret = trace_seq_puts(s, " <= "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = seq_print_ip_sym(s, field->caller[i], - sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_puts(s, "\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_PRINT: { - struct print_entry *field; - - trace_assign_type(field, entry); - - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_printf(s, ": %s", field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - break; - } - case TRACE_GRAPH_RET: { - return print_graph_function(iter); - } - case TRACE_GRAPH_ENT: { - return print_graph_function(iter); - } - case TRACE_BRANCH: { - struct trace_branch *field; + if (event && event->trace) + return event->trace(iter, sym_flags); - trace_assign_type(field, entry); + if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) + goto partial; - trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? " ok " : " MISS ", - field->func, - field->file, - field->line); - break; - } - case TRACE_USER_STACK: { - struct userstack_entry *field; - - trace_assign_type(field, entry); - - ret = seq_print_userip_objs(field, s, sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_putc(s, '\n'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - } return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; - int ret; - int S, T; + struct trace_event *event; entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - - ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%x %x\n", - field->ip, - field->parent_ip); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = entry->type == TRACE_WAKE ? '+' : - task_state_char(field->prev_state); - ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (!trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts)) + goto partial; } - case TRACE_PRINT: { - struct print_entry *field; - trace_assign_type(field, entry); + event = ftrace_find_event(entry->type); + if (event && event->raw) + return event->raw(iter, 0); + + if (!trace_seq_printf(s, "%d ?\n", entry->type)) + goto partial; - trace_seq_printf(s, "# %lx %s", field->ip, field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - break; - } - } return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } -#define SEQ_PUT_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return 0; \ -} while (0) - -#define SEQ_PUT_HEX_FIELD_RET(s, x) \ -do { \ - BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ - if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return 0; \ -} while (0) - static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; - int S, T; + struct trace_event *event; entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, iter->ts); - - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_HEX_FIELD_RET(s, field->ip); - SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); - break; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); + SEQ_PUT_HEX_FIELD_RET(s, iter->ts); } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = entry->type == TRACE_WAKE ? '+' : - task_state_char(field->prev_state); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); - SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); - SEQ_PUT_HEX_FIELD_RET(s, T); - break; - } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - - trace_assign_type(field, entry); - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - break; - } + event = ftrace_find_event(entry->type); + if (event && event->hex) { + enum print_line_t ret = event->hex(iter, 0); + if (ret != TRACE_TYPE_HANDLED) + return ret; } + SEQ_PUT_FIELD_RET(s, newline); return TRACE_TYPE_HANDLED; @@ -2278,13 +1514,10 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, field->buf); + ret = trace_seq_printf(s, "%s", field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - return TRACE_TYPE_HANDLED; } @@ -2292,53 +1525,21 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; + struct trace_event *event; entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); - SEQ_PUT_FIELD_RET(s, iter->ts); - - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_FIELD_RET(s, field->ip); - SEQ_PUT_FIELD_RET(s, field->parent_ip); - break; - } - case TRACE_CTX: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_FIELD_RET(s, field->prev_pid); - SEQ_PUT_FIELD_RET(s, field->prev_prio); - SEQ_PUT_FIELD_RET(s, field->prev_state); - SEQ_PUT_FIELD_RET(s, field->next_pid); - SEQ_PUT_FIELD_RET(s, field->next_prio); - SEQ_PUT_FIELD_RET(s, field->next_state); - break; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, iter->ts); } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - trace_assign_type(field, entry); + event = ftrace_find_event(entry->type); + if (event && event->binary) + return event->binary(iter, 0); - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - break; - } - } - return 1; + return TRACE_TYPE_HANDLED; } static int trace_empty(struct trace_iterator *iter) @@ -2383,7 +1584,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return print_raw_fmt(iter); if (iter->iter_flags & TRACE_FILE_LAT_FMT) - return print_lat_fmt(iter, iter->idx, iter->cpu); + return print_lat_fmt(iter); return print_trace_fmt(iter); } @@ -2985,7 +2186,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static int tracing_set_tracer(char *buf) +static int tracing_set_tracer(const char *buf) { struct trace_array *tr = &global_trace; struct tracer *t; @@ -3691,6 +2892,15 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) } EXPORT_SYMBOL_GPL(__ftrace_printk); +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { @@ -3871,14 +3081,10 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); register_tracer(&nop_trace); + current_trace = &nop_trace; #ifdef CONFIG_BOOT_TRACER register_tracer(&boot_tracer); - current_trace = &boot_tracer; - current_trace->init(&global_trace); -#else - current_trace = &nop_trace; #endif - /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -3895,5 +3101,26 @@ out_free_buffer_mask: out: return ret; } + +__init static int clear_boot_tracer(void) +{ + /* + * The default tracer at boot buffer is an init section. + * This function is called in lateinit. If we did not + * find the boot tracer, then clear it out, to prevent + * later registration from accessing the buffer that is + * about to be freed. + */ + if (!default_bootup_tracer) + return 0; + + printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", + default_bootup_tracer); + default_bootup_tracer = NULL; + + return 0; +} + early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); +late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4d3d381bfd9..f0c7a0f08ca 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,6 +9,7 @@ #include <linux/mmiotrace.h> #include <linux/ftrace.h> #include <trace/boot.h> +#include <trace/kmemtrace.h> enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -16,7 +17,6 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, - TRACE_CONT, TRACE_STACK, TRACE_PRINT, TRACE_SPECIAL, @@ -29,9 +29,12 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_KMEM_ALLOC, + TRACE_KMEM_FREE, TRACE_POWER, + TRACE_BLK, - __TRACE_LAST_TYPE + __TRACE_LAST_TYPE, }; /* @@ -170,6 +173,24 @@ struct trace_power { struct power_trace state_data; }; +struct kmemtrace_alloc_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; + size_t bytes_req; + size_t bytes_alloc; + gfp_t gfp_flags; + int node; +}; + +struct kmemtrace_free_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -178,7 +199,6 @@ struct trace_power { * NEED_RESCED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler - * CONT - multiple entries hold the trace item */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, @@ -186,7 +206,6 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, - TRACE_FLAG_CONT = 0x20, }; #define TRACE_BUF_SIZE 1024 @@ -262,7 +281,6 @@ extern void __ftrace_bad_type(void); do { \ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ - IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ @@ -280,6 +298,10 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ + IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ + TRACE_KMEM_ALLOC); \ + IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ + TRACE_KMEM_FREE); \ __ftrace_bad_type(); \ } while (0) @@ -313,6 +335,7 @@ struct tracer_flags { /* Makes more easy to define a tracer opt */ #define TRACER_OPT(s, b) .name = #s, .bit = b + /* * A specific tracer, represented by methods that operate on a trace array: */ @@ -340,6 +363,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; + struct tracer_stat *stats; }; struct trace_seq { @@ -381,6 +405,10 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); + +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts); + void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); @@ -415,7 +443,6 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -434,15 +461,12 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -extern cycle_t ftrace_now(int cpu); +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc); -#ifdef CONFIG_FUNCTION_TRACER -void tracing_start_function_trace(void); -void tracing_stop_function_trace(void); -#else -# define tracing_start_function_trace() do { } while (0) -# define tracing_stop_function_trace() do { } while (0) -#endif +extern cycle_t ftrace_now(int cpu); #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void @@ -456,10 +480,10 @@ struct tracer_switch_ops { void *private; struct tracer_switch_ops *next; }; - -char *trace_find_cmdline(int pid); #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ +extern char *trace_find_cmdline(int pid); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func @@ -488,15 +512,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace, #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); -extern void trace_seq_print_cont(struct trace_seq *s, - struct trace_iterator *iter); - -extern int -seq_print_ip_sym(struct trace_seq *s, unsigned long ip, - unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); extern long ns2usecs(cycle_t nsec); extern int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); @@ -580,7 +595,8 @@ enum trace_iterator_flags { TRACE_ITER_ANNOTATE = 0x2000, TRACE_ITER_USERSTACKTRACE = 0x4000, TRACE_ITER_SYM_USEROBJ = 0x8000, - TRACE_ITER_PRINTK_MSGONLY = 0x10000 + TRACE_ITER_PRINTK_MSGONLY = 0x10000, + TRACE_ITER_CONTEXT_INFO = 0x20000 /* Print pid/cpu/time */ }; /* diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 366c8c333e1..1f07895977a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -11,6 +11,7 @@ #include <linux/kallsyms.h> #include "trace.h" +#include "trace_output.h" static struct trace_array *boot_trace; static bool pre_initcalls_finished; @@ -27,13 +28,13 @@ void start_boot_trace(void) void enable_boot_trace(void) { - if (pre_initcalls_finished) + if (boot_trace && pre_initcalls_finished) tracing_start_sched_switch_record(); } void disable_boot_trace(void) { - if (pre_initcalls_finished) + if (boot_trace && pre_initcalls_finished) tracing_stop_sched_switch_record(); } @@ -42,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; + if (!tr) + return 0; + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); @@ -131,7 +135,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!pre_initcalls_finished) + if (!tr || !pre_initcalls_finished) return; /* Get its name now since this function could @@ -163,7 +167,7 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!pre_initcalls_finished) + if (!tr || !pre_initcalls_finished) return; sprint_symbol(bt->func, (unsigned long)fn); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6c00feb3bac..7ac72a44b2d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -14,12 +14,17 @@ #include <linux/hash.h> #include <linux/fs.h> #include <asm/local.h> + #include "trace.h" +#include "trace_stat.h" +#include "trace_output.h" #ifdef CONFIG_BRANCH_TRACER +static struct tracer branch_trace; static int branch_tracing_enabled __read_mostly; static DEFINE_MUTEX(branch_tracing_mutex); + static struct trace_array *branch_tracer; static void @@ -128,11 +133,7 @@ static void stop_branch_trace(struct trace_array *tr) static int branch_trace_init(struct trace_array *tr) { - int cpu; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - + tracing_reset_online_cpus(tr); start_branch_trace(tr); return 0; } @@ -142,22 +143,74 @@ static void branch_trace_reset(struct trace_array *tr) stop_branch_trace(tr); } -struct tracer branch_trace __read_mostly = +static int +trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_branch_print(struct trace_iterator *iter, + int flags) +{ + struct trace_branch *field; + + trace_assign_type(field, iter->ent); + + if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", + field->correct ? " ok " : " MISS ", + field->func, + field->file, + field->line)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +static struct trace_event trace_branch_event = { + .type = TRACE_BRANCH, + .trace = trace_branch_print, + .latency_trace = trace_branch_print, + .raw = trace_nop_print, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + +static struct tracer branch_trace __read_mostly = { .name = "branch", .init = branch_trace_init, .reset = branch_trace_reset, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_branch, -#endif +#endif /* CONFIG_FTRACE_SELFTEST */ }; -__init static int init_branch_trace(void) +__init static int init_branch_tracer(void) { + int ret; + + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "branch events\n"); + return 1; + } return register_tracer(&branch_trace); } +device_initcall(init_branch_tracer); -device_initcall(init_branch_trace); #else static inline void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) @@ -183,66 +236,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) } EXPORT_SYMBOL(ftrace_likely_update); -struct ftrace_pointer { - void *start; - void *stop; - int hit; -}; +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +static int annotated_branch_stat_headers(struct seq_file *m) { - const struct ftrace_pointer *f = m->private; - struct ftrace_branch_data *p = v; - - (*pos)++; - - if (v == (void *)1) - return f->start; - - ++p; - - if ((void *)p >= (void *)f->stop) - return NULL; - - return p; + seq_printf(m, " correct incorrect %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; } -static void *t_start(struct seq_file *m, loff_t *pos) +static inline long get_incorrect_percent(struct ftrace_branch_data *p) { - void *t = (void *)1; - loff_t l = 0; - - for (; t && l < *pos; t = t_next(m, t, &l)) - ; + long percent; - return t; -} + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 100 : -1; -static void t_stop(struct seq_file *m, void *p) -{ + return percent; } -static int t_show(struct seq_file *m, void *v) +static int branch_stat_show(struct seq_file *m, void *v) { - const struct ftrace_pointer *fp = m->private; struct ftrace_branch_data *p = v; const char *f; long percent; - if (v == (void *)1) { - if (fp->hit) - seq_printf(m, " miss hit %% "); - else - seq_printf(m, " correct incorrect %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); - return 0; - } - /* Only print the file, not the path */ f = p->file + strlen(p->file); while (f >= p->file && *f != '/') @@ -252,11 +278,7 @@ static int t_show(struct seq_file *m, void *v) /* * The miss is overlayed on correct, and hit on incorrect. */ - if (p->correct) { - percent = p->incorrect * 100; - percent /= p->correct + p->incorrect; - } else - percent = p->incorrect ? 100 : -1; + percent = get_incorrect_percent(p); seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) @@ -267,76 +289,118 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations tracing_likely_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, +static void *annotated_branch_stat_start(void) +{ + return __start_annotated_branch_profile; +} + +static void * +annotated_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; + + ++p; + + if ((void *)p >= (void *)__stop_annotated_branch_profile) + return NULL; + + return p; +} + +static int annotated_branch_stat_cmp(void *p1, void *p2) +{ + struct ftrace_branch_data *a = p1; + struct ftrace_branch_data *b = p2; + + long percent_a, percent_b; + + percent_a = get_incorrect_percent(a); + percent_b = get_incorrect_percent(b); + + if (percent_a < percent_b) + return -1; + if (percent_a > percent_b) + return 1; + else + return 0; +} + +static struct tracer_stat annotated_branch_stats = { + .name = "branch_annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show }; -static int tracing_branch_open(struct inode *inode, struct file *file) +__init static int init_annotated_branch_stats(void) { int ret; - ret = seq_open(file, &tracing_likely_seq_ops); + ret = register_stat_tracer(&annotated_branch_stats); if (!ret) { - struct seq_file *m = file->private_data; - m->private = (void *)inode->i_private; + printk(KERN_WARNING "Warning: could not register " + "annotated branches stats\n"); + return 1; } - - return ret; + return 0; } - -static const struct file_operations tracing_branch_fops = { - .open = tracing_branch_open, - .read = seq_read, - .llseek = seq_lseek, -}; +fs_initcall(init_annotated_branch_stats); #ifdef CONFIG_PROFILE_ALL_BRANCHES + extern unsigned long __start_branch_profile[]; extern unsigned long __stop_branch_profile[]; -static const struct ftrace_pointer ftrace_branch_pos = { - .start = __start_branch_profile, - .stop = __stop_branch_profile, - .hit = 1, -}; +static int all_branch_stat_headers(struct seq_file *m) +{ + seq_printf(m, " miss hit %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} -#endif /* CONFIG_PROFILE_ALL_BRANCHES */ +static void *all_branch_stat_start(void) +{ + return __start_branch_profile; +} -extern unsigned long __start_annotated_branch_profile[]; -extern unsigned long __stop_annotated_branch_profile[]; +static void * +all_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; -static const struct ftrace_pointer ftrace_annotated_branch_pos = { - .start = __start_annotated_branch_profile, - .stop = __stop_annotated_branch_profile, -}; + ++p; -static __init int ftrace_branch_init(void) -{ - struct dentry *d_tracer; - struct dentry *entry; + if ((void *)p >= (void *)__stop_branch_profile) + return NULL; - d_tracer = tracing_init_dentry(); + return p; +} - entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, - (void *)&ftrace_annotated_branch_pos, - &tracing_branch_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'profile_annotatet_branch' entry\n"); +static struct tracer_stat all_branch_stats = { + .name = "branch_all", + .stat_start = all_branch_stat_start, + .stat_next = all_branch_stat_next, + .stat_headers = all_branch_stat_headers, + .stat_show = branch_stat_show +}; -#ifdef CONFIG_PROFILE_ALL_BRANCHES - entry = debugfs_create_file("profile_branch", 0444, d_tracer, - (void *)&ftrace_branch_pos, - &tracing_branch_fops); - if (!entry) - pr_warning("Could not create debugfs" - " 'profile_branch' entry\n"); -#endif +__init static int all_annotated_branch_stats(void) +{ + int ret; + ret = register_stat_tracer(&all_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "all branches stats\n"); + return 1; + } return 0; } - -device_initcall(ftrace_branch_init); +fs_initcall(all_annotated_branch_stats); +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 9236d7e25a1..b3a320f8aba 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,8 +16,17 @@ #include "trace.h" +/* function tracing enabled */ +static int ftrace_function_enabled; + +static struct trace_array *func_trace; + +static void tracing_start_function_trace(void); +static void tracing_stop_function_trace(void); + static void start_function_trace(struct trace_array *tr) { + func_trace = tr; tr->cpu = get_cpu(); tracing_reset_online_cpus(tr); put_cpu(); @@ -48,14 +57,188 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } +static void +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu, resched; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + local_save_flags(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, data, ip, parent_ip, flags, pc); + + atomic_dec(&data->disabled); + ftrace_preempt_enable(resched); +} + +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + /* + * skip over 5 funcs: + * __ftrace_trace_stack, + * __trace_stack, + * function_stack_trace_call + * ftrace_list_func + * ftrace_call + */ + __trace_stack(tr, data, flags, 5, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; + +static struct ftrace_ops trace_stack_ops __read_mostly = +{ + .func = function_stack_trace_call, +}; + +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static struct tracer_opt func_opts[] = { +#ifdef CONFIG_STACKTRACE + { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, +#endif + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags func_flags = { + .val = 0, /* By default: all flags disabled */ + .opts = func_opts +}; + +static void tracing_start_function_trace(void) +{ + ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + register_ftrace_function(&trace_stack_ops); + else + register_ftrace_function(&trace_ops); + + ftrace_function_enabled = 1; +} + +static void tracing_stop_function_trace(void) +{ + ftrace_function_enabled = 0; + /* OK if they are not registered */ + unregister_ftrace_function(&trace_stack_ops); + unregister_ftrace_function(&trace_ops); +} + +static int func_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_FUNC_OPT_STACK) { + /* do nothing if already set */ + if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) + return 0; + + if (set) { + unregister_ftrace_function(&trace_ops); + register_ftrace_function(&trace_stack_ops); + } else { + unregister_ftrace_function(&trace_stack_ops); + register_ftrace_function(&trace_ops); + } + + return 0; + } + + return -EINVAL; +} + static struct tracer function_trace __read_mostly = { - .name = "function", - .init = function_trace_init, - .reset = function_trace_reset, - .start = function_trace_start, + .name = "function", + .init = function_trace_init, + .reset = function_trace_reset, + .start = function_trace_start, + .flags = &func_flags, + .set_flag = func_set_flag, #ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_function, + .selftest = trace_selftest_startup_function, #endif }; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 930c08e5b38..c97594d826b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1,7 +1,7 @@ /* * * Function graph tracer. - * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com> * Mostly borrowed from function tracer which * is Copyright (c) Steven Rostedt <srostedt@redhat.com> * @@ -12,6 +12,7 @@ #include <linux/fs.h> #include "trace.h" +#include "trace_output.h" #define TRACE_GRAPH_INDENT 2 @@ -20,9 +21,11 @@ #define TRACE_GRAPH_PRINT_CPU 0x2 #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 static struct tracer_opt trace_opts[] = { - /* Display overruns ? */ + /* Display overruns? (for self-debug purpose) */ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, /* Display CPU ? */ { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, @@ -30,29 +33,30 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, /* Display proc name/pid */ { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, + /* Display duration of execution */ + { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, + /* Display absolute time of an entry */ + { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ - .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | + TRACE_GRAPH_PRINT_DURATION, .opts = trace_opts }; /* pid on the last trace processed */ -static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; + static int graph_trace_init(struct trace_array *tr) { - int cpu, ret; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - - ret = register_ftrace_graph(&trace_graph_return, + int ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); if (ret) return ret; + tracing_reset_online_cpus(tr); tracing_start_cmdline_record(); return 0; @@ -153,17 +157,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* If the pid changed since the last trace, output this event */ static enum print_line_t -verif_pid(struct trace_seq *s, pid_t pid, int cpu) +verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) { pid_t prev_pid; + pid_t *last_pid; int ret; - if (last_pid[cpu] != -1 && last_pid[cpu] == pid) + if (!last_pids_cpu) return TRACE_TYPE_HANDLED; - prev_pid = last_pid[cpu]; - last_pid[cpu] = pid; + last_pid = per_cpu_ptr(last_pids_cpu, cpu); + + if (*last_pid == pid) + return TRACE_TYPE_HANDLED; + prev_pid = *last_pid; + *last_pid = pid; + + if (prev_pid == -1) + return TRACE_TYPE_HANDLED; /* * Context-switch trace line: @@ -231,9 +243,34 @@ trace_branch_is_leaf(struct trace_iterator *iter, return true; } +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s) +{ + /* If duration disappear, we don't need anything */ + if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) + return 1; + + /* Non nested entry or return */ + if (duration == -1) + return trace_seq_printf(s, " "); + + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! "); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + } + + return trace_seq_printf(s, " "); +} + static enum print_line_t print_graph_irq(struct trace_seq *s, unsigned long addr, - enum trace_type type, int cpu, pid_t pid) + enum trace_type type, int cpu, pid_t pid) { int ret; @@ -241,35 +278,40 @@ print_graph_irq(struct trace_seq *s, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; - if (type == TRACE_GRAPH_ENT) { - ret = trace_seq_printf(s, "==========> | "); - } else { - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + /* No overhead */ + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (type == TRACE_GRAPH_ENT) + ret = trace_seq_printf(s, "==========>"); + else + ret = trace_seq_printf(s, "<=========="); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Don't close the duration column if haven't one */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + trace_seq_printf(s, " |"); + ret = trace_seq_printf(s, "\n"); - ret = trace_seq_printf(s, "<========== |\n"); - } if (!ret) return TRACE_TYPE_PARTIAL_LINE; return TRACE_TYPE_HANDLED; @@ -288,7 +330,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) sprintf(msecs_str, "%lu", (unsigned long) duration); /* Print msecs */ - ret = trace_seq_printf(s, msecs_str); + ret = trace_seq_printf(s, "%s", msecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -321,19 +363,15 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) } -/* Signal a overhead of time execution to the output */ -static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s) +static int print_graph_abs_time(u64 t, struct trace_seq *s) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - return trace_seq_printf(s, "! "); + unsigned long usecs_rem; - /* Duration exceeded 10 msecs */ - if (duration > 10000ULL) - return trace_seq_printf(s, "+ "); + usecs_rem = do_div(t, 1000000000); + usecs_rem /= 1000; - return trace_seq_printf(s, " "); + return trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); } /* Case of a leaf function on its call entry */ @@ -356,16 +394,16 @@ print_graph_entry_leaf(struct trace_iterator *iter, duration = graph_ret->rettime - graph_ret->calltime; /* Overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = print_graph_overhead(duration, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -394,25 +432,17 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ent *call = &entry->graph_ent; /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - /* Interrupt */ - ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); - if (ret == TRACE_TYPE_UNHANDLED) { - /* No time */ + /* No time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - } else { - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; } - /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -433,15 +463,30 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter, int cpu) + struct trace_iterator *iter) { int ret; + int cpu = iter->cpu; + pid_t *last_entry = iter->private; struct trace_entry *ent = iter->ent; + struct ftrace_graph_ent *call = &field->graph_ent; /* Pid */ - if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Interrupt */ + ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); @@ -469,16 +514,25 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, int cpu) + struct trace_entry *ent, struct trace_iterator *iter) { int i; int ret; + int cpu = iter->cpu; + pid_t *last_pid = iter->private; unsigned long long duration = trace->rettime - trace->calltime; /* Pid */ - if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); @@ -498,16 +552,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } /* Overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = print_graph_overhead(duration, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { @@ -541,14 +595,23 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, { int i; int ret; + int cpu = iter->cpu; + pid_t *last_pid = iter->private; + + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Pid */ - if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, iter->cpu); + ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } @@ -565,17 +628,17 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, } /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* No time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } - /* No time */ - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* Indentation */ if (trace->depth > 0) for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { @@ -589,8 +652,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (ent->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); + /* Strip ending newline */ + if (s->buffer[s->len - 1] == '\n') { + s->buffer[s->len - 1] = '\0'; + s->len--; + } ret = trace_seq_printf(s, " */\n"); if (!ret) @@ -610,13 +676,12 @@ print_graph_function(struct trace_iterator *iter) case TRACE_GRAPH_ENT: { struct ftrace_graph_ent_entry *field; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter, - iter->cpu); + return print_graph_entry(field, s, iter); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter->cpu); + return print_graph_return(&field->ret, s, entry, iter); } case TRACE_PRINT: { struct print_entry *field; @@ -632,28 +697,55 @@ static void print_graph_headers(struct seq_file *s) { /* 1st line */ seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, " TIME "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU "); + seq_printf(s, "CPU"); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, "TASK/PID "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) - seq_printf(s, "OVERHEAD/"); - seq_printf(s, "DURATION FUNCTION CALLS\n"); + seq_printf(s, " TASK/PID "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + seq_printf(s, " DURATION "); + seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); + seq_printf(s, "| "); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, "| | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - seq_printf(s, "| "); - seq_printf(s, "| | | | |\n"); - } else - seq_printf(s, " | | | | |\n"); + seq_printf(s, " | | "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + seq_printf(s, " | | "); + seq_printf(s, " | | | |\n"); +} + +static void graph_trace_open(struct trace_iterator *iter) +{ + /* pid on the last trace processed */ + pid_t *last_pid = alloc_percpu(pid_t); + int cpu; + + if (!last_pid) + pr_warning("function graph tracer: not enough memory\n"); + else + for_each_possible_cpu(cpu) { + pid_t *pid = per_cpu_ptr(last_pid, cpu); + *pid = -1; + } + + iter->private = last_pid; } + +static void graph_trace_close(struct trace_iterator *iter) +{ + percpu_free(iter->private); +} + static struct tracer graph_trace __read_mostly = { .name = "function_graph", + .open = graph_trace_open, + .close = graph_trace_close, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 649df22d435..fff3545fc86 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,7 +1,8 @@ /* * h/w branch tracer for x86 based on bts * - * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com> + * Copyright (C) 2008-2009 Intel Corporation. + * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 * */ @@ -10,21 +11,44 @@ #include <linux/debugfs.h> #include <linux/ftrace.h> #include <linux/kallsyms.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/smp.h> #include <asm/ds.h> #include "trace.h" +#include "trace_output.h" #define SIZEOF_BTS (1 << 13) +/* The tracer mutex protects the below per-cpu tracer array. + It needs to be held to: + - start tracing on all cpus + - stop tracing on all cpus + - start tracing on a single hotplug cpu + - stop tracing on a single hotplug cpu + - read the trace from all cpus + - read the trace from a single cpu +*/ +static DEFINE_MUTEX(bts_tracer_mutex); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) #define this_buffer per_cpu(buffer, smp_processor_id()) +static int __read_mostly trace_hw_branches_enabled; +static struct trace_array *hw_branch_trace __read_mostly; + +/* + * Start tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_mutex must be locked. + */ static void bts_trace_start_cpu(void *arg) { if (this_tracer) @@ -42,14 +66,20 @@ static void bts_trace_start_cpu(void *arg) static void bts_trace_start(struct trace_array *tr) { - int cpu; + mutex_lock(&bts_tracer_mutex); - tracing_reset_online_cpus(tr); + on_each_cpu(bts_trace_start_cpu, NULL, 1); + trace_hw_branches_enabled = 1; - for_each_cpu(cpu, cpu_possible_mask) - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + mutex_unlock(&bts_tracer_mutex); } +/* + * Start tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_mutex must be locked. + */ static void bts_trace_stop_cpu(void *arg) { if (this_tracer) { @@ -60,26 +90,63 @@ static void bts_trace_stop_cpu(void *arg) static void bts_trace_stop(struct trace_array *tr) { - int cpu; + mutex_lock(&bts_tracer_mutex); + + trace_hw_branches_enabled = 0; + on_each_cpu(bts_trace_stop_cpu, NULL, 1); - for_each_cpu(cpu, cpu_possible_mask) + mutex_unlock(&bts_tracer_mutex); +} + +static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + mutex_lock(&bts_tracer_mutex); + + if (!trace_hw_branches_enabled) + goto out; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + break; + case CPU_DOWN_PREPARE: smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); + break; + } + + out: + mutex_unlock(&bts_tracer_mutex); + return NOTIFY_DONE; } +static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { + .notifier_call = bts_hotcpu_handler +}; + static int bts_trace_init(struct trace_array *tr) { + hw_branch_trace = tr; + + register_hotcpu_notifier(&bts_hotcpu_notifier); tracing_reset_online_cpus(tr); bts_trace_start(tr); return 0; } +static void bts_trace_reset(struct trace_array *tr) +{ + bts_trace_stop(tr); + unregister_hotcpu_notifier(&bts_hotcpu_notifier); +} + static void bts_trace_print_header(struct seq_file *m) { - seq_puts(m, - "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, - "# | | | |\n"); + seq_puts(m, "# CPU# TO <- FROM\n"); } static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) @@ -87,15 +154,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) struct trace_entry *entry = iter->ent; struct trace_seq *seq = &iter->seq; struct hw_branch_entry *it; + unsigned long symflags = TRACE_ITER_SYM_OFFSET; trace_assign_type(it, entry); if (entry->type == TRACE_HW_BRANCHES) { if (trace_seq_printf(seq, "%4d ", entry->cpu) && - trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", - it->from, it->to) && - (!it->from || - seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) && + seq_print_ip_sym(seq, it->to, symflags) && + trace_seq_printf(seq, "\t <- ") && + seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; return TRACE_TYPE_PARTIAL_LINE;; @@ -103,26 +170,42 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) return TRACE_TYPE_UNHANDLED; } -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +void trace_hw_branch(u64 from, u64 to) { + struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; - unsigned long irq; + unsigned long irq1, irq2; + int cpu; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); - if (!event) + if (unlikely(!tr)) return; + + if (unlikely(!trace_hw_branches_enabled)) + return; + + local_irq_save(irq1); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2); + if (!event) + goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, from); entry->ent.type = TRACE_HW_BRANCHES; - entry->ent.cpu = smp_processor_id(); + entry->ent.cpu = cpu; entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq); + ring_buffer_unlock_commit(tr->buffer, event, irq2); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(irq1); } -static void trace_bts_at(struct trace_array *tr, - const struct bts_trace *trace, void *at) +static void trace_bts_at(const struct bts_trace *trace, void *at) { struct bts_struct bts; int err = 0; @@ -137,18 +220,29 @@ static void trace_bts_at(struct trace_array *tr, switch (bts.qualifier) { case BTS_BRANCH: - trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); + trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); break; } } +/* + * Collect the trace on the current cpu and write it into the ftrace buffer. + * + * pre: bts_tracer_mutex must be locked + */ static void trace_bts_cpu(void *arg) { struct trace_array *tr = (struct trace_array *) arg; const struct bts_trace *trace; unsigned char *at; - if (!this_tracer) + if (unlikely(!tr)) + return; + + if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) + return; + + if (unlikely(!this_tracer)) return; ds_suspend_bts(this_tracer); @@ -158,11 +252,11 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); out: ds_resume_bts(this_tracer); @@ -170,22 +264,38 @@ out: static void trace_bts_prepare(struct trace_iterator *iter) { - int cpu; + mutex_lock(&bts_tracer_mutex); + + on_each_cpu(trace_bts_cpu, iter->tr, 1); + + mutex_unlock(&bts_tracer_mutex); +} + +static void trace_bts_close(struct trace_iterator *iter) +{ + tracing_reset_online_cpus(iter->tr); +} + +void trace_hw_branch_oops(void) +{ + mutex_lock(&bts_tracer_mutex); + + trace_bts_cpu(hw_branch_trace); - for_each_cpu(cpu, cpu_possible_mask) - smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); + mutex_unlock(&bts_tracer_mutex); } struct tracer bts_tracer __read_mostly = { .name = "hw-branch-tracer", .init = bts_trace_init, - .reset = bts_trace_stop, + .reset = bts_trace_reset, .print_header = bts_trace_print_header, .print_line = bts_trace_print_line, .start = bts_trace_start, .stop = bts_trace_stop, - .open = trace_bts_prepare + .open = trace_bts_prepare, + .close = trace_bts_close }; __init static int init_bts_trace(void) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 62a78d94353..ed344b022a1 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -353,28 +353,18 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -/* - * save_tracer_enabled is used to save the state of the tracer_enabled - * variable when we disable it when we open a trace output file. - */ -static int save_tracer_enabled; - static void start_irqsoff_tracer(struct trace_array *tr) { register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) { + if (tracing_is_enabled()) tracer_enabled = 1; - save_tracer_enabled = 1; - } else { + else tracer_enabled = 0; - save_tracer_enabled = 0; - } } static void stop_irqsoff_tracer(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); } @@ -395,25 +385,11 @@ static void irqsoff_tracer_reset(struct trace_array *tr) static void irqsoff_tracer_start(struct trace_array *tr) { tracer_enabled = 1; - save_tracer_enabled = 1; } static void irqsoff_tracer_stop(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; -} - -static void irqsoff_tracer_open(struct trace_iterator *iter) -{ - /* stop the trace while dumping */ - tracer_enabled = 0; -} - -static void irqsoff_tracer_close(struct trace_iterator *iter) -{ - /* restart tracing */ - tracer_enabled = save_tracer_enabled; } #ifdef CONFIG_IRQSOFF_TRACER @@ -431,8 +407,6 @@ static struct tracer irqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -459,8 +433,6 @@ static struct tracer preemptoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -489,8 +461,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fffcb069f1d..ec78e244242 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,8 +9,10 @@ #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> +#include <asm/atomic.h> #include "trace.h" +#include "trace_output.h" struct header_iter { struct pci_dev *dev; @@ -19,6 +21,7 @@ struct header_iter { static struct trace_array *mmio_trace_array; static bool overrun_detected; static unsigned long prev_overruns; +static atomic_t dropped_count; static void mmio_reset_data(struct trace_array *tr) { @@ -121,11 +124,11 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { - unsigned long cnt = 0; + unsigned long cnt = atomic_xchg(&dropped_count, 0); unsigned long over = ring_buffer_overruns(iter->tr->buffer); if (over > prev_overruns) - cnt = over - prev_overruns; + cnt += over - prev_overruns; prev_overruns = over; return cnt; } @@ -181,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) switch (rw->opcode) { case MMIO_READ: ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," + "%02lx 0x%lx %d\n", secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, @@ -227,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) switch (m->opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", secs, usec_rem, m->map_id, (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: ret = trace_seq_printf(s, - "UNMAP %lu.%06lu %d 0x%lx %d\n", + "UNMAP %u.%06lu %d 0x%lx %d\n", secs, usec_rem, m->map_id, 0UL, 0); break; default: @@ -258,13 +262,10 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - return TRACE_TYPE_HANDLED; } @@ -310,8 +311,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); - if (!event) + if (!event) { + atomic_inc(&dropped_count); return; + } entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; @@ -338,8 +341,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr, event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); - if (!event) + if (!event) { + atomic_inc(&dropped_count); return; + } entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index b9767acd30a..087b6cbf4ea 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -47,12 +47,8 @@ static void stop_nop_trace(struct trace_array *tr) static int nop_trace_init(struct trace_array *tr) { - int cpu; ctx_trace = tr; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - + tracing_reset_online_cpus(tr); start_nop_trace(tr); return 0; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c new file mode 100644 index 00000000000..b7380eee9fa --- /dev/null +++ b/kernel/trace/trace_output.c @@ -0,0 +1,910 @@ +/* + * trace_output.c + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + */ + +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/ftrace.h> + +#include "trace_output.h" + +/* must be a power of 2 */ +#define EVENT_HASHSIZE 128 + +static DEFINE_MUTEX(trace_event_mutex); +static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; + +static int next_event_type = __TRACE_LAST_TYPE + 1; + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + int ret; + + if (!len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +int trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +int trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->len >= (PAGE_SIZE - 1)) + return 0; + + s->buffer[s->len++] = c; + + return 1; +} + +int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +{ + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + +int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + unsigned char *data = mem; + int i, j; + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); + } + hex[j++] = ' '; + + return trace_seq_putmem(s, hex, j); +} + +int trace_seq_path(struct trace_seq *s, struct path *path) +{ + unsigned char *p; + + if (s->len >= (PAGE_SIZE - 1)) + return 0; + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + return 0; +} + +#ifdef CONFIG_KRETPROBES +static inline const char *kretprobed(const char *name) +{ + static const char tramp_name[] = "kretprobe_trampoline"; + int size = sizeof(tramp_name); + + if (strncmp(tramp_name, name, size) == 0) + return "[unknown/kretprobe'd]"; + return name; +} +#else +static inline const char *kretprobed(const char *name) +{ + return name; +} +#endif /* CONFIG_KRETPROBES */ + +static int +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); +#endif + return 1; +} + +static int +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + sprint_symbol(str, address); + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); +#endif + return 1; +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (mm) { + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); + } + up_read(&mm->mmap_sem); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + struct mm_struct *mm = NULL; + int ret = 1; + unsigned int i; + + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->ent.tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || !ret) + break; + if (i && ret) + ret = trace_seq_puts(s, " <- "); + if (!ip) { + if (ret) + ret = trace_seq_puts(s, "??"); + continue; + } + if (!ret) + break; + if (ret) + ret = seq_print_user_ip(s, mm, ip, sym_flags); + } + + if (mm) + mmput(mm); + return ret; +} + +int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + ret = seq_print_sym_offset(s, "%s", ip); + else + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; + + if (sym_flags & TRACE_ITER_SYM_ADDR) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", + comm, entry->pid, cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? + 'X' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? + 'N' : '.', + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : softirq ? 's' : '.')) + return 0; + + if (entry->preempt_count) + return trace_seq_printf(s, "%x", entry->preempt_count); + return trace_seq_puts(s, "."); +} + +static unsigned long preempt_mark_thresh = 100; + +static int +lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, + unsigned long rel_usecs) +{ + return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, + rel_usecs > preempt_mark_thresh ? '!' : + rel_usecs > 1 ? '+' : ' '); +} + +int trace_print_context(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + char *comm = trace_find_cmdline(entry->pid); + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned long secs = (unsigned long)t; + + return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", + comm, entry->pid, entry->cpu, secs, usec_rem); +} + +int trace_print_lat_context(struct trace_iterator *iter) +{ + u64 next_ts; + int ret; + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent, + *next_entry = trace_find_next_entry(iter, NULL, + &next_ts); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); + unsigned long rel_usecs; + + if (!next_entry) + next_ts = iter->ts; + rel_usecs = ns2usecs(next_ts - iter->ts); + + if (verbose) { + char *comm = trace_find_cmdline(entry->pid); + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", comm, + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, iter->idx, + ns2usecs(iter->ts), + abs_usecs / USEC_PER_MSEC, + abs_usecs % USEC_PER_MSEC, + rel_usecs / USEC_PER_MSEC, + rel_usecs % USEC_PER_MSEC); + } else { + ret = lat_print_generic(s, entry, entry->cpu); + if (ret) + ret = lat_print_timestamp(s, abs_usecs, rel_usecs); + } + + return ret; +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + +/** + * ftrace_find_event - find a registered event + * @type: the type of event to look for + * + * Returns an event of type @type otherwise NULL + */ +struct trace_event *ftrace_find_event(int type) +{ + struct trace_event *event; + struct hlist_node *n; + unsigned key; + + key = type & (EVENT_HASHSIZE - 1); + + hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { + if (event->type == type) + return event; + } + + return NULL; +} + +/** + * register_ftrace_event - register output for an event type + * @event: the event type to register + * + * Event types are stored in a hash and this hash is used to + * find a way to print an event. If the @event->type is set + * then it will use that type, otherwise it will assign a + * type to use. + * + * If you assign your own type, please make sure it is added + * to the trace_type enum in trace.h, to avoid collisions + * with the dynamic types. + * + * Returns the event type number or zero on error. + */ +int register_ftrace_event(struct trace_event *event) +{ + unsigned key; + int ret = 0; + + mutex_lock(&trace_event_mutex); + + if (!event->type) + event->type = next_event_type++; + else if (event->type > __TRACE_LAST_TYPE) { + printk(KERN_WARNING "Need to add type to trace.h\n"); + WARN_ON(1); + } + + if (ftrace_find_event(event->type)) + goto out; + + key = event->type & (EVENT_HASHSIZE - 1); + + hlist_add_head_rcu(&event->node, &event_hash[key]); + + ret = event->type; + out: + mutex_unlock(&trace_event_mutex); + + return ret; +} + +/** + * unregister_ftrace_event - remove a no longer used event + * @event: the event to remove + */ +int unregister_ftrace_event(struct trace_event *event) +{ + mutex_lock(&trace_event_mutex); + hlist_del(&event->node); + mutex_unlock(&trace_event_mutex); + + return 0; +} + +/* + * Standard events + */ + +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) +{ + return TRACE_TYPE_HANDLED; +} + +/* TRACE_FN */ +static enum print_line_t trace_fn_latency(struct trace_iterator *iter, + int flags) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + if (!trace_seq_puts(s, " (")) + goto partial; + if (!seq_print_ip_sym(s, field->parent_ip, flags)) + goto partial; + if (!trace_seq_puts(s, ")\n")) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { + if (!trace_seq_printf(s, " <-")) + goto partial; + if (!seq_print_ip_sym(s, + field->parent_ip, + flags)) + goto partial; + } + if (!trace_seq_printf(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "%lx %lx\n", + field->ip, + field->parent_ip)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD_RET(s, field->ip); + SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD_RET(s, field->ip); + SEQ_PUT_FIELD_RET(s, field->parent_ip); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .trace = trace_fn_trace, + .latency_trace = trace_fn_latency, + .raw = trace_fn_raw, + .hex = trace_fn_hex, + .binary = trace_fn_bin, +}; + +/* TRACE_CTX an TRACE_WAKE */ +static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, + char *delim) +{ + struct ctx_switch_entry *field; + char *comm; + int S, T; + + trace_assign_type(field, iter->ent); + + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); + comm = trace_find_cmdline(field->next_pid); + if (!trace_seq_printf(&iter->seq, + " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) +{ + return trace_ctxwake_print(iter, "==>"); +} + +static enum print_line_t trace_wake_print(struct trace_iterator *iter, + int flags) +{ + return trace_ctxwake_print(iter, " +"); +} + +static int trace_ctxwake_raw(struct trace_iterator *iter, char S) +{ + struct ctx_switch_entry *field; + int T; + + trace_assign_type(field, iter->ent); + + if (!S) + task_state_char(field->prev_state); + T = task_state_char(field->next_state); + if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) +{ + return trace_ctxwake_raw(iter, 0); +} + +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) +{ + return trace_ctxwake_raw(iter, '+'); +} + + +static int trace_ctxwake_hex(struct trace_iterator *iter, char S) +{ + struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; + int T; + + trace_assign_type(field, iter->ent); + + if (!S) + task_state_char(field->prev_state); + T = task_state_char(field->next_state); + + SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); + SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); + SEQ_PUT_HEX_FIELD_RET(s, T); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) +{ + return trace_ctxwake_hex(iter, 0); +} + +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) +{ + return trace_ctxwake_hex(iter, '+'); +} + +static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, + int flags) +{ + struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD_RET(s, field->prev_pid); + SEQ_PUT_FIELD_RET(s, field->prev_prio); + SEQ_PUT_FIELD_RET(s, field->prev_state); + SEQ_PUT_FIELD_RET(s, field->next_pid); + SEQ_PUT_FIELD_RET(s, field->next_prio); + SEQ_PUT_FIELD_RET(s, field->next_state); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .trace = trace_ctx_print, + .latency_trace = trace_ctx_print, + .raw = trace_ctx_raw, + .hex = trace_ctx_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .trace = trace_wake_print, + .latency_trace = trace_wake_print, + .raw = trace_wake_raw, + .hex = trace_wake_hex, + .binary = trace_ctxwake_bin, +}; + +/* TRACE_SPECIAL */ +static enum print_line_t trace_special_print(struct trace_iterator *iter, + int flags) +{ + struct special_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", + field->arg1, + field->arg2, + field->arg3)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_special_hex(struct trace_iterator *iter, + int flags) +{ + struct special_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD_RET(s, field->arg1); + SEQ_PUT_HEX_FIELD_RET(s, field->arg2); + SEQ_PUT_HEX_FIELD_RET(s, field->arg3); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_special_bin(struct trace_iterator *iter, + int flags) +{ + struct special_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD_RET(s, field->arg1); + SEQ_PUT_FIELD_RET(s, field->arg2); + SEQ_PUT_FIELD_RET(s, field->arg3); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event trace_special_event = { + .type = TRACE_SPECIAL, + .trace = trace_special_print, + .latency_trace = trace_special_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_STACK */ + +static enum print_line_t trace_stack_print(struct trace_iterator *iter, + int flags) +{ + struct stack_entry *field; + struct trace_seq *s = &iter->seq; + int i; + + trace_assign_type(field, iter->ent); + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + if (!trace_seq_puts(s, " <= ")) + goto partial; + + if (!seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; + } + if (!trace_seq_puts(s, "\n")) + goto partial; + } + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_stack_event = { + .type = TRACE_STACK, + .trace = trace_stack_print, + .latency_trace = trace_stack_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_USER_STACK */ +static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, + int flags) +{ + struct userstack_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_userip_objs(field, s, flags)) + goto partial; + + if (!trace_seq_putc(s, '\n')) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_user_stack_event = { + .type = TRACE_USER_STACK, + .trace = trace_user_stack_print, + .latency_trace = trace_user_stack_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .trace = trace_print_print, + .latency_trace = trace_print_print, + .raw = trace_print_raw, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + +static struct trace_event *events[] __initdata = { + &trace_fn_event, + &trace_ctx_event, + &trace_wake_event, + &trace_special_event, + &trace_stack_event, + &trace_user_stack_event, + &trace_print_event, + NULL +}; + +__init static int init_events(void) +{ + struct trace_event *event; + int i, ret; + + for (i = 0; events[i]; i++) { + event = events[i]; + + ret = register_ftrace_event(event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + event->type); + WARN_ON_ONCE(1); + } + } + + return 0; +} +device_initcall(init_events); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h new file mode 100644 index 00000000000..551a25a7221 --- /dev/null +++ b/kernel/trace/trace_output.h @@ -0,0 +1,62 @@ +#ifndef __TRACE_EVENTS_H +#define __TRACE_EVENTS_H + +#include "trace.h" + +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags); + +struct trace_event { + struct hlist_node node; + int type; + trace_print_func trace; + trace_print_func latency_trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +int trace_seq_puts(struct trace_seq *s, const char *str); +int trace_seq_putc(struct trace_seq *s, unsigned char c); +int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); +int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +int trace_seq_path(struct trace_seq *s, struct path *path); +int seq_print_userip_objs(const struct userstack_entry *entry, + struct trace_seq *s, unsigned long sym_flags); +int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags); + +int trace_print_context(struct trace_iterator *iter); +int trace_print_lat_context(struct trace_iterator *iter); + +struct trace_event *ftrace_find_event(int type); +int register_ftrace_event(struct trace_event *event); +int unregister_ftrace_event(struct trace_event *event); + +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); + +#define MAX_MEMHEX_BYTES 8 +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) + +#define SEQ_PUT_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem(s, &(x), sizeof(x))) \ + return TRACE_TYPE_PARTIAL_LINE; \ +} while (0) + +#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +do { \ + BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ + if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ + return TRACE_TYPE_PARTIAL_LINE; \ +} while (0) + +#endif + diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 7bda248daf5..faa6ab7a1f5 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include "trace.h" +#include "trace_output.h" static struct trace_array *power_trace; static int __read_mostly trace_power_enabled; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 42ae1e77b6b..a48c9b4b0c8 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -25,6 +25,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; static unsigned wakeup_prio = -1; +static int wakeup_rt; static raw_spinlock_t wakeup_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -152,6 +153,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, goto out_unlock; trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc); /* * usecs conversion is slow so we try to delay the conversion @@ -182,13 +184,10 @@ out: static void __wakeup_reset(struct trace_array *tr) { - struct trace_array_cpu *data; int cpu; - for_each_possible_cpu(cpu) { - data = tr->data[cpu]; + for_each_possible_cpu(cpu) tracing_reset(tr, cpu); - } wakeup_cpu = -1; wakeup_prio = -1; @@ -213,6 +212,7 @@ static void wakeup_reset(struct trace_array *tr) static void probe_wakeup(struct rq *rq, struct task_struct *p, int success) { + struct trace_array_cpu *data; int cpu = smp_processor_id(); unsigned long flags; long disabled; @@ -224,7 +224,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); - if (likely(!rt_task(p)) || + if ((wakeup_rt && !rt_task(p)) || p->prio >= wakeup_prio || p->prio >= current->prio) return; @@ -252,9 +252,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) local_save_flags(flags); - wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], - CALLER_ADDR1, CALLER_ADDR2, flags, pc); + data = wakeup_trace->data[wakeup_cpu]; + data->preempt_timestamp = ftrace_now(cpu); + tracing_sched_wakeup_trace(wakeup_trace, data, p, current, + flags, pc); + trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, + flags, pc); out_locked: __raw_spin_unlock(&wakeup_lock); @@ -262,12 +265,6 @@ out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } -/* - * save_tracer_enabled is used to save the state of the tracer_enabled - * variable when we disable it when we open a trace output file. - */ -static int save_tracer_enabled; - static void start_wakeup_tracer(struct trace_array *tr) { int ret; @@ -306,13 +303,10 @@ static void start_wakeup_tracer(struct trace_array *tr) register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) { + if (tracing_is_enabled()) tracer_enabled = 1; - save_tracer_enabled = 1; - } else { + else tracer_enabled = 0; - save_tracer_enabled = 0; - } return; fail_deprobe_wake_new: @@ -324,14 +318,13 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); unregister_trace_sched_wakeup(probe_wakeup); } -static int wakeup_tracer_init(struct trace_array *tr) +static int __wakeup_tracer_init(struct trace_array *tr) { tracing_max_latency = 0; wakeup_trace = tr; @@ -339,6 +332,18 @@ static int wakeup_tracer_init(struct trace_array *tr) return 0; } +static int wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + +static int wakeup_rt_tracer_init(struct trace_array *tr) +{ + wakeup_rt = 1; + return __wakeup_tracer_init(tr); +} + static void wakeup_tracer_reset(struct trace_array *tr) { stop_wakeup_tracer(tr); @@ -350,28 +355,11 @@ static void wakeup_tracer_start(struct trace_array *tr) { wakeup_reset(tr); tracer_enabled = 1; - save_tracer_enabled = 1; } static void wakeup_tracer_stop(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; -} - -static void wakeup_tracer_open(struct trace_iterator *iter) -{ - /* stop the trace while dumping */ - tracer_enabled = 0; -} - -static void wakeup_tracer_close(struct trace_iterator *iter) -{ - /* forget about any processes we were recording */ - if (save_tracer_enabled) { - wakeup_reset(iter->tr); - tracer_enabled = 1; - } } static struct tracer wakeup_tracer __read_mostly = @@ -381,8 +369,19 @@ static struct tracer wakeup_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .open = wakeup_tracer_open, - .close = wakeup_tracer_close, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif +}; + +static struct tracer wakeup_rt_tracer __read_mostly = +{ + .name = "wakeup_rt", + .init = wakeup_rt_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, @@ -397,6 +396,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_rt_tracer); + if (ret) + return ret; + return 0; } device_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 88c8eb70f54..5013812578b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,7 +9,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_FN: case TRACE_CTX: case TRACE_WAKE: - case TRACE_CONT: case TRACE_STACK: case TRACE_PRINT: case TRACE_SPECIAL: diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c new file mode 100644 index 00000000000..eae9cef3929 --- /dev/null +++ b/kernel/trace/trace_stat.c @@ -0,0 +1,319 @@ +/* + * Infrastructure for statistic tracing (histogram output). + * + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * + * Based on the code from trace_branch.c which is + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + * + */ + + +#include <linux/list.h> +#include <linux/debugfs.h> +#include "trace_stat.h" +#include "trace.h" + + +/* List of stat entries from a tracer */ +struct trace_stat_list { + struct list_head list; + void *stat; +}; + +/* A stat session is the stats output in one file */ +struct tracer_stat_session { + struct list_head session_list; + struct tracer_stat *ts; + struct list_head stat_list; + struct mutex stat_mutex; + struct dentry *file; +}; + +/* All of the sessions currently in use. Each stat file embeed one session */ +static LIST_HEAD(all_stat_sessions); +static DEFINE_MUTEX(all_stat_sessions_mutex); + +/* The root directory for all stat files */ +static struct dentry *stat_dir; + + +static void reset_stat_session(struct tracer_stat_session *session) +{ + struct trace_stat_list *node, *next; + + list_for_each_entry_safe(node, next, &session->stat_list, list) + kfree(node); + + INIT_LIST_HEAD(&session->stat_list); +} + +static void destroy_session(struct tracer_stat_session *session) +{ + debugfs_remove(session->file); + reset_stat_session(session); + mutex_destroy(&session->stat_mutex); + kfree(session); +} + +/* + * For tracers that don't provide a stat_cmp callback. + * This one will force an immediate insertion on tail of + * the list. + */ +static int dummy_cmp(void *p1, void *p2) +{ + return 1; +} + +/* + * Initialize the stat list at each trace_stat file opening. + * All of these copies and sorting are required on all opening + * since the stats could have changed between two file sessions. + */ +static int stat_seq_init(struct tracer_stat_session *session) +{ + struct trace_stat_list *iter_entry, *new_entry; + struct tracer_stat *ts = session->ts; + void *prev_stat; + int ret = 0; + int i; + + mutex_lock(&session->stat_mutex); + reset_stat_session(session); + + if (!ts->stat_cmp) + ts->stat_cmp = dummy_cmp; + + /* + * The first entry. Actually this is the second, but the first + * one (the stat_list head) is pointless. + */ + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + if (!new_entry) { + ret = -ENOMEM; + goto exit; + } + + INIT_LIST_HEAD(&new_entry->list); + + list_add(&new_entry->list, &session->stat_list); + + new_entry->stat = ts->stat_start(); + prev_stat = new_entry->stat; + + /* + * Iterate over the tracer stat entries and store them in a sorted + * list. + */ + for (i = 1; ; i++) { + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + if (!new_entry) { + ret = -ENOMEM; + goto exit_free_list; + } + + INIT_LIST_HEAD(&new_entry->list); + new_entry->stat = ts->stat_next(prev_stat, i); + + /* End of insertion */ + if (!new_entry->stat) + break; + + list_for_each_entry(iter_entry, &session->stat_list, list) { + + /* Insertion with a descendent sorting */ + if (ts->stat_cmp(new_entry->stat, + iter_entry->stat) > 0) { + + list_add_tail(&new_entry->list, + &iter_entry->list); + break; + + /* The current smaller value */ + } else if (list_is_last(&iter_entry->list, + &session->stat_list)) { + list_add(&new_entry->list, &iter_entry->list); + break; + } + } + + prev_stat = new_entry->stat; + } +exit: + mutex_unlock(&session->stat_mutex); + return ret; + +exit_free_list: + reset_stat_session(session); + mutex_unlock(&session->stat_mutex); + return ret; +} + + +static void *stat_seq_start(struct seq_file *s, loff_t *pos) +{ + struct tracer_stat_session *session = s->private; + + /* Prevent from tracer switch or stat_list modification */ + mutex_lock(&session->stat_mutex); + + /* If we are in the beginning of the file, print the headers */ + if (!*pos && session->ts->stat_headers) + session->ts->stat_headers(s); + + return seq_list_start(&session->stat_list, *pos); +} + +static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) +{ + struct tracer_stat_session *session = s->private; + + return seq_list_next(p, &session->stat_list, pos); +} + +static void stat_seq_stop(struct seq_file *s, void *p) +{ + struct tracer_stat_session *session = s->private; + mutex_unlock(&session->stat_mutex); +} + +static int stat_seq_show(struct seq_file *s, void *v) +{ + struct tracer_stat_session *session = s->private; + struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + + return session->ts->stat_show(s, l->stat); +} + +static const struct seq_operations trace_stat_seq_ops = { + .start = stat_seq_start, + .next = stat_seq_next, + .stop = stat_seq_stop, + .show = stat_seq_show +}; + +/* The session stat is refilled and resorted at each stat file opening */ +static int tracing_stat_open(struct inode *inode, struct file *file) +{ + int ret; + + struct tracer_stat_session *session = inode->i_private; + + ret = seq_open(file, &trace_stat_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = session; + ret = stat_seq_init(session); + } + + return ret; +} + +/* + * Avoid consuming memory with our now useless list. + */ +static int tracing_stat_release(struct inode *i, struct file *f) +{ + struct tracer_stat_session *session = i->i_private; + + mutex_lock(&session->stat_mutex); + reset_stat_session(session); + mutex_unlock(&session->stat_mutex); + + return 0; +} + +static const struct file_operations tracing_stat_fops = { + .open = tracing_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_stat_release +}; + +static int tracing_stat_init(void) +{ + struct dentry *d_tracing; + + d_tracing = tracing_init_dentry(); + + stat_dir = debugfs_create_dir("trace_stat", d_tracing); + if (!stat_dir) + pr_warning("Could not create debugfs " + "'trace_stat' entry\n"); + return 0; +} + +static int init_stat_file(struct tracer_stat_session *session) +{ + if (!stat_dir && tracing_stat_init()) + return -ENODEV; + + session->file = debugfs_create_file(session->ts->name, 0644, + stat_dir, + session, &tracing_stat_fops); + if (!session->file) + return -ENOMEM; + return 0; +} + +int register_stat_tracer(struct tracer_stat *trace) +{ + struct tracer_stat_session *session, *node, *tmp; + int ret; + + if (!trace) + return -EINVAL; + + if (!trace->stat_start || !trace->stat_next || !trace->stat_show) + return -EINVAL; + + /* Already registered? */ + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + mutex_unlock(&all_stat_sessions_mutex); + return -EINVAL; + } + } + mutex_unlock(&all_stat_sessions_mutex); + + /* Init the session */ + session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + if (!session) + return -ENOMEM; + + session->ts = trace; + INIT_LIST_HEAD(&session->session_list); + INIT_LIST_HEAD(&session->stat_list); + mutex_init(&session->stat_mutex); + session->file = NULL; + + ret = init_stat_file(session); + if (ret) { + destroy_session(session); + return ret; + } + + /* Register */ + mutex_lock(&all_stat_sessions_mutex); + list_add_tail(&session->session_list, &all_stat_sessions); + mutex_unlock(&all_stat_sessions_mutex); + + return 0; +} + +void unregister_stat_tracer(struct tracer_stat *trace) +{ + struct tracer_stat_session *node, *tmp; + + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + list_del(&node->session_list); + destroy_session(node); + break; + } + } + mutex_unlock(&all_stat_sessions_mutex); +} diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h new file mode 100644 index 00000000000..202274cf7f3 --- /dev/null +++ b/kernel/trace/trace_stat.h @@ -0,0 +1,31 @@ +#ifndef __TRACE_STAT_H +#define __TRACE_STAT_H + +#include <linux/seq_file.h> + +/* + * If you want to provide a stat file (one-shot statistics), fill + * an iterator with stat_start/stat_next and a stat_show callbacks. + * The others callbacks are optional. + */ +struct tracer_stat { + /* The name of your stat file */ + const char *name; + /* Iteration over statistic entries */ + void *(*stat_start)(void); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for stats sorting */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); +}; + +/* + * Destroy or create a stat file + */ +extern int register_stat_tracer(struct tracer_stat *trace); +extern void unregister_stat_tracer(struct tracer_stat *trace); + +#endif /* __TRACE_STAT_H */ diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c new file mode 100644 index 00000000000..4664990fe9c --- /dev/null +++ b/kernel/trace/trace_workqueue.c @@ -0,0 +1,281 @@ +/* + * Workqueue statistical tracer. + * + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * + */ + + +#include <trace/workqueue.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include "trace_stat.h" +#include "trace.h" + + +/* A cpu workqueue thread */ +struct cpu_workqueue_stats { + struct list_head list; +/* Useful to know if we print the cpu headers */ + bool first_entry; + int cpu; + pid_t pid; +/* Can be inserted from interrupt or user context, need to be atomic */ + atomic_t inserted; +/* + * Don't need to be atomic, works are serialized in a single workqueue thread + * on a single CPU. + */ + unsigned int executed; +}; + +/* List of workqueue threads on one cpu */ +struct workqueue_global_stats { + struct list_head list; + spinlock_t lock; +}; + +/* Don't need a global lock because allocated before the workqueues, and + * never freed. + */ +static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); +#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) + +/* Insertion of a work */ +static void +probe_workqueue_insertion(struct task_struct *wq_thread, + struct work_struct *work) +{ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, + list) { + if (node->pid == wq_thread->pid) { + atomic_inc(&node->inserted); + goto found; + } + } + pr_debug("trace_workqueue: entry not found\n"); +found: + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); +} + +/* Execution of a work */ +static void +probe_workqueue_execution(struct task_struct *wq_thread, + struct work_struct *work) +{ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, + list) { + if (node->pid == wq_thread->pid) { + node->executed++; + goto found; + } + } + pr_debug("trace_workqueue: entry not found\n"); +found: + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); +} + +/* Creation of a cpu workqueue thread */ +static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) +{ + struct cpu_workqueue_stats *cws; + unsigned long flags; + + WARN_ON(cpu < 0 || cpu >= num_possible_cpus()); + + /* Workqueues are sometimes created in atomic context */ + cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); + if (!cws) { + pr_warning("trace_workqueue: not enough memory\n"); + return; + } + tracing_record_cmdline(wq_thread); + + INIT_LIST_HEAD(&cws->list); + cws->cpu = cpu; + + cws->pid = wq_thread->pid; + + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (list_empty(&workqueue_cpu_stat(cpu)->list)) + cws->first_entry = true; + list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); +} + +/* Destruction of a cpu workqueue thread */ +static void probe_workqueue_destruction(struct task_struct *wq_thread) +{ + /* Workqueue only execute on one cpu */ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, + list) { + if (node->pid == wq_thread->pid) { + list_del(&node->list); + kfree(node); + goto found; + } + } + + pr_debug("trace_workqueue: don't find workqueue to destroy\n"); +found: + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + +} + +static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) +{ + unsigned long flags; + struct cpu_workqueue_stats *ret = NULL; + + + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + + if (!list_empty(&workqueue_cpu_stat(cpu)->list)) + ret = list_entry(workqueue_cpu_stat(cpu)->list.next, + struct cpu_workqueue_stats, list); + + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + + return ret; +} + +static void *workqueue_stat_start(void) +{ + int cpu; + void *ret = NULL; + + for_each_possible_cpu(cpu) { + ret = workqueue_stat_start_cpu(cpu); + if (ret) + return ret; + } + return NULL; +} + +static void *workqueue_stat_next(void *prev, int idx) +{ + struct cpu_workqueue_stats *prev_cws = prev; + int cpu = prev_cws->cpu; + unsigned long flags; + void *ret = NULL; + + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + for (++cpu ; cpu < num_possible_cpus(); cpu++) { + ret = workqueue_stat_start_cpu(cpu); + if (ret) + return ret; + } + return NULL; + } + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + + return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, + list); +} + +static int workqueue_stat_show(struct seq_file *s, void *p) +{ + struct cpu_workqueue_stats *cws = p; + unsigned long flags; + int cpu = cws->cpu; + + seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, + atomic_read(&cws->inserted), + cws->executed, + trace_find_cmdline(cws->pid)); + + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (&cws->list == workqueue_cpu_stat(cpu)->list.next) + seq_printf(s, "\n"); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + + return 0; +} + +static int workqueue_stat_headers(struct seq_file *s) +{ + seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); + seq_printf(s, "# | | | |\n\n"); + return 0; +} + +struct tracer_stat workqueue_stats __read_mostly = { + .name = "workqueues", + .stat_start = workqueue_stat_start, + .stat_next = workqueue_stat_next, + .stat_show = workqueue_stat_show, + .stat_headers = workqueue_stat_headers +}; + + +int __init stat_workqueue_init(void) +{ + if (register_stat_tracer(&workqueue_stats)) { + pr_warning("Unable to register workqueue stat tracer\n"); + return 1; + } + + return 0; +} +fs_initcall(stat_workqueue_init); + +/* + * Workqueues are created very early, just after pre-smp initcalls. + * So we must register our tracepoints at this stage. + */ +int __init trace_workqueue_early_init(void) +{ + int ret, cpu; + + ret = register_trace_workqueue_insertion(probe_workqueue_insertion); + if (ret) + goto out; + + ret = register_trace_workqueue_execution(probe_workqueue_execution); + if (ret) + goto no_insertion; + + ret = register_trace_workqueue_creation(probe_workqueue_creation); + if (ret) + goto no_execution; + + ret = register_trace_workqueue_destruction(probe_workqueue_destruction); + if (ret) + goto no_creation; + + for_each_possible_cpu(cpu) { + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); + } + + return 0; + +no_creation: + unregister_trace_workqueue_creation(probe_workqueue_creation); +no_execution: + unregister_trace_workqueue_execution(probe_workqueue_execution); +no_insertion: + unregister_trace_workqueue_insertion(probe_workqueue_insertion); +out: + pr_warning("trace_workqueue: unable to trace workqueues\n"); + + return 1; +} +early_initcall(trace_workqueue_early_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1f0c509b40d..e53ee18ef43 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,6 +33,7 @@ #include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> +#include <trace/workqueue.h> /* * The per-CPU workqueue (if single thread, we always use the first @@ -125,9 +126,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } +DEFINE_TRACE(workqueue_insertion); + static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head) { + trace_workqueue_insertion(cwq->thread, work); + set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the @@ -259,6 +264,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +DEFINE_TRACE(workqueue_execution); + static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -284,7 +291,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) */ struct lockdep_map lockdep_map = work->lockdep_map; #endif - + trace_workqueue_execution(cwq->thread, work); cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irq(&cwq->lock); @@ -765,6 +772,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) return cwq; } +DEFINE_TRACE(workqueue_creation); + static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -787,6 +796,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); cwq->thread = p; + trace_workqueue_creation(cwq->thread, cpu); + return 0; } @@ -868,6 +879,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); +DEFINE_TRACE(workqueue_destruction); + static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* @@ -891,6 +904,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) * checks list_empty(), and a "normal" queue_work() can't use * a dead CPU. */ + trace_workqueue_destruction(cwq->thread); kthread_stop(cwq->thread); cwq->thread = NULL; } diff --git a/mm/slab.c b/mm/slab.c index ddc41f337d5..dae716b3291 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,6 +102,7 @@ #include <linux/cpu.h> #include <linux/sysctl.h> #include <linux/module.h> +#include <trace/kmemtrace.h> #include <linux/rcupdate.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#ifdef CONFIG_KMEMTRACE +size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return cachep->buffer_size; +} +EXPORT_SYMBOL(slab_buffer_size); +#endif + /* * Do not go above this order unless 0 objects fit into the slab. */ @@ -3550,10 +3559,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +{ + return __cache_alloc(cachep, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + /** * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. * @cachep: the cache we're checking against @@ -3598,23 +3620,47 @@ out: #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; + void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node(cachep, flags, node); + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3647,6 +3693,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) { struct kmem_cache *cachep; + void *ret; /* If you want to save a few bytes .text space: replace * __ with kmem_. @@ -3656,11 +3703,17 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return __cache_alloc(cachep, flags, caller); + ret = __cache_alloc(cachep, flags, caller); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); @@ -3699,6 +3752,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3725,6 +3780,8 @@ void kfree(const void *objp) debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp); } EXPORT_SYMBOL(kfree); diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed..4d1c0fc33b6 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,6 +65,7 @@ #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/list.h> +#include <trace/kmemtrace.h> #include <asm/atomic.h> /* @@ -463,27 +464,38 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + void *ret; if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; m = slob_alloc(size + align, gfp, align, node); + if (!m) return NULL; *m = size; - return (void *)m + align; + ret = (void *)m + align; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, size + align, gfp, node); } else { - void *ret; + unsigned int order = get_order(size); - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_page(gfp | __GFP_COMP, order, node); if (ret) { struct page *page; page = virt_to_page(ret); page->private = size; } - return ret; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } + + return ret; } EXPORT_SYMBOL(__kmalloc_node); @@ -501,6 +513,8 @@ void kfree(const void *block) slob_free(m, *m + align); } else put_page(&sp->page); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block); } EXPORT_SYMBOL(kfree); @@ -569,10 +583,19 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; - if (c->size < PAGE_SIZE) + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - else + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { b = slob_new_page(flags, get_order(c->size), node); + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); + } if (c->ctor) c->ctor(b); @@ -608,6 +631,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } else { __kmem_cache_free(b, c->size); } + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a2..0343b3b8898 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <trace/kmemtrace.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/mempolicy.h> @@ -1623,18 +1624,46 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1742,6 +1771,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); slab_free(s, page, x, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2657,6 +2688,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, flags); @@ -2666,7 +2698,12 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, _RET_IP_); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -2685,16 +2722,30 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + if (unlikely(size > PAGE_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc(s, flags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2752,6 +2803,8 @@ void kfree(const void *x) return; } slab_free(page->slab, page, object, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x); } EXPORT_SYMBOL(kfree); @@ -3221,6 +3274,7 @@ static struct notifier_block __cpuinitdata slab_notifier = { void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, gfpflags); @@ -3230,13 +3284,20 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, -1, caller); + + /* Honor the call site pointer we recieved. */ + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size, + s->size, gfpflags); + + return ret; } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large_node(size, gfpflags, node); @@ -3246,7 +3307,13 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc(s, gfpflags, node, caller); + + /* Honor the call site pointer we recieved. */ + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret, + size, s->size, gfpflags, node); + + return ret; } #ifdef CONFIG_SLUB_DEBUG diff --git a/scripts/Makefile.build b/scripts/Makefile.build index c7de8b39fcf..39a9642927d 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -112,13 +112,13 @@ endif # --------------------------------------------------------------------------- # Default is built-in, unless we know otherwise -modkern_cflags := $(CFLAGS_KERNEL) +modkern_cflags = $(if $(part-of-module), $(CFLAGS_MODULE), $(CFLAGS_KERNEL)) quiet_modtag := $(empty) $(empty) -$(real-objs-m) : modkern_cflags := $(CFLAGS_MODULE) -$(real-objs-m:.o=.i) : modkern_cflags := $(CFLAGS_MODULE) -$(real-objs-m:.o=.s) : modkern_cflags := $(CFLAGS_MODULE) -$(real-objs-m:.o=.lst): modkern_cflags := $(CFLAGS_MODULE) +$(real-objs-m) : part-of-module := y +$(real-objs-m:.o=.i) : part-of-module := y +$(real-objs-m:.o=.s) : part-of-module := y +$(real-objs-m:.o=.lst): part-of-module := y $(real-objs-m) : quiet_modtag := [M] $(real-objs-m:.o=.i) : quiet_modtag := [M] @@ -205,7 +205,8 @@ endif ifdef CONFIG_FTRACE_MCOUNT_RECORD cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ "$(if $(CONFIG_64BIT),64,32)" \ - "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)"; + "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" \ + "$(if $(part-of-module),1,0)" "$(@)"; endif define rule_cc_o_c diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index fe831412bea..409596eca12 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -100,14 +100,19 @@ $P =~ s@.*/@@g; my $V = '0.1'; -if ($#ARGV < 6) { - print "usage: $P arch objdump objcopy cc ld nm rm mv inputfile\n"; +if ($#ARGV < 7) { + print "usage: $P arch bits objdump objcopy cc ld nm rm mv is_module inputfile\n"; print "version: $V\n"; exit(1); } my ($arch, $bits, $objdump, $objcopy, $cc, - $ld, $nm, $rm, $mv, $inputfile) = @ARGV; + $ld, $nm, $rm, $mv, $is_module, $inputfile) = @ARGV; + +# This file refers to mcount and shouldn't be ftraced, so lets' ignore it +if ($inputfile eq "kernel/trace/ftrace.o") { + exit(0); +} # Acceptable sections to record. my %text_sections = ( @@ -201,6 +206,13 @@ if ($arch eq "x86_64") { $alignment = 2; $section_type = '%progbits'; +} elsif ($arch eq "ia64") { + $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; + $type = "data8"; + + if ($is_module eq "0") { + $cc .= " -mconstant-gp"; + } } else { die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; } @@ -263,7 +275,6 @@ if (!$found_version) { "\tDisabling local function references.\n"; } - # # Step 1: find all the local (static functions) and weak symbols. # 't' is local, 'w/W' is weak (we never use a weak function) @@ -331,13 +342,16 @@ sub update_funcs # # Step 2: find the sections and mcount call sites # -open(IN, "$objdump -dr $inputfile|") || die "error running $objdump"; +open(IN, "$objdump -hdr $inputfile|") || die "error running $objdump"; my $text; +my $read_headers = 1; + while (<IN>) { # is it a section? if (/$section_regex/) { + $read_headers = 0; # Only record text sections that we know are safe if (defined($text_sections{$1})) { @@ -371,6 +385,19 @@ while (<IN>) { $ref_func = $text; } } + } elsif ($read_headers && /$mcount_section/) { + # + # Somehow the make process can execute this script on an + # object twice. If it does, we would duplicate the mcount + # section and it will cause the function tracer self test + # to fail. Check if the mcount section exists, and if it does, + # warn and exit. + # + print STDERR "ERROR: $mcount_section already in $inputfile\n" . + "\tThis may be an indication that your build is corrupted.\n" . + "\tDelete $inputfile and try again. If the same object file\n" . + "\tstill causes an issue, then disable CONFIG_DYNAMIC_FTRACE.\n"; + exit(-1); } # is this a call site to mcount? If so, record it to print later |