From 771e03842a9e98a1c2013ca1ed8bb2793488f3e5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 30 Nov 2012 10:41:57 -0500 Subject: ring-buffer: Remove unnecessary recursive call in rb_advance_iter() The original ring-buffer code had special checks at the start of rb_advance_iter() and instead of repeating them again at the end of the function if a certain condition existed, I just did a recursive call to rb_advance_iter() because the special condition would cause rb_advance_iter() to return early (after the checks). But as things have changed, the special checks no longer exist and the only thing done for the special condition is to call rb_inc_iter() and return. Instead of doing a confusing recursive call, just call rb_inc_iter() directly. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce8514feedc..6ff9cc4658e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3425,7 +3425,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) /* check for end of page padding */ if ((iter->head >= rb_page_size(iter->head_page)) && (iter->head_page != cpu_buffer->commit_page)) - rb_advance_iter(iter); + rb_inc_iter(iter); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) -- cgit v1.2.3-70-g09d2 From d8a0349c0cea477322c66ea9362f10c62fad5f62 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 13 Nov 2012 09:53:04 +0800 Subject: tracing: Use this_cpu_ptr per-cpu helper typeof(&buffer) is a pointer to an array of 1024 char, i.e. char (*)[1024]. But typeof(&buffer[0]) is a pointer to char, which matches the return type of get_trace_buf(). As is well known, the value of &buffer is equal to &buffer[0], so returning this_cpu_ptr(&percpu_buffer->buffer[0]) avoids a type cast. Link: http://lkml.kernel.org/r/50A1A800.3020102@gmail.com Reviewed-by: Christoph Lameter Signed-off-by: Shan Wei Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 2 +- kernel/trace/trace.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0bd0308741..71259e2b6b6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) 
return; local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c13e46d7d2..f8b7c626f3f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1517,7 +1517,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer; static char *get_trace_buf(void) { struct trace_buffer_struct *percpu_buffer; - struct trace_buffer_struct *buffer; /* * If we have allocated per cpu buffers, then we do not @@ -1535,9 +1534,7 @@ static char *get_trace_buf(void) if (!percpu_buffer) return NULL; - buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); - - return buffer->buffer; + return this_cpu_ptr(&percpu_buffer->buffer[0]); } static int alloc_percpu_trace_buffer(void) -- cgit v1.2.3-70-g09d2 From d24d7dbf3cc49b00a152e55e24f0eeb173c7a971 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 18 Jul 2012 18:16:44 +0800 Subject: tracing: Verify target file before registering a uprobe event Without this patch, we can register a uprobe event for a directory. Enabling such a uprobe event would anyway fail. Example: $ echo 'p /bin:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events However, directories cannot be valid targets for uprobes. Hence verify that the target is a regular file during the probe registration. Link: http://lkml.kernel.org/r/20130103004212.690763002@goodmis.org Cc: Namhyung Kim Signed-off-by: Jovi Zhang Acked-by: Srikar Dronamraju [ cleaned up whitespace and removed redundant IS_DIR() check ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c86e6d4f67f..87b6db4ccbc 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -258,6 +258,10 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; inode = igrab(path.dentry->d_inode); + if (!S_ISREG(inode->i_mode)) { + ret = -EINVAL; + goto fail_address_parse; + } argc -= 2; argv += 2; @@ -356,7 +360,7 @@ fail_address_parse: if (inode) iput(inode); - pr_info("Failed to parse address.\n"); + pr_info("Failed to parse address or file.\n"); return ret; } -- cgit v1.2.3-70-g09d2 From 6aea49cb5f3001a8275bf9c9f586ec3eb39af194 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 21 Nov 2012 15:13:47 +0800 Subject: tracing/syscalls: Make local functions static Some functions in the syscall tracing code are used only locally in the file, but they are labeled global. Convert them to static functions. 
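For readers less used to C linkage, the change amounts to the pattern sketched below (a made-up, minimal userspace example, not a hunk from this patch): marking a file-local helper static gives it internal linkage, so it cannot clash with symbols elsewhere in the kernel and the compiler, or tools such as sparse, can warn when it lacks callers or a prototype.

    /* illustrative only: file and function names are hypothetical */
    #include <stdio.h>

    /* internal linkage: visible only inside this translation unit */
    static int add_one(int x)
    {
            return x + 1;
    }

    int main(void)
    {
            printf("%d\n", add_one(41));
            return 0;
    }
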
Signed-off-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7609dd6714c..5329e13e74a 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -77,7 +77,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } -enum print_line_t +static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -130,7 +130,7 @@ end: return TRACE_TYPE_HANDLED; } -enum print_line_t +static enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -270,7 +270,7 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -305,7 +305,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -337,7 +337,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; @@ -356,7 +356,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; @@ -371,7 +371,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; @@ -390,7 +390,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; @@ -459,7 +459,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -int __init init_ftrace_syscalls(void) +static int __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; -- cgit v1.2.3-70-g09d2 From a54164114b96b4693b42cdb553260eec41ea4393 Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 19 Dec 2012 16:02:34 +0900 Subject: tracing: Add checks if tr->buffer is NULL in tracing_reset{_online_cpus} max_tr->buffer could be NULL in the tracing_reset{_online_cpus}. In this case, a NULL pointer dereference happens, so we should return immediately from these functions. Note, the current code does not call tracing_reset*() with max_tr when its buffer is NULL, but future code will. This patch is needed to prevent the future code from crashing. 
Link: http://lkml.kernel.org/r/20121219070234.31200.93863.stgit@liselsia Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8b7c626f3f..72b171b90e5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -922,6 +922,9 @@ void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ @@ -936,6 +939,9 @@ void tracing_reset_online_cpus(struct trace_array *tr) struct ring_buffer *buffer = tr->buffer; int cpu; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ -- cgit v1.2.3-70-g09d2 From 84c6cf0db6a00601eb43cfc08244a398ffb0894c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Dec 2012 21:43:52 -0500 Subject: tracing: Remove unneeded check of max_tr->buffer before tracing_reset There's now a check in tracing_reset_online_cpus() if the buffer is allocated or NULL. No need to do a check before calling it with max_tr. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 72b171b90e5..d62248dfda7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4040,8 +4040,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, * Reset the buffer so that it doesn't have incomparable timestamps. */ tracing_reset_online_cpus(&global_trace); - if (max_tr.buffer) - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&max_tr); mutex_unlock(&trace_types_lock); -- cgit v1.2.3-70-g09d2 From 8741db532e86da2e54f05be751bfe1922ca63d57 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Jan 2013 10:49:37 -0500 Subject: tracing/fgraph: Add max_graph_depth to limit function_graph depth Add the file max_graph_depth to the debug tracing directory that lets the user define the depth of the function graph. A very useful operation is to set the depth to 1. Then it traces only the first function that is called when entering the kernel. This can be used to determine what system operations interrupt a process. For example, to work on NOHZ processes (single tasks running without a timer tick), if any interrupt goes off and preempts that task, this code will show it happening. # cd /sys/kernel/debug/tracing # echo 1 > max_graph_depth # echo function_graph > current_tracer # cat per_cpu/cpu//trace Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 60 ++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4edb4b74eb7..7008d2e13cf 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -47,6 +47,8 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 +static unsigned int max_depth; + static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, @@ -250,8 +252,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. 
*/ - if (!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) + if ((!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) || + (max_depth && trace->depth >= max_depth)) return 0; local_irq_save(flags); @@ -1457,6 +1460,59 @@ static struct tracer graph_trace __read_mostly = { #endif }; + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + max_depth = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ + int n; + + n = sprintf(buf, "%d\n", max_depth); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { + .open = tracing_open_generic, + .write = graph_depth_write, + .read = graph_depth_read, + .llseek = generic_file_llseek, +}; + +static __init int init_graph_debugfs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("max_graph_depth", 0644, d_tracer, + NULL, &graph_depth_fops); + + return 0; +} +fs_initcall(init_graph_debugfs); + static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); -- cgit v1.2.3-70-g09d2 From 06aeaaeabf69da4a3e86df532425640f51b01cef Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Sep 2012 17:15:17 +0900 Subject: ftrace: Move ARCH_SUPPORTS_FTRACE_SAVE_REGS in Kconfig Move SAVE_REGS support flag into Kconfig and rename it to CONFIG_DYNAMIC_FTRACE_WITH_REGS. This also introduces CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS which indicates the architecture depending part of ftrace has a code that saves full registers. On the other hand, CONFIG_DYNAMIC_FTRACE_WITH_REGS indicates the code is enabled. Link: http://lkml.kernel.org/r/20120928081516.3560.72534.stgit@ltc138.sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 1 - include/linux/ftrace.h | 6 +++--- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace_selftest.c | 2 +- 6 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 79795af5981..996ccecc694 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -44,6 +44,7 @@ config X86 select HAVE_FENTRY if X86_64 select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9a25b522d37..86cb51e1ca9 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -44,7 +44,6 @@ #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 -#define ARCH_SUPPORTS_FTRACE_SAVE_REGS #endif #ifndef __ASSEMBLY__ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 92691d85c32..e5ca8ef50e9 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -74,7 +74,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * SAVE_REGS - The ftrace_ops wants regs saved at each function called * and passed to the callback. If this flag is set, but the * architecture does not support passing regs - * (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the + * (CONFIG_DYNAMIC_FTRACE_WITH_REGS is not defined), then the * ftrace_ops will fail to register, unless the next flag * is set. * SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the @@ -418,7 +418,7 @@ void ftrace_modify_all_code(int command); #endif #ifndef FTRACE_REGS_ADDR -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS # define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller) #else # define FTRACE_REGS_ADDR FTRACE_ADDR @@ -480,7 +480,7 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS /** * ftrace_modify_call - convert from one addr to another (no nop) * @rec: the mcount call site record diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d89335a485..cdc9d284d24 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE help See Documentation/trace/ftrace-design.txt +config HAVE_DYNAMIC_FTRACE_WITH_REGS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -434,6 +437,11 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. 
+config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 41473b4ad7a..6e34dc162fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -337,7 +337,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; -#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. @@ -4143,8 +4143,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * Archs are to support both the regs and ftrace_ops at the same time. * If they support ftrace_ops, it is assumed they support regs. * If call backs want to use regs, they must either check for regs - * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. - * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still * set the ARCH_SUPPORT_FTARCE_OPS. */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 47623169a81..6c62d58d8e8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -568,7 +568,7 @@ trace_selftest_function_regs(void) int ret; int supported = 0; -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS supported = 1; #endif -- cgit v1.2.3-70-g09d2 From e7dbfe349d12eabb7783b117e0c115f6f3d9ef9e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Sep 2012 17:15:20 +0900 Subject: kprobes/x86: Move ftrace-based kprobe code into kprobes-ftrace.c Split ftrace-based kprobes code from kprobes, and introduce CONFIG_(HAVE_)KPROBES_ON_FTRACE Kconfig flags. For the cleanup reason, this also moves kprobe_ftrace check into skip_singlestep. Link: http://lkml.kernel.org/r/20120928081520.3560.25624.stgit@ltc138.sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/Kconfig | 12 ++++++ arch/x86/Kconfig | 1 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kprobes-common.h | 11 +++++ arch/x86/kernel/kprobes-ftrace.c | 93 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/kprobes.c | 70 +----------------------------- include/linux/kprobes.h | 12 +----- kernel/kprobes.c | 8 ++-- 8 files changed, 125 insertions(+), 83 deletions(-) create mode 100644 arch/x86/kernel/kprobes-ftrace.c (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 7f8f281f258..97fb7d0365d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -76,6 +76,15 @@ config OPTPROBES depends on KPROBES && HAVE_OPTPROBES depends on !PREEMPT +config KPROBES_ON_FTRACE + def_bool y + depends on KPROBES && HAVE_KPROBES_ON_FTRACE + depends on DYNAMIC_FTRACE_WITH_REGS + help + If function tracer is enabled and the arch supports full + passing of pt_regs to function tracing, then kprobes can + optimize on top of function tracing. 
+ config UPROBES bool "Transparent user-space probes (EXPERIMENTAL)" depends on UPROBE_EVENT && PERF_EVENTS @@ -158,6 +167,9 @@ config HAVE_KRETPROBES config HAVE_OPTPROBES bool +config HAVE_KPROBES_ON_FTRACE + bool + config HAVE_NMI_WATCHDOG bool # diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 996ccecc694..be8b2b3ab97 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -40,6 +40,7 @@ config X86 select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES select HAVE_OPTPROBES + select HAVE_KPROBES_ON_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FENTRY if X86_64 select HAVE_C_RECORDMCOUNT diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 34e923a5376..cc5d31f8830 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_OPTPROBES) += kprobes-opt.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h index 3230b68ef29..2e9d4b5af03 100644 --- a/arch/x86/kernel/kprobes-common.h +++ b/arch/x86/kernel/kprobes-common.h @@ -99,4 +99,15 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig return addr; } #endif + +#ifdef CONFIG_KPROBES_ON_FTRACE +extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb); +#else +static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + return 0; +} +#endif #endif diff --git a/arch/x86/kernel/kprobes-ftrace.c b/arch/x86/kernel/kprobes-ftrace.c new file mode 100644 index 00000000000..70a81c7aa0a --- /dev/null +++ b/arch/x86/kernel/kprobes-ftrace.c @@ -0,0 +1,93 @@ +/* + * Dynamic Ftrace based Kprobes Optimization + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) Hitachi Ltd., 2012 + */ +#include +#include +#include +#include +#include + +#include "kprobes-common.h" + +static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); + return 1; +} + +int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + if (kprobe_ftrace(p)) + return __skip_singlestep(p, regs, kcb); + else + return 0; +} + +/* Ftrace callback handler for kprobes */ +void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + unsigned long flags; + + /* Disable irq for emulating a breakpoint and avoiding preempt */ + local_irq_save(flags); + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto end; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ + regs->ip = ip + sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) + __skip_singlestep(p, regs, kcb); + /* + * If pre_handler returns !0, it sets regs->ip and + * resets current kprobe. + */ + } +end: + local_irq_restore(flags); +} + +int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = -1; + return 0; +} diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 57916c0d3cf..18114bfb10f 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -541,23 +541,6 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb return 1; } -#ifdef KPROBES_CAN_USE_FTRACE -static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); -} -#endif - /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. 
@@ -616,13 +599,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) } else if (kprobe_running()) { p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { -#ifdef KPROBES_CAN_USE_FTRACE - if (kprobe_ftrace(p)) { - skip_singlestep(p, regs, kcb); - return 1; - } -#endif - setup_singlestep(p, regs, kcb, 0); + if (!skip_singlestep(p, regs, kcb)) + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ @@ -1075,50 +1053,6 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -#ifdef KPROBES_CAN_USE_FTRACE -/* Ftrace callback handler for kprobes */ -void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) -{ - struct kprobe *p; - struct kprobe_ctlblk *kcb; - unsigned long flags; - - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); - - p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) - goto end; - - kcb = get_kprobe_ctlblk(); - if (kprobe_running()) { - kprobes_inc_nmissed_count(p); - } else { - /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ - regs->ip = ip + sizeof(kprobe_opcode_t); - - __this_cpu_write(current_kprobe, p); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, regs)) - skip_singlestep(p, regs, kcb); - /* - * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe. - */ - } -end: - local_irq_restore(flags); -} - -int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) -{ - p->ainsn.insn = NULL; - p->ainsn.boostable = -1; - return 0; -} -#endif - int __init arch_init_kprobes(void) { return arch_init_optprobes(); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 23755ba42ab..4b6ef4d33cc 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -49,16 +49,6 @@ #define KPROBE_REENTER 0x00000004 #define KPROBE_HIT_SSDONE 0x00000008 -/* - * If function tracer is enabled and the arch supports full - * passing of pt_regs to function tracing, then kprobes can - * optimize on top of function tracing. 
- */ -#if defined(CONFIG_FUNCTION_TRACER) && defined(ARCH_SUPPORTS_FTRACE_SAVE_REGS) \ - && defined(ARCH_SUPPORTS_KPROBES_ON_FTRACE) -# define KPROBES_CAN_USE_FTRACE -#endif - /* Attach to insert probes on any functions which should be ignored*/ #define __kprobes __attribute__((__section__(".kprobes.text"))) @@ -316,7 +306,7 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, #endif #endif /* CONFIG_OPTPROBES */ -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 098f396aa40..f423c3ef4a8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -919,7 +919,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS, @@ -964,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); } -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) do {} while (0) #define disarm_kprobe_ftrace(p) do {} while (0) @@ -1414,12 +1414,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, */ ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE /* Given address is not on the instruction boundary */ if ((unsigned long)p->addr != ftrace_addr) return -EILSEQ; p->flags |= KPROBE_FLAG_FTRACE; -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ return -EINVAL; #endif } -- cgit v1.2.3-70-g09d2 From b000c8065a92b0fe0e1694f41b2c8d8ba7b7b1ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Jan 2013 10:31:20 -0500 Subject: tracing: Remove the extra 4 bytes of padding in events Due to a userspace issue with PowerTop v2beta, which hardcoded the offset of event fields that it was using, it broke when we removed the Big Kernel Lock counter from the event header. (commit e6e1e2593 "tracing: Remove lock_depth from event entry") Because this broke userspace, it was determined that we must keep those 4 bytes around. (commit a3a4a5acd "Regression: partial revert "tracing: Remove lock_depth from event entry"") This unfortunately wastes space in the ring buffer. 4 bytes per event, where a lot of events are just 24 bytes. That's 16% of the buffer wasted. A million events will add 4 megs of white space into the buffer. It was later noticed that PowerTop v2beta could not work on systems where the kernel was 64 bit but the userspace was 32 bits. The reason was because the offsets are different between the two and the hard coded offset of one would not work with the other. With PowerTop v2 final, it implemented the same interface that both perf and trace-cmd use. That is, it reads the format file of the event to find the offsets of the fields it needs. This fixes the problem with running powertop on a 32 bit userspace running on a 64 bit kernel. It also no longer requires the 4 byte padding. 
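The interface that PowerTop v2, perf and trace-cmd share is simply the per-event "format" file. A minimal userspace sketch of that approach is below. It is illustrative only, not PowerTop's actual code; the sched_switch path and the common_pid field are just examples, and on systems where tracefs is mounted separately the prefix may be /sys/kernel/tracing instead of /sys/kernel/debug/tracing.

    #include <stdio.h>
    #include <string.h>

    /* Return the offset of "field" as reported by an event format file, or -1. */
    static int find_field_offset(const char *format_path, const char *field)
    {
            char line[512], decl[256];
            const char *name;
            int offset, size;
            FILE *fp = fopen(format_path, "r");

            if (!fp)
                    return -1;

            while (fgets(line, sizeof(line), fp)) {
                    /* field lines look like: "field:int common_pid; offset:4; size:4; signed:1;" */
                    if (sscanf(line, " field:%255[^;]; offset:%d; size:%d;",
                               decl, &offset, &size) != 3)
                            continue;
                    /* the field name is the last token of the declaration */
                    name = strrchr(decl, ' ');
                    name = name ? name + 1 : decl;
                    if (strcmp(name, field) == 0) {
                            fclose(fp);
                            return offset;
                    }
            }
            fclose(fp);
            return -1;
    }

    int main(void)
    {
            int off = find_field_offset(
                    "/sys/kernel/debug/tracing/events/sched/sched_switch/format",
                    "common_pid");

            if (off >= 0)
                    printf("common_pid is at offset %d\n", off);
            else
                    printf("field not found\n");
            return 0;
    }

Because a tool that does this re-reads the offsets on every kernel it runs on, it keeps working whether or not the padding field exists, and regardless of whether the kernel and userspace are the same word size.
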
As PowerTop v2 has been out for a while, and is included in all major distributions, it is time that we can safely remove the 4 bytes of padding. Users of PowerTop v2beta should upgrade to PowerTop v2 final. Cc: Linus Torvalds Acked-by: Arjan van de Ven Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 - kernel/trace/trace.c | 1 - kernel/trace/trace_events.c | 1 - 3 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 43ef8b6cce7..6f8d0b77006 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -49,7 +49,6 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; - int padding; }; #define FTRACE_MAX_EVENT \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d62248dfda7..a387bd271e7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1173,7 +1173,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d0b94..57e9b284250 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, padding); return ret; } -- cgit v1.2.3-70-g09d2 From 0a71e4c6d749d06f52e75a406fc9046924fcfcc1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 12:06:56 -0500 Subject: tracing: Remove trace.h header from trace_clock.c As trace_clock is used by other things besides tracing, and it does not require anything from trace.h, it is best not to include the header file in trace_clock.c. Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cb..22b638b28e4 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -21,8 +21,6 @@ #include #include -#include "trace.h" - /* * trace_clock_local(): the simplest and least coherent tracing clock. * -- cgit v1.2.3-70-g09d2 From 34600f0e9c33c9cd48ae87448205f51332b7d5a0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 13:35:11 -0500 Subject: tracing: Fix race with max_tr and changing tracers There's a race condition between the setting of a new tracer and the update of the max trace buffers (the swap). When a new tracer is added, it sets current_trace to nop_trace before disabling the old tracer. At this moment, if the old tracer uses update_max_tr(), the update may trigger the warning against !current_trace->use_max-tr, as nop_trace doesn't have that set. As update_max_tr() requires that interrupts be disabled, we can add a check to see if current_trace == nop_trace and bail if it does. Then when disabling the current_trace, set it to nop_trace and run synchronize_sched(). This will make sure all calls to update_max_tr() have completed (it was called with interrupts disabled). As a clean up, this commit also removes shrinking and recreating the max_tr buffer if the old and new tracers both have use_max_tr set. The old way use to always shrink the buffer, and then expand it for the next tracer. 
This is a waste of time. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a387bd271e7..d2a658349ca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -709,10 +709,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + + /* If we disabled the tracer, stop now */ + if (current_trace == &nop_trace) return; - } + + if (WARN_ON_ONCE(!current_trace->use_max_tr)) + return; + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -3185,6 +3189,7 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; + bool had_max_tr; int ret = 0; mutex_lock(&trace_types_lock); @@ -3211,7 +3216,19 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - if (current_trace && current_trace->use_max_tr) { + + had_max_tr = current_trace && current_trace->use_max_tr; + current_trace = &nop_trace; + + if (had_max_tr && !t->use_max_tr) { + /* + * We need to make sure that the update_max_tr sees that + * current_trace changed to nop_trace to keep it from + * swapping the buffers after we resize it. + * The update_max_tr is called from interrupts disabled + * so a synchronized_sched() is sufficient. + */ + synchronize_sched(); /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and @@ -3222,10 +3239,8 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = &nop_trace; - topts = create_trace_option_files(t); - if (t->use_max_tr) { + if (t->use_max_tr && !had_max_tr) { /* we need to make per cpu buffer sizes equivalent */ ret = resize_buffer_duplicate_size(&max_tr, &global_trace, RING_BUFFER_ALL_CPUS); -- cgit v1.2.3-70-g09d2 From 05cbbf643b8eea1be21082c53cdb856d1dc6d765 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 23:35:11 -0500 Subject: tracing: Fix selftest function recursion accounting The test that checks function recursion does things differently if the arch does not support all ftrace features. But that really doesn't make a difference with how the test runs, and either way the count variable should be 2 at the end. Currently the test wrongly fails for archs that don't support all the ftrace features. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 6c62d58d8e8..adb008a0136 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -452,7 +452,6 @@ trace_selftest_function_recursion(void) char *func_name; int len; int ret; - int cnt; /* The previous test PASSED */ pr_cont("PASSED\n"); @@ -510,19 +509,10 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_recsafe_probe); - /* - * If arch supports all ftrace features, and no other task - * was on the list, we should be fine. 
- */ - if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) - cnt = 2; /* Should have recursed */ - else - cnt = 1; - ret = -1; - if (trace_selftest_recursion_cnt != cnt) { - pr_cont("*callback not called expected %d times (%d)* ", - cnt, trace_selftest_recursion_cnt); + if (trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called expected 2 times (%d)* ", + trace_selftest_recursion_cnt); goto out; } -- cgit v1.2.3-70-g09d2 From 6350379452ccaeaa71734adf57dec2ebc9207849 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 16:58:56 -0400 Subject: ftrace: Fix global function tracers that are not recursion safe If one of the function tracers set by the global ops is not recursion safe, it can still be called directly without the added recursion supplied by the ftrace infrastructure. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6e34dc162fe..789cbec24e8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -221,10 +221,24 @@ static void update_global_ops(void) * registered callers. */ if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) + ftrace_global_list->next == &ftrace_list_end) { func = ftrace_global_list->func; - else + /* + * As we are calling the function directly. + * If it does not have recursion protection, + * the function_trace_op needs to be updated + * accordingly. + */ + if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + else + global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; + } else { func = ftrace_global_list_func; + /* The list has its own recursion protection. */ + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + } + /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { -- cgit v1.2.3-70-g09d2 From 9640388b63556b4cfecbb5aaf91a5c99d272f429 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:01:20 -0400 Subject: ftrace: Fix function tracing recursion self test The function tracing recursion self test should not crash the machine if the resursion test fails. If it detects that the function tracing is recursing when it should not be, then bail, don't go into an infinite recursive loop. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index adb008a0136..51c819c12c2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip, * The ftrace infrastructure should provide the recursion * protection. If not, this will crash the kernel! */ - trace_selftest_recursion_cnt++; + if (trace_selftest_recursion_cnt++ > 10) + return; DYN_FTRACE_TEST_NAME(); } -- cgit v1.2.3-70-g09d2 From 0a016409e42f273415f8225ddf2c58eb2df88034 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:03:03 -0400 Subject: ftrace: Optimize the function tracer list loop There is lots of places that perform: op = rcu_dereference_raw(ftrace_control_list); while (op != &ftrace_list_end) { Add a helper macro to do this, and also optimize for a single entity. That is, gcc will optimize a loop for either no iterations or more than one iteration. 
But usually only a single callback is registered to the function tracer, thus the optimized case should be a single pass. to do this we now do: op = rcu_dereference_raw(list); do { [...] } while (likely(op = rcu_dereference_raw((op)->next)) && unlikely((op) != &ftrace_list_end)); An op is always registered (ftrace_list_end when no callbacks is registered), thus when a single callback is registered, the link list looks like: top => callback => ftrace_list_end => NULL. The likely(op = op->next) still must be performed due to the race of removing the callback, where the first op assignment could equal ftrace_list_end. In that case, the op->next would be NULL. But this is unlikely (only happens in a race condition when removing the callback). But it is very likely that the next op would be ftrace_list_end, unless more than one callback has been registered. This tells gcc what the most common case is and makes the fast path with the least amount of branches. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 789cbec24e8..1330969d844 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + /** * ftrace_nr_registered_ops - return number of ops registered * @@ -132,15 +152,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! 
- */ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) @@ -149,11 +160,9 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, return; trace_recursion_set(TRACE_GLOBAL_BIT); - op = rcu_dereference_raw(ftrace_global_list); /*see above*/ - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); /*see above*/ - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_GLOBAL_BIT); } @@ -4104,14 +4113,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); - op = rcu_dereference_raw(ftrace_control_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_control_list) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_CONTROL_BIT); preempt_enable_notrace(); } @@ -4139,12 +4145,10 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); - op = rcu_dereference_raw(ftrace_ops_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); preempt_enable_notrace(); trace_recursion_clear(TRACE_INTERNAL_BIT); } -- cgit v1.2.3-70-g09d2 From c29f122cd7fc178b72b1335b1fce0dff2e5c0f5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:17:59 -0400 Subject: ftrace: Add context level recursion bit checking Currently for recursion checking in the function tracer, ftrace tests a task_struct bit to determine if the function tracer had recursed or not. If it has, then it will will return without going further. But this leads to races. If an interrupt came in after the bit was set, the functions being traced would see that bit set and think that the function tracer recursed on itself, and would return. Instead add a bit for each context (normal, softirq, irq and nmi). A check of which context the task is in is made before testing the associated bit. Now if an interrupt preempts the function tracer after the previous context has been set, the interrupt functions can still be traced. 
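The idea boils down to the self-contained sketch below (a toy userspace model with made-up names, not the kernel code in the hunks that follow): each execution context owns its own bit in a per-task flag word, so the guard only trips when the same context genuinely re-enters the tracer, while an interrupt that lands on top of a traced function takes a different bit and is still traced.

    #include <stdio.h>

    /* one flag bit per context; names and values are illustrative */
    enum { CTX_NORMAL, CTX_SOFTIRQ, CTX_IRQ, CTX_NMI };

    static unsigned long recursion_flags;   /* stand-in for current->trace_recursion */

    /* returns the bit taken, or -1 if this context is already inside the tracer */
    static int tracer_enter(int ctx)
    {
            if (recursion_flags & (1UL << ctx))
                    return -1;
            recursion_flags |= 1UL << ctx;
            return ctx;
    }

    static void tracer_exit(int ctx)
    {
            recursion_flags &= ~(1UL << ctx);
    }

    int main(void)
    {
            int a = tracer_enter(CTX_NORMAL); /* tracing a function in task context */
            int b = tracer_enter(CTX_IRQ);    /* interrupt preempts it: different bit, still traced */
            int c = tracer_enter(CTX_NORMAL); /* genuine recursion in task context: rejected */

            printf("task=%d irq=%d recursed=%d\n", a, b, c);

            tracer_exit(b);
            tracer_exit(a);
            return 0;
    }
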
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 40 +++++++++++++++++++++++++++++++++------- kernel/trace/trace.h | 12 +++++++++--- 2 files changed, 42 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1330969d844..639b6ab1f04 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -156,14 +156,27 @@ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = TRACE_GLOBAL_NMI_BIT; + + else if (in_irq()) + bit = TRACE_GLOBAL_IRQ_BIT; + else + bit = TRACE_GLOBAL_SIRQ_BIT; + } else + bit = TRACE_GLOBAL_BIT; + + if (unlikely(trace_recursion_test(bit))) return; - trace_recursion_set(TRACE_GLOBAL_BIT); + trace_recursion_set(bit); do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); - trace_recursion_clear(TRACE_GLOBAL_BIT); + trace_recursion_clear(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -4132,14 +4145,27 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; + unsigned int bit; if (function_trace_stop) return; - if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) - return; + if (in_interrupt()) { + if (in_nmi()) + bit = TRACE_INTERNAL_NMI_BIT; + + else if (in_irq()) + bit = TRACE_INTERNAL_IRQ_BIT; + else + bit = TRACE_INTERNAL_SIRQ_BIT; + } else + bit = TRACE_INTERNAL_BIT; + + if (unlikely(trace_recursion_test(bit))) + return; + + trace_recursion_set(bit); - trace_recursion_set(TRACE_INTERNAL_BIT); /* * Some of the ops may be dynamically allocated, * they must be freed after a synchronize_sched(). @@ -4150,7 +4176,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(TRACE_INTERNAL_BIT); + trace_recursion_clear(bit); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c75d7988902..fe6ccff9cc7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -299,8 +299,14 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) -#define TRACE_CONTROL_BIT (1<<13) +#define TRACE_INTERNAL_NMI_BIT (1<<12) +#define TRACE_INTERNAL_IRQ_BIT (1<<13) +#define TRACE_INTERNAL_SIRQ_BIT (1<<14) +#define TRACE_GLOBAL_BIT (1<<15) +#define TRACE_GLOBAL_NMI_BIT (1<<16) +#define TRACE_GLOBAL_IRQ_BIT (1<<17) +#define TRACE_GLOBAL_SIRQ_BIT (1<<18) +#define TRACE_CONTROL_BIT (1<<19) /* * Abuse of the trace_recursion. @@ -309,7 +315,7 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<13) +#define TRACE_IRQ_BIT (1<<20) #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -- cgit v1.2.3-70-g09d2 From e46cbf75c621725964fe1f6e7013e8bcd86a0e3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:32:25 -0400 Subject: tracing: Make the trace recursion bits into enums Convert the bits into enums which makes the code a little easier to maintain. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fe6ccff9cc7..5a095d6f088 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -298,15 +298,18 @@ struct tracer { #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) /* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_INTERNAL_NMI_BIT (1<<12) -#define TRACE_INTERNAL_IRQ_BIT (1<<13) -#define TRACE_INTERNAL_SIRQ_BIT (1<<14) -#define TRACE_GLOBAL_BIT (1<<15) -#define TRACE_GLOBAL_NMI_BIT (1<<16) -#define TRACE_GLOBAL_IRQ_BIT (1<<17) -#define TRACE_GLOBAL_SIRQ_BIT (1<<18) -#define TRACE_CONTROL_BIT (1<<19) +enum { + TRACE_INTERNAL_BIT = 11, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_GLOBAL_BIT, + TRACE_GLOBAL_NMI_BIT, + TRACE_GLOBAL_IRQ_BIT, + TRACE_GLOBAL_SIRQ_BIT, + + TRACE_CONTROL_BIT, /* * Abuse of the trace_recursion. @@ -315,11 +318,12 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<20) + TRACE_IRQ_BIT, +}; -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) #define TRACE_PIPE_ALL_CPU -1 -- cgit v1.2.3-70-g09d2 From edc15cafcbfa3d73f819cae99885a2e35e4cbce5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:47:21 -0400 Subject: tracing: Avoid unnecessary multiple recursion checks When function tracing occurs, the following steps are made: If arch does not support a ftrace feature: call internal function (uses INTERNAL bits) which calls... If callback is registered to the "global" list, the list function is called and recursion checks the GLOBAL bits. then this function calls... The function callback, which can use the FTRACE bits to check for recursion. Now if the arch does not suppport a feature, and it calls the global list function which calls the ftrace callback all three of these steps will do a recursion protection. There's no reason to do one if the previous caller already did. The recursion that we are protecting against will go through the same steps again. To prevent the multiple recursion checks, if a recursion bit is set that is higher than the MAX bit of the current check, then we know that the check was made by the previous caller, and we can skip the current check. 
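As a usage sketch (not a hunk from this patch, and the callback name is hypothetical): a function-trace callback that wants its own protection brackets its work with the FTRACE bit range added below. A negative return means real recursion in the current context; a return of 0 means a higher layer (GLOBAL or INTERNAL) already holds a recursion bit, so the callback just proceeds and trace_clear_recursion(0) is a no-op.

    static void my_callback(unsigned long ip, unsigned long parent_ip,
                            struct ftrace_ops *op, struct pt_regs *regs)
    {
            int bit;

            bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
            if (bit < 0)
                    return;         /* genuine recursion in this context: bail */

            /* bit == 0: an outer layer's check already covers us */

            /* ... trace the function ... */

            trace_clear_recursion(bit);     /* no-op when bit == 0 */
    }
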
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 40 +++++-------------- kernel/trace/trace.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 110 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 639b6ab1f04..ce8c3d68292 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -158,25 +158,15 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, { int bit; - if (in_interrupt()) { - if (in_nmi()) - bit = TRACE_GLOBAL_NMI_BIT; - - else if (in_irq()) - bit = TRACE_GLOBAL_IRQ_BIT; - else - bit = TRACE_GLOBAL_SIRQ_BIT; - } else - bit = TRACE_GLOBAL_BIT; - - if (unlikely(trace_recursion_test(bit))) + bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); + if (bit < 0) return; - trace_recursion_set(bit); do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); - trace_recursion_clear(bit); + + trace_clear_recursion(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -4145,26 +4135,14 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; - unsigned int bit; + int bit; if (function_trace_stop) return; - if (in_interrupt()) { - if (in_nmi()) - bit = TRACE_INTERNAL_NMI_BIT; - - else if (in_irq()) - bit = TRACE_INTERNAL_IRQ_BIT; - else - bit = TRACE_INTERNAL_SIRQ_BIT; - } else - bit = TRACE_INTERNAL_BIT; - - if (unlikely(trace_recursion_test(bit))) - return; - - trace_recursion_set(bit); + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; /* * Some of the ops may be dynamically allocated, @@ -4176,7 +4154,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(bit); + trace_clear_recursion(bit); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5a095d6f088..c203a51dd41 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -297,18 +297,49 @@ struct tracer { /* Ring buffer has the 10 LSB bits to count */ #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) -/* for function tracing recursion */ +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not suppport a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. 
+ */ enum { - TRACE_INTERNAL_BIT = 11, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, + TRACE_FTRACE_BIT = 11, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + /* GLOBAL_BITs must be greater than FTRACE_BITs */ TRACE_GLOBAL_BIT, TRACE_GLOBAL_NMI_BIT, TRACE_GLOBAL_IRQ_BIT, TRACE_GLOBAL_SIRQ_BIT, + /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + TRACE_CONTROL_BIT, /* @@ -325,6 +356,71 @@ enum { #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT +#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) + return -1; + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + #define TRACE_PIPE_ALL_CPU -1 static inline struct ring_buffer_iter * -- cgit v1.2.3-70-g09d2 From 897f68a48b1f8fb6cb7493e1ee37e3ed7f879937 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:52:35 -0400 Subject: ftrace: Use only the preempt version of function tracing The function tracer had two different versions of function tracing. The disabling of irqs version and the preempt disable version. As function tracing in very intrusive and can cause nasty recursion issues, it has its own recursion protection. But the old method to do this was a flat layer. If it detected that a recursion was happening then it would just return without recording. This made the preempt version (much faster than the irq disabling one) not very useful, because if an interrupt were to occur after the recursion flag was set, the interrupt would not be traced at all, because every function that was traced would think it recursed on itself (due to the context it preempted setting the recursive flag). Now that we have a recursion flag for every context level, we no longer need to worry about that. We can disable preemption, set the current context recursion check bit, and go on. If an interrupt were to come along, it would check its own context bit and happily continue to trace. As the preempt version is faster than the irq disable version, there's no more reason to keep the preempt version around. And the irq disable version still had an issue with missing out on tracing NMI code. 
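The difference is easy to see in a tiny stand-alone model (purely illustrative, not kernel code; the real conversion is the trace_functions.c diff below):

#include <stdio.h>

enum ctx { CTX_NMI, CTX_IRQ, CTX_SIRQ, CTX_NORMAL };

static int flat_flag;             /* old scheme: one flag shared by every context */
static unsigned int ctx_bits;     /* new scheme: one bit per context */

static int enter_flat(void)
{
        if (flat_flag)
                return -1;        /* looks like recursion, even from an interrupt */
        flat_flag = 1;
        return 0;
}

static int enter_ctx(enum ctx c)
{
        if (ctx_bits & (1u << c))
                return -1;        /* only same-context recursion is refused */
        ctx_bits |= 1u << c;
        return 0;
}

int main(void)
{
        /* normal context is in the middle of tracing a function ... */
        enter_flat();
        enter_ctx(CTX_NORMAL);

        /* ... when an interrupt arrives and wants to trace as well */
        printf("flat flag:        irq trace %s\n", enter_flat()       ? "dropped" : "recorded");
        printf("per-context bits: irq trace %s\n", enter_ctx(CTX_IRQ) ? "dropped" : "recorded");
        return 0;
}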
Remove the irq disable function tracer version and have the preempt disable version be the default (and only version). Before this patch we had from running: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 12.028 Time: 11.945 Time: 11.925 Time: 11.964 Time: 12.002 Time: 11.910 Time: 11.944 Time: 11.929 Time: 11.941 Time: 11.924 (average: 11.9512) Now we have: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 10.285 Time: 10.407 Time: 10.243 Time: 10.372 Time: 10.380 Time: 10.198 Time: 10.272 Time: 10.354 Time: 10.248 Time: 10.253 (average: 10.3012) a 13.8% savings! Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 61 ++++++++++-------------------------------- 1 file changed, 14 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8e3ad8082ab..1c327ef13a9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - preempt_disable_notrace(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - /* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, @@ -85,34 +57,34 @@ static struct tracer_flags func_flags; static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) - { struct trace_array *tr = func_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; + unsigned int bit; int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. 
- */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + pc = preempt_count(); + preempt_disable_notrace(); - if (likely(disabled == 1)) { - pc = preempt_count(); + bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + if (bit < 0) + goto out; + + cpu = smp_processor_id(); + data = tr->data[cpu]; + if (!atomic_read(&data->disabled)) { + local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } + trace_clear_recursion(bit); - atomic_dec(&data->disabled); - local_irq_restore(flags); + out: + preempt_enable_notrace(); } static void @@ -185,11 +157,6 @@ static void tracing_start_function_trace(void) { ftrace_function_enabled = 0; - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - if (func_flags.val & TRACE_FUNC_OPT_STACK) register_ftrace_function(&trace_stack_ops); else -- cgit v1.2.3-70-g09d2 From 567cd4da54ff45513d2ca1f0e3cb9ba45b66d6cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 18:33:05 -0400 Subject: ring-buffer: User context bit recursion checking Using context bit recursion checking, we can help increase the performance of the ring buffer. Before this patch: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 10.285 Time: 10.407 Time: 10.243 Time: 10.372 Time: 10.380 Time: 10.198 Time: 10.272 Time: 10.354 Time: 10.248 Time: 10.253 (average: 10.3012) Now we have: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 9.712 Time: 9.824 Time: 9.861 Time: 9.827 Time: 9.962 Time: 9.905 Time: 9.886 Time: 10.088 Time: 9.861 Time: 9.834 (average: 9.876) a 4% savings! Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 85 ++++++++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 13 +++---- 2 files changed, 67 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6ff9cc4658e..481e2626928 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2432,41 +2432,76 @@ rb_reserve_next_event(struct ring_buffer *buffer, #ifdef CONFIG_TRACING -#define TRACE_RECURSIVE_DEPTH 16 +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. 
+ * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ +static DEFINE_PER_CPU(unsigned int, current_context); -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) +static __always_inline int trace_recursive_lock(void) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); - - printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - trace_recursion_buffer(), - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); + unsigned int val = this_cpu_read(current_context); + int bit; - WARN_ON_ONCE(1); -} - -static inline int trace_recursive_lock(void) -{ - trace_recursion_inc(); + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; - if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) - return 0; + if (unlikely(val & (1 << bit))) + return 1; - trace_recursive_fail(); + val |= (1 << bit); + this_cpu_write(current_context, val); - return -1; + return 0; } -static inline void trace_recursive_unlock(void) +static __always_inline void trace_recursive_unlock(void) { - WARN_ON_ONCE(!trace_recursion_buffer()); + unsigned int val = this_cpu_read(current_context); - trace_recursion_dec(); + val--; + val &= this_cpu_read(current_context); + this_cpu_write(current_context, val); } #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c203a51dd41..04a2c7ab173 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -291,11 +291,6 @@ struct tracer { /* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) - -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) /* * For function tracing recursion: @@ -323,7 +318,13 @@ struct tracer { * caller, and we can skip the current check. */ enum { - TRACE_FTRACE_BIT = 11, + TRACE_BUFFER_BIT, + TRACE_BUFFER_NMI_BIT, + TRACE_BUFFER_IRQ_BIT, + TRACE_BUFFER_SIRQ_BIT, + + /* Start of function recursion bits */ + TRACE_FTRACE_BIT, TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, -- cgit v1.2.3-70-g09d2 From 0b07436d95b5404134da4d661fd183eac863513e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 16:58:30 -0500 Subject: ring-buffer: Remove trace.h from ring_buffer.c ring_buffer.c use to require declarations from trace.h, but these have moved to the generic header files. There's nothing in trace.h that ring_buffer.c requires. There's some headers that trace.h included that ring_buffer.c needs, but it's best that it includes them directly, and not include trace.h. Also, some things may use ring_buffer.c without having tracing configured. This removes the dependency that may come in the future. 
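As a brief aside on the trace_recursive_unlock() trick documented above: "subtract 1 and AND with itself" always clears exactly the lowest set bit, which is the bit of the context currently unwinding. A stand-alone check of that identity (illustrative only, not kernel code):

#include <assert.h>
#include <stdio.h>

static unsigned int clear_lowest(unsigned int val)
{
        return (val - 1) & val;
}

int main(void)
{
        assert(clear_lowest(0x5) == 0x4);     /* 101  -> 100  (bit 0 cleared) */
        assert(clear_lowest(0xa) == 0x8);     /* 1010 -> 1000 (bit 1 cleared) */
        assert(clear_lowest(0x8) == 0x0);     /* last remaining context unlocks */
        printf("lowest-set-bit clearing behaves as the comment describes\n");
        return 0;
}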
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 481e2626928..13950d9027c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,8 +3,10 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include +#include #include #include #include @@ -21,7 +23,6 @@ #include #include -#include "trace.h" static void update_pages_handler(struct work_struct *work); -- cgit v1.2.3-70-g09d2 From d41032a83b4683481cadff84bbf8e0eafeaba830 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 24 Jan 2013 07:52:34 -0500 Subject: tracing: Fix unsigned int compare of zero in recursion check Dan's smatch found a compare bug with the result of the trace_test_and_set_recursion() and comparing to less than zero. If the function fails, it returns -1, but was saved in an unsigned int, which will never be less than zero and will ignore the result of the test if a recursion did happen. Luckily this is the last of the recursion tests, as the infrastructure of ftrace would catch recursions before it got here, except for some few exceptions. Reported-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 1c327ef13a9..60115252332 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -61,7 +61,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = func_trace; struct trace_array_cpu *data; unsigned long flags; - unsigned int bit; + int bit; int cpu; int pc; -- cgit v1.2.3-70-g09d2 From ba6fdda46b377034c782c0b89c8f1090b31eabd8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 22 Dec 2012 16:59:51 +0100 Subject: profiling: Remove unused timer hook The last remaining user was oprofile and its use has been removed a while ago in commit bc078e4eab65f11bba ("oprofile: convert oprofile from timer_hook to hrtimer"). There doesn't seem to be any upstream user of this hook for about two years now. And I'm not even aware of any out of tree user. Let's remove it. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Paul E. 
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1356191991-2251-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/profile.h | 13 ------------- kernel/profile.c | 24 ------------------------ 2 files changed, 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/profile.h b/include/linux/profile.h index a0fc32279fc..21123902366 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -82,9 +82,6 @@ int task_handoff_unregister(struct notifier_block * n); int profile_event_register(enum profile_type, struct notifier_block * n); int profile_event_unregister(enum profile_type, struct notifier_block * n); -int register_timer_hook(int (*hook)(struct pt_regs *)); -void unregister_timer_hook(int (*hook)(struct pt_regs *)); - struct pt_regs; #else @@ -135,16 +132,6 @@ static inline int profile_event_unregister(enum profile_type t, struct notifier_ #define profile_handoff_task(a) (0) #define profile_munmap(a) do { } while (0) -static inline int register_timer_hook(int (*hook)(struct pt_regs *)) -{ - return -ENOSYS; -} - -static inline void unregister_timer_hook(int (*hook)(struct pt_regs *)) -{ - return; -} - #endif /* CONFIG_PROFILING */ #endif /* _LINUX_PROFILE_H */ diff --git a/kernel/profile.c b/kernel/profile.c index 1f391819c42..dc3384ee874 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -37,9 +37,6 @@ struct profile_hit { #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) -/* Oprofile timer tick hook */ -static int (*timer_hook)(struct pt_regs *) __read_mostly; - static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; @@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -int register_timer_hook(int (*hook)(struct pt_regs *)) -{ - if (timer_hook) - return -EBUSY; - timer_hook = hook; - return 0; -} -EXPORT_SYMBOL_GPL(register_timer_hook); - -void unregister_timer_hook(int (*hook)(struct pt_regs *)) -{ - WARN_ON(hook != timer_hook); - timer_hook = NULL; - /* make sure all CPUs see the NULL hook */ - synchronize_sched(); /* Allow ongoing interrupts to complete. */ -} -EXPORT_SYMBOL_GPL(unregister_timer_hook); - - #ifdef CONFIG_SMP /* * Each cpu has a pair of open-addressed hashtables for pending @@ -436,8 +414,6 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (type == CPU_PROFILING && timer_hook) - timer_hook(regs); if (!user_mode(regs) && prof_cpu_mask != NULL && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); -- cgit v1.2.3-70-g09d2 From c91368c4889a0ee5dd06552adbb50ae54f5096fd Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 20 Dec 2012 14:11:30 -0500 Subject: uprobes: remove redundant check We checked for uprobe==NULL earlier, no need to redo that. 
Signed-off-by: Sasha Levin Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1356030701-16284-22-git-send-email-sasha.levin@oracle.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/uprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dea7acfbb07..30ea9a4f4ab 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -901,8 +901,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume } mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + put_uprobe(uprobe); } static struct rb_node * -- cgit v1.2.3-70-g09d2 From b736f48bda54ec75b7dc9306884c3843f1a78a0a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 18 Nov 2012 21:27:45 -0800 Subject: tracing: Mark tracing_dentry_percpu() static Nothing outside of kernel/trace/trace.c references tracing_dentry_percpu(). Link: http://lkml.kernel.org/r/1353302917-13995-7-git-send-email-josh@joshtriplett.org Signed-off-by: Josh Triplett Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d2a658349ca..ca9b7dfed8e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4506,7 +4506,7 @@ struct dentry *tracing_init_dentry(void) static struct dentry *d_percpu; -struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(void) { static int once; struct dentry *d_tracer; -- cgit v1.2.3-70-g09d2 From 821465295b36136998ef294fe176fba4e09c1cd9 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Mon, 19 Nov 2012 13:21:01 +0800 Subject: tracing: Use __this_cpu_inc/dec operation instead of __get_cpu_var __this_cpu_inc_return() or __this_cpu_dec generates a single instruction, which is faster than __get_cpu_var operation. Link: http://lkml.kernel.org/r/50A9C1BD.1060308@gmail.com Reviewed-by: Christoph Lameter Signed-off-by: Shan Wei Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca9b7dfed8e..07888e15c69 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1344,7 +1344,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = ++__get_cpu_var(ftrace_stack_reserve); + use_stack = __this_cpu_inc_return(ftrace_stack_reserve); /* * We don't need any atomic variables, just a barrier. * If an interrupt comes in, we don't care, because it would @@ -1398,7 +1398,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, out: /* Again, don't let gcc optimize things here */ barrier(); - __get_cpu_var(ftrace_stack_reserve)--; + __this_cpu_dec(ftrace_stack_reserve); preempt_enable_notrace(); } -- cgit v1.2.3-70-g09d2 From 38dbe0b137bfe6ea92be495017885c0785179a02 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Fri, 25 Jan 2013 18:03:07 +0800 Subject: tracing: Remove second iterator initializer The trace iterator is already initialized by trace_init_global_iter(), so there is no need to initialize it again. 
Link: http://lkml.kernel.org/r/CACV3sb+G1YnO6168JhY3dEadmJi58pA5-2cSZT8E0WVHJNFt9Q@mail.gmail.com Signed-off-by: Jovi Zhang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 07888e15c69..d399592701a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5030,6 +5030,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (disable_tracing) ftrace_kill(); + /* Simulate the iterator */ trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { @@ -5041,10 +5042,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - /* Simulate the iterator */ - iter.tr = &global_trace; - iter.trace = current_trace; - switch (oops_dump_mode) { case DUMP_ALL: iter.cpu_file = TRACE_PIPE_ALL_CPU; -- cgit v1.2.3-70-g09d2 From 03274a3ffb449632970fdd35da72ea41cf8474da Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Jan 2013 17:30:31 -0500 Subject: tracing/fgraph: Adjust fgraph depth before calling trace return callback While debugging the virtual cputime with the function graph tracer with a max_depth of 1 (most common use of the max_depth so far), I found that I was missing kernel execution because of a race condition. The code for the return side of the function has a slight race: ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); barrier(); current->curr_ret_stack--; The ftrace_pop_return_trace() initializes the trace structure for the callback. The ftrace_graph_return() uses the trace structure for its own use as that structure is on the stack and is local to this function. Then the curr_ret_stack is decremented which is what the trace.depth is set to. If an interrupt comes in after the ftrace_graph_return() but before the curr_ret_stack, then the called function will get a depth of 2. If max_depth is set to 1 this function will be ignored. The problem is that the trace has already been called, and the timestamp for that trace will not reflect the time the function was about to re-enter userspace. Calls to the interrupt will not be traced because the max_depth has prevented this. To solve this issue, the ftrace_graph_return() can safely be moved after the current->curr_ret_stack has been updated. This way the timestamp for the return callback will reflect the actual time. If an interrupt comes in after the curr_ret_stack update and ftrace_graph_return(), it will be traced. It may look a little confusing to see it within the other function, but at least it will not be lost. Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7008d2e13cf..39ada66389c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -191,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); barrier(); current->curr_ret_stack--; + /* + * The trace should run after decrementing the ret counter + * in case an interrupt were to come in. 
We don't want to + * lose the interrupt if max_depth is set. + */ + ftrace_graph_return(&trace); + if (unlikely(!ret)) { ftrace_graph_stop(); WARN_ON(1); -- cgit v1.2.3-70-g09d2 From ad964704ba9326d027fc10fd0099b7c880e50172 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Jan 2013 17:45:49 -0500 Subject: ring-buffer: Add stats field for amount read from trace ring buffer Add a stat about the number of events read from the ring buffer: # cat /debug/tracing/per_cpu/cpu0/stats entries: 39869 overrun: 870512 commit overrun: 0 bytes: 1449912 oldest event ts: 6561.368690 now ts: 6565.246426 dropped events: 0 read events: 112 <-- Added Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 18 ++++++++++++++++++ kernel/trace/trace.c | 3 +++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 519777e3fa0..1342e69542f 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -167,6 +167,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 13950d9027c..7244acde77b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3102,6 +3102,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); +/** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d399592701a..90a1c71fdbf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4430,6 +4430,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); + cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); + trace_seq_printf(s, "read events: %ld\n", cnt); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); -- cgit v1.2.3-70-g09d2 From 5e67b51e3fb22ad43faf9589e9019ad9c6a00413 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 27 Dec 2012 11:49:45 +0900 Subject: tracing: Use sched_clock_cpu for trace_clock_global For systems with an unstable sched_clock, all cpu_clock() does is enable/ disable local irq during the call to sched_clock_cpu(). And for stable systems they are same. trace_clock_global() already disables interrupts, so it can call sched_clock_cpu() directly. 
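A small user-space model of that reasoning (every function below is an illustrative stub standing in for its kernel namesake, based only on the description above):

#include <stdio.h>
#include <stdint.h>

static int irq_toggles;                       /* counts save/restore pairs */

static void local_irq_save(void)    { irq_toggles++; }
static void local_irq_restore(void) { }

static uint64_t sched_clock_cpu(int cpu) { (void)cpu; return 123456789; }

/* per the changelog: on an unstable-clock system this is just an
 * irq-save wrapper around sched_clock_cpu() */
static uint64_t cpu_clock(int cpu)
{
        uint64_t t;

        local_irq_save();
        t = sched_clock_cpu(cpu);
        local_irq_restore();
        return t;
}

static uint64_t global_clock(int via_cpu_clock)
{
        uint64_t t;

        local_irq_save();                     /* trace_clock_global() already did this */
        t = via_cpu_clock ? cpu_clock(0) : sched_clock_cpu(0);
        local_irq_restore();
        return t;
}

int main(void)
{
        irq_toggles = 0; global_clock(1);
        printf("via cpu_clock():       %d irq save/restore pairs\n", irq_toggles);
        irq_toggles = 0; global_clock(0);
        printf("via sched_clock_cpu(): %d irq save/restore pair\n",  irq_toggles);
        return 0;
}

Dropping the redundant toggle is the whole point of the one-line change below.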
Link: http://lkml.kernel.org/r/1356576585-28782-2-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 22b638b28e4..24bf48eabfc 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -84,7 +84,7 @@ u64 notrace trace_clock_global(void) local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = cpu_clock(this_cpu); + now = sched_clock_cpu(this_cpu); /* * If in an NMI context then dont risk lockups and return the * cpu_clock() time: -- cgit v1.2.3-70-g09d2 From 2fd196ec1eab2623096e7fc7e6f3976160392bce Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 26 Dec 2012 11:52:52 +0900 Subject: tracing: Replace static old_tracer check of tracer name Currently the trace buffer read functions use a static variable "old_tracer" for detecting if the current tracer changes. This was suitable for a single trace file ("trace"), but to add a snapshot feature that will use the same function for its file, a check against a static variable is not sufficient. To use the output functions for two different files, instead of storing the current tracer in a static variable, as the trace iterator descriptor contains a pointer to the original current tracer's name, that pointer can now be used to check if the current tracer has changed between different reads of the trace file. Link: http://lkml.kernel.org/r/20121226025252.3252.9276.stgit@liselsia Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 90a1c71fdbf..2c724662a3e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1948,18 +1948,20 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; - static struct tracer *old_tracer; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; - /* copy the tracer to avoid using a global lock all around */ + /* + * copy the tracer to avoid using a global lock all around. + * iter->trace is a copy of current_trace, the pointer to the + * name may be used instead of a strcmp(), as iter->trace->name + * will point to the same string as current_trace->name. 
+ */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); atomic_inc(&trace_record_cmdline_disabled); @@ -3494,7 +3496,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - static struct tracer *old_tracer; ssize_t sret; /* return any leftover data */ @@ -3506,10 +3507,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); /* @@ -3665,7 +3664,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - static struct tracer *old_tracer; ssize_t ret; size_t rem; unsigned int i; @@ -3675,10 +3673,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); mutex_lock(&iter->mutex); -- cgit v1.2.3-70-g09d2 From debdd57f5145f3c6a4b3f8d0126abd1a2def7fc6 Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 26 Dec 2012 11:53:00 +0900 Subject: tracing: Make a snapshot feature available from userspace Ftrace has a snapshot feature available from kernel space and latency tracers (e.g. irqsoff) are using it. This patch enables user applictions to take a snapshot via debugfs. Add "snapshot" debugfs file in "tracing" directory. snapshot: This is used to take a snapshot and to read the output of the snapshot. # echo 1 > snapshot This will allocate the spare buffer for snapshot (if it is not allocated), and take a snapshot. # cat snapshot This will show contents of the snapshot. # echo 0 > snapshot This will free the snapshot if it is allocated. Any other positive values will clear the snapshot contents if the snapshot is allocated, or return EINVAL if it is not allocated. Link: http://lkml.kernel.org/r/20121226025300.3252.86850.stgit@liselsia Cc: Jiri Olsa Cc: David Sharp Signed-off-by: Hiraku Toyooka [ Fixed irqsoff selftest and also a conflict with a change that fixes the update_max_tr. 
] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 + kernel/trace/Kconfig | 10 +++ kernel/trace/trace.c | 166 ++++++++++++++++++++++++++++++++++++------- kernel/trace/trace.h | 1 + 4 files changed, 154 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 6f8d0b77006..13a54d0bdfa 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -83,6 +83,9 @@ struct trace_iterator { long idx; cpumask_var_t started; + + /* it's true when current open file is snapshot */ + bool snapshot; }; enum trace_iter_flags { diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cdc9d284d24..36567564e22 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -253,6 +253,16 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. +config TRACER_SNAPSHOT + bool "Create a snapshot trace buffer" + select TRACER_MAX_TRACE + help + Allow tracing users to take snapshot of the current buffer using the + ftrace interface, e.g.: + + echo 1 > /sys/kernel/debug/tracing/snapshot + cat snapshot + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c724662a3e..70dce64b9ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -710,12 +710,11 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); - /* If we disabled the tracer, stop now */ - if (current_trace == &nop_trace) - return; - - if (WARN_ON_ONCE(!current_trace->use_max_tr)) + if (!current_trace->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(current_trace != &nop_trace); return; + } arch_spin_lock(&ftrace_max_lock); @@ -743,10 +742,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) return; - } arch_spin_lock(&ftrace_max_lock); @@ -866,10 +863,13 @@ int register_tracer(struct tracer *type) current_trace = type; - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, trace_buf_size, - RING_BUFFER_ALL_CPUS); + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(max_tr.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + type->allocated_snapshot = true; + } /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -885,10 +885,14 @@ int register_tracer(struct tracer *type) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); - /* Shrink the max buffer again */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, 1, - RING_BUFFER_ALL_CPUS); + if (type->use_max_tr) { + type->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); + } printk(KERN_CONT "PASSED\n"); } @@ -1964,7 +1968,11 @@ static void *s_start(struct seq_file *m, loff_t *pos) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); - atomic_inc(&trace_record_cmdline_disabled); + if (iter->snapshot && iter->trace->use_max_tr) + return ERR_PTR(-EBUSY); + + if (!iter->snapshot) + 
atomic_inc(&trace_record_cmdline_disabled); if (*pos != iter->pos) { iter->ent = NULL; @@ -2003,7 +2011,11 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); + if (iter->snapshot && iter->trace->use_max_tr) + return; + + if (!iter->snapshot) + atomic_dec(&trace_record_cmdline_disabled); trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -2438,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file) +__tracing_open(struct inode *inode, struct file *file, bool snapshot) { long cpu_file = (long) inode->i_private; struct trace_iterator *iter; @@ -2471,10 +2483,11 @@ __tracing_open(struct inode *inode, struct file *file) if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if (current_trace && current_trace->print_max) + if ((current_trace && current_trace->print_max) || snapshot) iter->tr = &max_tr; else iter->tr = &global_trace; + iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); iter->cpu_file = cpu_file; @@ -2491,8 +2504,9 @@ __tracing_open(struct inode *inode, struct file *file) if (trace_clocks[trace_clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - /* stop the trace while dumping */ - tracing_stop(); + /* stop the trace while dumping if we are not opening "snapshot" */ + if (!iter->snapshot) + tracing_stop(); if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { @@ -2555,8 +2569,9 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - /* reenable tracing if it was previously enabled */ - tracing_start(); + if (!iter->snapshot) + /* reenable tracing if it was previously enabled */ + tracing_start(); mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -2584,7 +2599,7 @@ static int tracing_open(struct inode *inode, struct file *file) } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file); + iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (trace_flags & TRACE_ITER_LATENCY_FMT) @@ -3219,7 +3234,7 @@ static int tracing_set_tracer(const char *buf) if (current_trace && current_trace->reset) current_trace->reset(tr); - had_max_tr = current_trace && current_trace->use_max_tr; + had_max_tr = current_trace && current_trace->allocated_snapshot; current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { @@ -3238,6 +3253,8 @@ static int tracing_set_tracer(const char *buf) */ ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&max_tr, 1); + tracing_reset_online_cpus(&max_tr); + current_trace->allocated_snapshot = false; } destroy_trace_option_files(topts); @@ -3248,6 +3265,7 @@ static int tracing_set_tracer(const char *buf) RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; + t->allocated_snapshot = true; } if (t->init) { @@ -4066,6 +4084,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return single_open(file, tracing_clock_show, NULL); } +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret = 0; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } + return ret; +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + 
loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + mutex_lock(&trace_types_lock); + + if (current_trace && current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } + + switch (val) { + case 0: + if (current_trace->allocated_snapshot) { + /* free spare buffer */ + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); + set_buffer_entries(&max_tr, 1); + tracing_reset_online_cpus(&max_tr); + current_trace->allocated_snapshot = false; + } + break; + case 1: + if (!current_trace->allocated_snapshot) { + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&max_tr, + &global_trace, RING_BUFFER_ALL_CPUS); + if (ret < 0) + break; + current_trace->allocated_snapshot = true; + } + + local_irq_disable(); + /* Now, we're going to swap */ + update_max_tr(&global_trace, current, smp_processor_id()); + local_irq_enable(); + break; + default: + if (current_trace->allocated_snapshot) + tracing_reset_online_cpus(&max_tr); + else + ret = -EINVAL; + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } +out: + mutex_unlock(&trace_types_lock); + return ret; +} +#endif /* CONFIG_TRACER_SNAPSHOT */ + + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -4122,6 +4221,16 @@ static const struct file_operations trace_clock_fops = { .write = tracing_clock_write, }; +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_seek, + .release = tracing_release, +}; +#endif /* CONFIG_TRACER_SNAPSHOT */ + struct ftrace_buffer_info { struct trace_array *tr; void *spare; @@ -4921,6 +5030,11 @@ static __init int tracer_init_debugfs(void) &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); +#endif + create_trace_options_dir(); for_each_tracing_cpu(cpu) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 04a2c7ab173..57d7e5397d5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -287,6 +287,7 @@ struct tracer { struct tracer_flags *flags; bool print_max; bool use_max_tr; + bool allocated_snapshot; }; -- cgit v1.2.3-70-g09d2 From d840f718d28715a9833c1a8f46c2493ff3fd219b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 1 Feb 2013 18:38:47 -0500 Subject: tracing: Init current_trace to nop_trace and remove NULL checks On early boot up, when the ftrace ring buffer is initialized, the static variable current_trace is initialized to &nop_trace. Before this initialization, current_trace is NULL and will never become NULL again. It is always reassigned to a ftrace tracer. Several places check if current_trace is NULL before it uses it, and this check is frivolous, because at the point in time when the checks are made the only way current_trace could be NULL is if ftrace failed its allocations at boot up, and the paths to these locations would probably not be possible. By initializing current_trace to &nop_trace where it is declared, current_trace will never be NULL, and we can remove all these checks of current_trace being NULL which never needed to be checked in the first place. 
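The pattern is simply giving the static pointer a safe default at its definition; a trivial stand-alone illustration (not the kernel code itself):

#include <stdio.h>

struct tracer {
        const char *name;
};

static struct tracer nop_trace = { .name = "nop" };

/* initialized at the definition, so it is never NULL at any point of use */
static struct tracer *current_trace = &nop_trace;

int main(void)
{
        /* no "if (current_trace)" guard is needed before dereferencing */
        printf("current tracer: %s\n", current_trace->name);
        return 0;
}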
Cc: Dan Carpenter Cc: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 70dce64b9ec..5d520b7bb4c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -249,7 +249,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; static struct tracer *trace_types __read_mostly; /* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly; +static struct tracer *current_trace __read_mostly = &nop_trace; /* * trace_types_lock is used to protect the trace_types list. @@ -2100,8 +2100,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) unsigned long total; const char *name = "preemption"; - if (type) - name = type->name; + name = type->name; get_total_entries(tr, &total, &entries); @@ -2477,13 +2476,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter->trace) goto fail; - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if ((current_trace && current_trace->print_max) || snapshot) + if (current_trace->print_max || snapshot) iter->tr = &max_tr; else iter->tr = &global_trace; @@ -3037,10 +3035,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int r; mutex_lock(&trace_types_lock); - if (current_trace) - r = sprintf(buf, "%s\n", current_trace->name); - else - r = sprintf(buf, "\n"); + r = sprintf(buf, "%s\n", current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3231,10 +3226,10 @@ static int tracing_set_tracer(const char *buf) goto out; trace_branch_disable(); - if (current_trace && current_trace->reset) + if (current_trace->reset) current_trace->reset(tr); - had_max_tr = current_trace && current_trace->allocated_snapshot; + had_max_tr = current_trace->allocated_snapshot; current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { @@ -3373,8 +3368,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) ret = -ENOMEM; goto fail; } - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3525,7 +3519,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) + if (unlikely(iter->trace->name != current_trace->name)) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); @@ -3691,7 +3685,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) + if (unlikely(iter->trace->name != current_trace->name)) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); @@ -4115,7 +4109,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&trace_types_lock); - if (current_trace && current_trace->use_max_tr) { + if (current_trace->use_max_tr) { ret = -EBUSY; goto out; } @@ -5299,7 +5293,7 @@ __init static int tracer_alloc_buffers(void) init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); - 
current_trace = &nop_trace; + /* All seems OK, enable tracing */ tracing_disabled = 0; -- cgit v1.2.3-70-g09d2 From bbc33d05930f870ea049eae5ed980f8b827d0813 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 16:55:38 +0100 Subject: uprobes: Move __set_bit(UPROBE_SKIP_SSTEP) into alloc_uprobe() Cosmetic. __set_bit(UPROBE_SKIP_SSTEP) is the part of initialization, it is not clear why it is set in insert_uprobe(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 30ea9a4f4ab..afbab2cb274 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -430,9 +430,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) u = __insert_uprobe(uprobe); spin_unlock(&uprobes_treelock); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - return u; } @@ -454,6 +451,8 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); mutex_init(&uprobe->copy_mutex); + /* For now assume that the instruction need not be single-stepped */ + __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); -- cgit v1.2.3-70-g09d2 From f0744af7d0fde190674064c54e2ff60b34ac71fe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 18:01:43 +0100 Subject: uprobes: Kill the pointless inode/uc checks in register/unregister register/unregister verifies that inode/uc != NULL. For what? This really looks like "hide the potential problem", the caller should pass the valid data. register() also checks uc->next == NULL, probably to prevent the double-register but the caller can do other stupid/wrong things. If we do this check, then we should document that uc->next should be cleared before register() and add BUG_ON(). Also add the small comment about the i_size_read() check. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index afbab2cb274..a39d8163b71 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -844,9 +844,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; - if (!inode || !uc || uc->next) - return -EINVAL; - + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -883,9 +881,6 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume { struct uprobe *uprobe; - if (!inode || !uc) - return; - uprobe = find_uprobe(inode, offset); if (!uprobe) return; -- cgit v1.2.3-70-g09d2 From fe20d71f25400cccc8bffef865f79250be7dbc81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 17:32:30 +0100 Subject: uprobes: Kill uprobe_consumer->filter() uprobe_consumer->filter() is pointless in its current form, kill it. We will add it back, but with the different signature/semantics. Perhaps we will even re-introduce the callsite in handler_chain(), but not to just skip uc->handler(). 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 5 ----- kernel/events/uprobes.c | 6 ++---- kernel/trace/trace_uprobe.c | 1 - 3 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f628a6fc5b..83742b91ff7 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -37,11 +37,6 @@ struct inode; struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); - /* - * filter is optional; If a filter exists, handler is run - * if and only if filter returns true. - */ - bool (*filter)(struct uprobe_consumer *self, struct task_struct *task); struct uprobe_consumer *next; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a39d8163b71..5cbebac27c0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -477,10 +477,8 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) return; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - if (!uc->filter || uc->filter(uc, current)) - uc->handler(uc, regs); - } + for (uc = uprobe->consumers; uc; uc = uc->next) + uc->handler(uc, regs); up_read(&uprobe->consumer_rwsem); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 87b6db4ccbc..e668024773d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -550,7 +550,6 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; - utc->cons.filter = NULL; ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { kfree(utc); -- cgit v1.2.3-70-g09d2 From 63633cbf82840d972248f11d2122b261d0d4779a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 22 Nov 2012 18:30:15 +0100 Subject: uprobes: Introduce filter_chain() Add the new helper filter_chain(). Currently it is only placeholder, the comment explains what is should do. We will change it later to consult every consumer to decide whether we need to install the swbp. Until then it works as if any consumer returns true, this matches the current behavior. Change install_breakpoint() to call filter_chain() instead of checking uprobe->consumers != NULL. We obviously need this, and this equally closes the race with _unregister(). Change remove_breakpoint() to call this helper too. Currently this is pointless because remove_breakpoint() is only called when the last consumer goes away, but we will change this. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5cbebac27c0..c38bf37d0ac 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -614,6 +614,18 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } +static bool filter_chain(struct uprobe *uprobe) +{ + /* + * TODO: + * for_each_consumer(uc) + * if (uc->filter(...)) + * return true; + * return false; + */ + return uprobe->consumers != NULL; +} + static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) @@ -624,11 +636,10 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, /* * If probe is being deleted, unregister thread could be done with * the vma-rmap-walk through. 
Adding a probe now can be fatal since - * nobody will be able to cleanup. Also we could be from fork or - * mremap path, where the probe might have already been inserted. - * Hence behave as if probe already existed. + * nobody will be able to cleanup. But in this case filter_chain() + * must return false, all consumers have gone away. */ - if (!uprobe->consumers) + if (!filter_chain(uprobe)) return 0; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); @@ -655,10 +666,12 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - /* can happen if uprobe_register() fails */ if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) return 0; + if (filter_chain(uprobe)) + return 0; + set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -1382,6 +1395,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. + * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; -- cgit v1.2.3-70-g09d2 From 04aab9b2006bbdeff78dc162f206fdfebeca97d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Nov 2012 19:43:50 +0100 Subject: uprobes: _unregister() should always do register_for_each_vma(false) uprobe_unregister() removes the breakpoints only if the last consumer goes away. To support the filtering it should do this every time, we want to remove the breakpoints which nobody else want to keep. Note: given that filter_chain() is not actually implemented, this patch itself doesn't change the behaviour yet, register_for_each_vma(false) is a heavy "nop" unless there are no more consumers. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c38bf37d0ac..94019908463 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -825,12 +825,20 @@ static int __uprobe_register(struct uprobe *uprobe) return register_for_each_vma(uprobe, true); } -static void __uprobe_unregister(struct uprobe *uprobe) +static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { - if (!register_for_each_vma(uprobe, false)) - delete_uprobe(uprobe); + int err; + + if (!consumer_del(uprobe, uc)) /* WARN? */ + return; - /* TODO : cant unregister? schedule a worker thread */ + err = register_for_each_vma(uprobe, false); + if (!uprobe->consumers) { + clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); + /* TODO : cant unregister? 
schedule a worker thread */ + if (!err) + delete_uprobe(uprobe); + } } /* @@ -868,8 +876,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * } else if (!consumer_add(uprobe, uc)) { ret = __uprobe_register(uprobe); if (ret) { - uprobe->consumers = NULL; - __uprobe_unregister(uprobe); + __uprobe_unregister(uprobe, uc); } else { set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } @@ -897,14 +904,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume return; mutex_lock(uprobes_hash(inode)); - - if (consumer_del(uprobe, uc)) { - if (!uprobe->consumers) { - __uprobe_unregister(uprobe); - clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } - } - + __uprobe_unregister(uprobe, uc); mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } -- cgit v1.2.3-70-g09d2 From 9a98e03cc145c994da824dac7602334f50feb670 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Nov 2012 20:15:17 +0100 Subject: uprobes: _register() should always do register_for_each_vma(true) To support the filtering uprobe_register() should do register_for_each_vma(true) every time the new consumer comes, we need to install the previously nacked breakpoints. Note: - uprobes_mutex[] should die, what it actually protects is alloc_uprobe(). - UPROBE_RUN_HANDLER should die too, obviously it can't work unless uprobe has a single consumer. The consumer should serialize with _register/_unregister itself. Or this flag should live in uprobe_consumer->state. - Perhaps we can do some optimizations later. For example, if filter_chain() never returns false uprobe can record this fact and avoid the unnecessary register_for_each_vma(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 94019908463..d1d1394bca8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -482,16 +482,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) up_read(&uprobe->consumer_rwsem); } -/* Returns the previous consumer */ -static struct uprobe_consumer * -consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); uc->next = uprobe->consumers; uprobe->consumers = uc; up_write(&uprobe->consumer_rwsem); - - return uc->next; } /* @@ -820,9 +816,15 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) return err; } -static int __uprobe_register(struct uprobe *uprobe) +static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { - return register_for_each_vma(uprobe, true); + int err; + + consumer_add(uprobe, uc); + err = register_for_each_vma(uprobe, true); + if (!err) /* TODO: pointless unless the first consumer */ + set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); + return err; } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -867,21 +869,14 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (offset > i_size_read(inode)) return -EINVAL; - ret = 0; + ret = -ENOMEM; mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); - - if (!uprobe) { - ret = -ENOMEM; - } else if (!consumer_add(uprobe, uc)) { - ret = __uprobe_register(uprobe); - if (ret) { + if (uprobe) { + ret = __uprobe_register(uprobe, uc); + if (ret) 
__uprobe_unregister(uprobe, uc); - } else { - set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } } - mutex_unlock(uprobes_hash(inode)); if (uprobe) put_uprobe(uprobe); -- cgit v1.2.3-70-g09d2 From e591c8d78e49e6206935cf31c4d2b603bbb29166 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 17:29:40 +0100 Subject: uprobes: Introduce uprobe->register_rwsem Introduce uprobe->register_rwsem. It is taken for writing around __uprobe_register/unregister. Change handler_chain() to use this sem rather than consumer_rwsem. The main reason for this change is that we have the nasty problem with mmap_sem/consumer_rwsem dependency. filter_chain() needs to protect uprobe->consumers like handler_chain(), but they can not use the same lock. filter_chain() can be called under ->mmap_sem (currently this is always true), but we want to allow ->handler() to play with the probed task's memory, and this needs ->mmap_sem. Alternatively we could use srcu, but synchronize_srcu() is very slow and ->register_rwsem allows us to do more. In particular, we can teach handler_chain() to do remove_breakpoint() if this bp is "nacked" by all consumers, we know that we can't race with the new consumer which does uprobe_register(). See also the next patches. uprobes_mutex[] is almost ready to die. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1d1394bca8..61d0fa6b501 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -91,6 +91,7 @@ static atomic_t uprobe_events = ATOMIC_INIT(0); struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; + struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; @@ -449,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; + init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); mutex_init(&uprobe->copy_mutex); /* For now assume that the instruction need not be single-stepped */ @@ -476,10 +478,10 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) return; - down_read(&uprobe->consumer_rwsem); + down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) uc->handler(uc, regs); - up_read(&uprobe->consumer_rwsem); + up_read(&uprobe->register_rwsem); } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -873,9 +875,11 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); if (uprobe) { + down_write(&uprobe->register_rwsem); ret = __uprobe_register(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); } mutex_unlock(uprobes_hash(inode)); if (uprobe) @@ -899,7 +903,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume return; mutex_lock(uprobes_hash(inode)); + down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } -- cgit v1.2.3-70-g09d2 From 1ff6fee5e62c57d5923b805bb4206acb7953f16e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 
18:15:46 +0100 Subject: uprobes: Change filter_chain() to iterate ->consumers list Now that it is safe to use ->consumer_rwsem under ->mmap_sem we can almost finish the implementation of filter_chain(). It still lacks the actual uc->filter(...) call but otherwise it is ready; it just pretends that ->filter() always returns true. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 61d0fa6b501..4d045236368 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -614,14 +614,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, static bool filter_chain(struct uprobe *uprobe) { - /* - * TODO: - * for_each_consumer(uc) - * if (uc->filter(...)) - * return true; - * return false; - */ - return uprobe->consumers != NULL; + struct uprobe_consumer *uc; + bool ret = false; + + down_read(&uprobe->consumer_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + /* TODO: ret = uc->filter(...) */ + ret = true; + if (ret) + break; + } + up_read(&uprobe->consumer_rwsem); + + return ret; } static int -- cgit v1.2.3-70-g09d2 From bb929284be40cbbdb347690742557d708fd504a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 18:27:08 +0100 Subject: uprobes: Kill UPROBE_RUN_HANDLER flag Simply remove UPROBE_RUN_HANDLER and the corresponding code. It can only help if uprobe has a single consumer, and in fact it is no longer needed after handler_chain() was changed to use ->register_rwsem, we simply can not race with uprobe_register(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4d045236368..7ec2eb27863 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -83,10 +83,8 @@ static atomic_t uprobe_events = ATOMIC_INIT(0); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Dont run handlers when first register/ last unregister in progress*/ -#define UPROBE_RUN_HANDLER 1 /* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 2 +#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -475,9 +473,6 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) - return; - down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) uc->handler(uc, regs); @@ -825,13 +820,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { - int err; - consumer_add(uprobe, uc); - err = register_for_each_vma(uprobe, true); - if (!err) /* TODO: pointless unless the first consumer */ - set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - return err; + consumer_add(uprobe, uc); + return register_for_each_vma(uprobe, true); } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -842,12 +832,9 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u return; err = register_for_each_vma(uprobe, false); - if (!uprobe->consumers) { - clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - /* TODO : cant unregister?
schedule a worker thread */ - if (!err) - delete_uprobe(uprobe); - } + /* TODO : cant unregister? schedule a worker thread */ + if (!uprobe->consumers && !err) + delete_uprobe(uprobe); } /* -- cgit v1.2.3-70-g09d2 From d4d3ccc6d1eb74bd315d49a3829c5ad6c48d21b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 18:51:34 +0100 Subject: uprobes: Kill uprobe->copy_mutex Now that ->register_rwsem is safe under ->mmap_sem we can kill ->copy_mutex and abuse down_write(&uprobe->consumer_rwsem). This makes prepare_uprobe() even more ugly, but we should kill it anyway. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ec2eb27863..40ced98b2bc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -91,7 +91,6 @@ struct uprobe { atomic_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; - struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ @@ -450,7 +449,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - mutex_init(&uprobe->copy_mutex); /* For now assume that the instruction need not be single-stepped */ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); @@ -578,7 +576,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; - mutex_lock(&uprobe->copy_mutex); + /* TODO: move this into _register, until then we abuse this sem. */ + down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; @@ -602,7 +601,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: - mutex_unlock(&uprobe->copy_mutex); + up_write(&uprobe->consumer_rwsem); return ret; } -- cgit v1.2.3-70-g09d2 From 441f1eb7db8babe2b6b4bc805f023739dbb70e33 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 19:54:29 +0100 Subject: uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead uprobe_events counts the number of uprobes in uprobes_tree but it is used as a boolean. We can use RB_EMPTY_ROOT() instead. Probably no_uprobe_events() added by this patch can have more callers, say, mmf_recalc_uprobes(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 40ced98b2bc..5d38b40644b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -41,6 +41,11 @@ #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; +/* + * allows us to skip the uprobe_mmap if there are no uprobe events active + * at this time. Probably a fine grained per inode count is better? 
+ */ +#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ @@ -74,13 +79,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; static struct percpu_rw_semaphore dup_mmap_sem; -/* - * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe - * events active at this time. Probably a fine grained per inode count is - * better? - */ -static atomic_t uprobe_events = ATOMIC_INIT(0); - /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 /* Can skip singlestep */ @@ -460,8 +458,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) kfree(uprobe); uprobe = cur_uprobe; iput(inode); - } else { - atomic_inc(&uprobe_events); } return uprobe; @@ -685,7 +681,6 @@ static void delete_uprobe(struct uprobe *uprobe) spin_unlock(&uprobes_treelock); iput(uprobe->inode); put_uprobe(uprobe); - atomic_dec(&uprobe_events); } struct map_info { @@ -975,7 +970,7 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) + if (no_uprobe_events() || !valid_vma(vma, true)) return 0; inode = vma->vm_file->f_mapping->host; @@ -1021,7 +1016,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) + if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ -- cgit v1.2.3-70-g09d2 From 06b7bcd8cbd7eb1af331e437ec3d8f5182ae1b7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 22:01:42 +0100 Subject: uprobes: Introduce uprobe_is_active() The lifetime of uprobe->rb_node and uprobe->inode is not refcounted, delete_uprobe() is called when we detect that uprobe has no consumers, and it would be deadly wrong to do this twice. Change delete_uprobe() to WARN() if it was already called. We use RB_CLEAR_NODE() to mark uprobe "inactive", then RB_EMPTY_NODE() can be used to detect this case. RB_EMPTY_NODE() is not used directly, we add the trivial helper for the next change. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5d38b40644b..358baddc8ac 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -669,6 +669,10 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad return set_orig_insn(&uprobe->arch, mm, vaddr); } +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} /* * There could be threads that have already hit the breakpoint. They * will recheck the current insn and restart if find_uprobe() fails. 
@@ -676,9 +680,13 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { + if (WARN_ON(!uprobe_is_active(uprobe))) + return; + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); + RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ iput(uprobe->inode); put_uprobe(uprobe); } -- cgit v1.2.3-70-g09d2 From 66d06dffa5ef6f3544997440af63a91ef36a2171 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 22:48:37 +0100 Subject: uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register() uprobe_register() and uprobe_unregister() are the only users of mutex_lock(uprobes_hash(inode)), and the only reason why we can't simply remove it is that we need to ensure that delete_uprobe() is not possible after alloc_uprobe() and before consumer_add(). IOW, we need to ensure that when we take uprobe->register_rwsem this uprobe is still valid and we didn't race with _unregister() which called delete_uprobe() in between. With this patch uprobe_register() simply checks uprobe_is_active() and retries if it hits this very unlikely race. uprobes_mutex[] is no longer needed and can be removed. There is another reason for this change, prepare_uprobe() should be folded into alloc_uprobe() and we do not want to hold the extra locks around read_mapping_page/etc. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 51 +++++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 358baddc8ac..c3b65d1c844 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -50,29 +50,6 @@ static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 - -/* - * We need separate register/unregister and mmap/munmap lock hashes because - * of mmap_sem nesting. - * - * uprobe_register() needs to install probes on (potentially) all processes - * and thus needs to acquire multiple mmap_sems (consequtively, not - * concurrently), whereas uprobe_mmap() is called while holding mmap_sem - * for the particular process doing the mmap. - * - * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem - * because of lock order against i_mmap_mutex. This means there's a hole in - * the register vma iteration where a mmap() can happen. - * - * Thus uprobe_register() can race with uprobe_mmap() and we can try and - * install a probe where one is already installed. - */ - -/* serialize (un)register */ -static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; - -#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) - /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) @@ -865,20 +842,26 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (offset > i_size_read(inode)) return -EINVAL; - ret = -ENOMEM; - mutex_lock(uprobes_hash(inode)); + retry: uprobe = alloc_uprobe(inode, offset); - if (uprobe) { - down_write(&uprobe->register_rwsem); + if (!uprobe) + return -ENOMEM; + /* + * We can race with uprobe_unregister()->delete_uprobe(). + * Check uprobe_is_active() and retry if it is false. 
+ */ + down_write(&uprobe->register_rwsem); + ret = -EAGAIN; + if (likely(uprobe_is_active(uprobe))) { ret = __uprobe_register(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); } - mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } @@ -896,11 +879,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume if (!uprobe) return; - mutex_lock(uprobes_hash(inode)); down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); up_write(&uprobe->register_rwsem); - mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } @@ -1609,10 +1590,8 @@ static int __init init_uprobes(void) { int i; - for (i = 0; i < UPROBES_HASH_SZ; i++) { - mutex_init(&uprobes_mutex[i]); + for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - } if (percpu_init_rwsem(&dup_mmap_sem)) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 806a98bdf2a862fef0fc880399d677b35ba525ff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 27 Dec 2012 18:21:11 +0100 Subject: uprobes: Rationalize the usage of filter_chain() filter_chain() was added into install_breakpoint/remove_breakpoint to simplify the initial changes but this is sub-optimal. This patch shifts the callsite to the callers, register_for_each_vma() and uprobe_mmap(). This way: - It will be easier to add the new arguments. This is the main reason, we can do more optimizations later. - register_for_each_vma(is_register => true) can be optimized, we only need to consult the new consumer. The previous consumers were already asked when they called uprobe_register(). This patch also moves the MMF_HAS_UPROBES check from remove_breakpoint(), this allows to avoid the potentionally costly filter_chain(). Note that register_for_each_vma(is_register => false) doesn't really need to take ->consumer_rwsem, but I don't think it makes sense to optimize this and introduce filter_chain_lockless(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c3b65d1c844..33912086d54 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -579,6 +579,11 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } +static inline bool consumer_filter(struct uprobe_consumer *uc) +{ + return true; /* TODO: !uc->filter || uc->filter(...) */ +} + static bool filter_chain(struct uprobe *uprobe) { struct uprobe_consumer *uc; @@ -586,8 +591,7 @@ static bool filter_chain(struct uprobe *uprobe) down_read(&uprobe->consumer_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - /* TODO: ret = uc->filter(...) */ - ret = true; + ret = consumer_filter(uc); if (ret) break; } @@ -603,15 +607,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, bool first_uprobe; int ret; - /* - * If probe is being deleted, unregister thread could be done with - * the vma-rmap-walk through. Adding a probe now can be fatal since - * nobody will be able to cleanup. But in this case filter_chain() - * must return false, all consumers have gone away. 
- */ - if (!filter_chain(uprobe)) - return 0; - ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; @@ -636,12 +631,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) - return 0; - - if (filter_chain(uprobe)) - return 0; - set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -781,10 +770,14 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); - else - err |= remove_breakpoint(uprobe, mm, info->vaddr); + if (is_register) { + /* consult only the "caller", new consumer. */ + if (consumer_filter(uprobe->consumers)) + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + if (!filter_chain(uprobe)) + err |= remove_breakpoint(uprobe, mm, info->vaddr); + } unlock: up_write(&mm->mmap_sem); @@ -968,9 +961,14 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - + /* + * We can race with uprobe_unregister(), this uprobe can be already + * removed. But in this case filter_chain() must return false, all + * consumers have gone away. + */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + filter_chain(uprobe)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } -- cgit v1.2.3-70-g09d2 From 8a7f2fa0dea3b019500961b86d765e6fdd4bffb2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 28 Dec 2012 17:58:38 +0100 Subject: uprobes: Reintroduce uprobe_consumer->filter() Finally add uprobe_consumer->filter() and change consumer_filter() to actually call this method. Note that ->filter() accepts mm_struct, not task_struct. Because: 1. We do not have for_each_mm_user(mm, task). 2. Even if we implement for_each_mm_user(), ->filter() can use it itself. 3. It is not clear who will actually need this interface to do the "nontrivial" filtering. Another argument is "enum uprobe_filter_ctx", consumer->filter() can use it to figure out why/where it was called. For example, perhaps we can add UPROBE_FILTER_PRE_REGISTER used by build_map_info() to quickly "nack" the unwanted mm's. In this case consumer should know that it is called under ->i_mmap_mutex. See the previous discussion at http://marc.info/?t=135214229700002 Perhaps we should pass more arguments, vma/vaddr? Note: this patch obviously can't help to filter out the child created by fork(), this will be addressed later. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 9 +++++++++ kernel/events/uprobes.c | 18 +++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 83742b91ff7..c2df6934fdc 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -35,8 +35,17 @@ struct inode; # include #endif +enum uprobe_filter_ctx { + UPROBE_FILTER_REGISTER, + UPROBE_FILTER_UNREGISTER, + UPROBE_FILTER_MMAP, +}; + struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); + bool (*filter)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); struct uprobe_consumer *next; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 33912086d54..c2737be3c4b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -579,19 +579,21 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } -static inline bool consumer_filter(struct uprobe_consumer *uc) +static inline bool consumer_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) { - return true; /* TODO: !uc->filter || uc->filter(...) */ + return !uc->filter || uc->filter(uc, ctx, mm); } -static bool filter_chain(struct uprobe *uprobe) +static bool filter_chain(struct uprobe *uprobe, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc); + ret = consumer_filter(uc, ctx, mm); if (ret) break; } @@ -772,10 +774,12 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(uprobe->consumers)) + if (consumer_filter(uprobe->consumers, + UPROBE_FILTER_REGISTER, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe)) + if (!filter_chain(uprobe, + UPROBE_FILTER_UNREGISTER, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } @@ -968,7 +972,7 @@ int uprobe_mmap(struct vm_area_struct *vma) */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe)) { + filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } -- cgit v1.2.3-70-g09d2 From da1816b1caeccdff04531e763bb35d7caa3ed19f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 29 Dec 2012 17:49:11 +0100 Subject: uprobes: Teach handler_chain() to filter out the probed task Currently there are 2 problems with pre-filtering: 1. It is not possible to add/remove a task (mm) after uprobe_register() 2. A forked child inherits all breakpoints and uprobe_consumer can not control this. This patch does the first step to improve the filtering. handler_chain() removes the breakpoints installed by this uprobe from current->mm if all handlers return UPROBE_HANDLER_REMOVE. Note that handler_chain() relies on ->register_rwsem to avoid the race with uprobe_register/unregister which can add/del a consumer, or even remove and then insert the new uprobe at the same address.
Perhaps we will add uprobe_apply_mm(uprobe, mm, is_register) and teach copy_mm() to do filter(UPROBE_FILTER_FORK), but I think this change makes sense anyway. Note: instead of checking the retcode from uc->handler, we could add uc->filter(UPROBE_FILTER_BPHIT). But I think this is not optimal to call 2 hooks in a row. This buys nothing, and if handler/filter do something nontrivial they will probably do the same work twice. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 3 +++ kernel/events/uprobes.c | 58 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index c2df6934fdc..95d0002efda 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -35,6 +35,9 @@ struct inode; # include #endif +#define UPROBE_HANDLER_REMOVE 1 +#define UPROBE_HANDLER_MASK 1 + enum uprobe_filter_ctx { UPROBE_FILTER_REGISTER, UPROBE_FILTER_UNREGISTER, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c2737be3c4b..04c104ad952 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -440,16 +440,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) return uprobe; } -static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) -{ - struct uprobe_consumer *uc; - - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) - uc->handler(uc, regs); - up_read(&uprobe->register_rwsem); -} - static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); @@ -882,6 +872,33 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume put_uprobe(uprobe); } +static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int err = 0; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long vaddr; + loff_t offset; + + if (!valid_vma(vma, false) || + vma->vm_file->f_mapping->host != uprobe->inode) + continue; + + offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + if (uprobe->offset < offset || + uprobe->offset >= offset + vma->vm_end - vma->vm_start) + continue; + + vaddr = offset_to_vaddr(vma, uprobe->offset); + err |= remove_breakpoint(uprobe, mm, vaddr); + } + up_read(&mm->mmap_sem); + + return err; +} + static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { @@ -1435,6 +1452,27 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct uprobe_consumer *uc; + int remove = UPROBE_HANDLER_REMOVE; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + int rc = uc->handler(uc, regs); + + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + remove &= rc; + } + + if (remove && uprobe->consumers) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + up_read(&uprobe->register_rwsem); +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 
-- cgit v1.2.3-70-g09d2 From 74e59dfc6b19e3472a7c16ad57bc831e6e647895 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 15:54:08 +0100 Subject: uprobes: Change handle_swbp() to expose bp_vaddr to handler_chain() Change handle_swbp() to set regs->ip = bp_vaddr in advance, this is what consumer->handler() needs but uprobe_get_swbp_addr() is not exported. This also simplifies the code and makes it more consistent across the supported architectures. handle_swbp() becomes the only caller of uprobe_get_swbp_addr(). Signed-off-by: Oleg Nesterov Acked-by: Ananth N Mavinakayanahalli --- arch/x86/kernel/uprobes.c | 1 - kernel/events/uprobes.c | 15 +++++++-------- kernel/trace/trace_uprobe.c | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 4e33a35d659..0ba4cfb4f41 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -681,7 +681,6 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) continue; if (auprobe->insn[i] == 0x90) { - regs->ip = uprobe_get_swbp_addr(regs); regs->ip += i + 1; return true; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04c104ad952..f1b807831fc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1504,6 +1504,10 @@ static void handle_swbp(struct pt_regs *regs) } return; } + + /* change it in advance for ->handler() and restart */ + instruction_pointer_set(regs, bp_vaddr); + /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the @@ -1511,14 +1515,14 @@ static void handle_swbp(struct pt_regs *regs) */ smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) - goto restart; + goto out; utask = current->utask; if (!utask) { utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ if (!utask) - goto restart; + goto out; } handler_chain(uprobe, regs); @@ -1531,12 +1535,7 @@ static void handle_swbp(struct pt_regs *regs) return; } -restart: - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. 
- */ - instruction_pointer_set(regs, bp_vaddr); + /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e668024773d..17d9b2bcc28 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -492,7 +492,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -667,7 +667,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!entry) goto out; - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -- cgit v1.2.3-70-g09d2 From c8a82538001e1a68f4a319d5a75de90d1f284731 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 17:40:39 +0100 Subject: uprobes: Move alloc_page() from xol_add_vma() to xol_alloc_area() Move alloc_page() from xol_add_vma() to xol_alloc_area() to cleanup the code. This separates the memory allocations and consolidates the -EALREADY cleanups and the error handling. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f1b807831fc..ea2e2a85479 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1041,22 +1041,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct xol_area *area) { - struct mm_struct *mm; - int ret; - - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) - return -ENOMEM; - - ret = -EALREADY; - mm = current->mm; + struct mm_struct *mm = current->mm; + int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. 
*/ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (area->vaddr & ~PAGE_MASK) { @@ -1072,11 +1064,8 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; ret = 0; - -fail: + fail: up_write(&mm->mmap_sem); - if (ret) - __free_page(area->page); return ret; } @@ -1104,21 +1093,26 @@ static struct xol_area *xol_alloc_area(void) area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) - return NULL; + goto out; area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); - if (!area->bitmap) - goto fail; + goto free_area; + + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) + goto free_bitmap; init_waitqueue_head(&area->wq); if (!xol_add_vma(area)) return area; -fail: + __free_page(area->page); + free_bitmap: kfree(area->bitmap); + free_area: kfree(area); - + out: return get_xol_area(current->mm); } -- cgit v1.2.3-70-g09d2 From 9b545df809644912552360054c7bbe8b8a9e01fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 16:39:49 +0100 Subject: uprobes: Fold xol_alloc_area() into get_xol_area() Currently only xol_get_insn_slot() does get_xol_area() + xol_alloc_area(), but this will have more users and we do not want to copy-and-paste this code. This patch simply moves xol_alloc_area() into get_xol_area() to simplify the current and future code. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ea2e2a85479..7e3e5c5b0d8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1070,27 +1070,21 @@ static int xol_add_vma(struct xol_area *area) return ret; } -static struct xol_area *get_xol_area(struct mm_struct *mm) -{ - struct xol_area *area; - - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ - - return area; -} - /* - * xol_alloc_area - Allocate process's xol_area. - * This area will be used for storing instructions for execution out of - * line. + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ -static struct xol_area *xol_alloc_area(void) +static struct xol_area *get_xol_area(void) { + struct mm_struct *mm = current->mm; struct xol_area *area; + area = mm->uprobes_state.xol_area; + if (area) + goto ret; + area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1113,7 +1107,10 @@ static struct xol_area *xol_alloc_area(void) free_area: kfree(area); out: - return get_xol_area(current->mm); + area = mm->uprobes_state.xol_area; + ret: + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + return area; } /* @@ -1189,14 +1186,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot unsigned long offset; void *vaddr; - area = get_xol_area(current->mm); - if (!area) { - area = xol_alloc_area(); - if (!area) - return 0; - } - current->utask->xol_vaddr = xol_take_insn_slot(area); + area = get_xol_area(); + if (!area) + return 0; + current->utask->xol_vaddr = xol_take_insn_slot(area); /* * Initialize the slot if xol_vaddr points to valid * instruction slot. 
-- cgit v1.2.3-70-g09d2 From 5a2df662aafdabffb2cf3adb780a5adf66dfb3bc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 17:03:32 +0100 Subject: uprobes: Turn add_utask() into get_utask() Rename add_utask() into get_utask() and change it to allocate on demand to simplify the caller. Like get_xol_area() it will have more users. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7e3e5c5b0d8..16e54d63a9f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1290,23 +1290,18 @@ void uprobe_copy_process(struct task_struct *t) } /* - * Allocate a uprobe_task object for the task. - * Called when the thread hits a breakpoint for the first time. + * Allocate a uprobe_task object for the task if if necessary. + * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ -static struct uprobe_task *add_utask(void) +static struct uprobe_task *get_utask(void) { - struct uprobe_task *utask; - - utask = kzalloc(sizeof *utask, GFP_KERNEL); - if (unlikely(!utask)) - return NULL; - - current->utask = utask; - return utask; + if (!current->utask) + current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + return current->utask; } /* Prepare to single-step probed instruction out of line. */ @@ -1505,13 +1500,9 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; - utask = current->utask; - if (!utask) { - utask = add_utask(); - /* Cannot allocate; re-execute the instruction. */ - if (!utask) - goto out; - } + utask = get_utask(); + if (!utask) + goto out; /* re-execute the instruction. */ handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) -- cgit v1.2.3-70-g09d2 From a6cb3f6d51253e9cf21a38b17c025018117809d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:00:06 +0100 Subject: uprobes: Do not play with utask in xol_get_insn_slot() pre_ssout()->xol_get_insn_slot() path is confusing and buggy. This patch cleanups the code, the next one fixes the bug. Change xol_get_insn_slot() to only allocate the slot and do nothing more, move the initialization of utask->xol_vaddr/vaddr into pre_ssout(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 16e54d63a9f..8d9c5bcb110 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1176,30 +1176,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) } /* - * xol_get_insn_slot - If was not allocated a slot, then - * allocate a slot. + * xol_get_insn_slot - allocate a slot for xol. * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) +static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; unsigned long offset; + unsigned long xol_vaddr; void *vaddr; area = get_xol_area(); if (!area) return 0; - current->utask->xol_vaddr = xol_take_insn_slot(area); - /* - * Initialize the slot if xol_vaddr points to valid - * instruction slot. 
- */ - if (unlikely(!current->utask->xol_vaddr)) + xol_vaddr = xol_take_insn_slot(area); + if (unlikely(!xol_vaddr)) return 0; - current->utask->vaddr = slot_addr; - offset = current->utask->xol_vaddr & ~PAGE_MASK; + /* Initialize the slot */ + offset = xol_vaddr & ~PAGE_MASK; vaddr = kmap_atomic(area->page); memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); kunmap_atomic(vaddr); @@ -1209,7 +1205,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot */ flush_dcache_page(area->page); - return current->utask->xol_vaddr; + return xol_vaddr; } /* @@ -1306,12 +1302,21 @@ static struct uprobe_task *get_utask(void) /* Prepare to single-step probed instruction out of line. */ static int -pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) - return 0; + struct uprobe_task *utask; + unsigned long xol_vaddr; + + utask = current->utask; + + xol_vaddr = xol_get_insn_slot(uprobe); + if (!xol_vaddr) + return -ENOMEM; + + utask->xol_vaddr = xol_vaddr; + utask->vaddr = bp_vaddr; - return -EFAULT; + return arch_uprobe_pre_xol(&uprobe->arch, regs); } /* -- cgit v1.2.3-70-g09d2 From aba51024e7159c93914557caaa2b8cda26331091 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:12:48 +0100 Subject: uprobes: Fix utask->xol_vaddr leak in pre_ssout() pre_ssout() should do xol_free_insn_slot() if arch_uprobe_pre_xol() fails, otherwise nobody will free the allocated slot. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d9c5bcb110..0527379dac5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1306,6 +1306,7 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask; unsigned long xol_vaddr; + int err; utask = current->utask; @@ -1316,7 +1317,13 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; - return arch_uprobe_pre_xol(&uprobe->arch, regs); + err = arch_uprobe_pre_xol(&uprobe->arch, regs); + if (unlikely(err)) { + xol_free_insn_slot(current); + return err; + } + + return 0; } /* -- cgit v1.2.3-70-g09d2 From 608e7427c0a06de0d70374a9fd7defc8eb228b7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:20:42 +0100 Subject: uprobes: Do not allocate current->utask unnecessary handle_swbp() does get_utask() before can_skip_sstep() for no reason, we do not need ->utask if can_skip_sstep() succeeds. Move get_utask() to pre_ssout() who actually starts to use it. Move the initialization of utask->active_uprobe/state as well. This way the whole initialization is consolidated in pre_ssout(). 
Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov --- kernel/events/uprobes.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0527379dac5..071edcb3e62 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1308,7 +1308,9 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) unsigned long xol_vaddr; int err; - utask = current->utask; + utask = get_utask(); + if (!utask) + return -ENOMEM; xol_vaddr = xol_get_insn_slot(uprobe); if (!xol_vaddr) @@ -1323,6 +1325,8 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) return err; } + utask->active_uprobe = uprobe; + utask->state = UTASK_SSTEP; return 0; } @@ -1474,7 +1478,6 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) */ static void handle_swbp(struct pt_regs *regs) { - struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; int uninitialized_var(is_swbp); @@ -1512,19 +1515,12 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; - utask = get_utask(); - if (!utask) - goto out; /* re-execute the instruction. */ - handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) { - utask->active_uprobe = uprobe; - utask->state = UTASK_SSTEP; + if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - } /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: -- cgit v1.2.3-70-g09d2 From af4355e91f15812df8608925738c91be57c580dd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:37:11 +0100 Subject: uprobes: Kill the bogus IS_ERR_VALUE(xol_vaddr) check utask->xol_vaddr is either zero or valid, remove the bogus IS_ERR_VALUE() check in xol_free_insn_slot(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 071edcb3e62..f6c7062fb95 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1223,8 +1223,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) return; slot_addr = tsk->utask->xol_vaddr; - - if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) + if (unlikely(!slot_addr)) return; area = tsk->mm->uprobes_state.xol_area; -- cgit v1.2.3-70-g09d2 From e8440c1458ba571bc3fac8a6beb53ff604199f5b Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Sun, 13 Jan 2013 19:03:34 +0100 Subject: uprobes: Add exports for module use The original pull message for uprobes (commit 654443e2) noted: This tree includes uprobes support in 'perf probe' - but SystemTap (and other tools) can take advantage of user probe points as well. In order to actually be usable in module-based tools like SystemTap, the interface needs to be exported. This patch first adds the obvious exports for uprobe_register and uprobe_unregister. Then it also adds one for task_user_regset_view, which is necessary to get the correct state of userspace registers. 
Signed-off-by: Josh Stone Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 3 +++ kernel/ptrace.c | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f6c7062fb95..221fc58f59e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,7 @@ #include /* read_mapping_page */ #include #include +#include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ #include /* try_to_free_swap */ @@ -851,6 +852,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * goto retry; return ret; } +EXPORT_SYMBOL_GPL(uprobe_register); /* * uprobe_unregister - unregister a already registered probe. @@ -871,6 +873,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume up_write(&uprobe->register_rwsem); put_uprobe(uprobe); } +EXPORT_SYMBOL_GPL(uprobe_unregister); static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6cbeaae4406..acbd28424d8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, kiov->iov_len, kiov->iov_base); } +/* + * This is declared in linux/regset.h and defined in machine-dependent + * code. We put the export here, near the primary machine-neutral use, + * to ensure no machine forgets it. + */ +EXPORT_SYMBOL_GPL(task_user_regset_view); #endif int ptrace_request(struct task_struct *child, long request, -- cgit v1.2.3-70-g09d2 From 84d7ed799fd6c1366547d88ddb8188c65de3b94f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:20:45 +0100 Subject: uprobes/tracing: Fix dentry/mount leak in create_trace_uprobe() create_trace_uprobe() does kern_path() to find ->d_inode, but forgets to do path_put(). We can do this right after igrab(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 17d9b2bcc28..06c22bad776 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -253,16 +253,18 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - ret = kstrtoul(arg, 0, &offset); - if (ret) - goto fail_address_parse; - inode = igrab(path.dentry->d_inode); + path_put(&path); + if (!S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } + ret = kstrtoul(arg, 0, &offset); + if (ret) + goto fail_address_parse; + argc -= 2; argv += 2; -- cgit v1.2.3-70-g09d2 From 4161824f18ff4f56f46595a4016c7315dd0d24f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:36:24 +0100 Subject: uprobes/tracing: Fully initialize uprobe_trace_consumer before uprobe_register() probe_event_enable() does uprobe_register() and only after that sets utc->tu and tu->consumer/flags. This can race with uprobe_dispatcher() which can miss these assignments or see them out of order. Nothing really bad can happen, but this doesn't look clean/safe. And this does not allow to use uprobe_consumer->filter() we are going to add, it is called by uprobe_register() and it needs utc->tu. Change this code to initialize everything before uprobe_register(), and reset tu->consumer/flags if it fails. We can't race with event_disable(), the caller holds event_mutex, and if we could the code would be wrong anyway. 
In fact I think uprobe_trace_consumer should die, it buys nothing but complicates the code. We can simply add uprobe_consumer into trace_uprobe. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 06c22bad776..15b8eceeddc 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -552,17 +552,18 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; + utc->tu = tu; + tu->consumer = utc; + tu->flags |= flag; + ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { + tu->consumer = NULL; + tu->flags &= ~flag; kfree(utc); - return ret; } - tu->flags |= flag; - utc->tu = tu; - tu->consumer = utc; - - return 0; + return ret; } static void probe_event_disable(struct trace_uprobe *tu, int flag) -- cgit v1.2.3-70-g09d2 From 7e4e28c53963e6cfa94d8109bb8f5233c5659048 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Jan 2013 17:08:47 +0100 Subject: uprobes/tracing: Ensure inode != NULL in create_trace_uprobe() probe_event_enable/disable() check tu->inode != NULL at the start. This is ugly, if igrab() can fail create_trace_uprobe() should not succeed and "postpone" the failure. And S_ISREG(inode->i_mode) check added by d24d7dbf is not safe. Note: alloc_uprobe() should probably check igrab() != NULL as well. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 15b8eceeddc..f7838cfd61b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -256,7 +256,7 @@ static int create_trace_uprobe(int argc, char **argv) inode = igrab(path.dentry->d_inode); path_put(&path); - if (!S_ISREG(inode->i_mode)) { + if (!inode || !S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } @@ -544,7 +544,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) struct uprobe_trace_consumer *utc; int ret = 0; - if (!tu->inode || tu->consumer) + if (tu->consumer) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +568,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->inode || !tu->consumer) + if (!tu->consumer) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v1.2.3-70-g09d2 From b64b007797c1e6d6b745c93c296ba1d5f4d72d86 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:15:30 +0100 Subject: uprobes/tracing: Introduce is_trace_uprobe_enabled() probe_event_enable/disable() check tu->consumer != NULL to avoid the wrong uprobe_register/unregister(). We are going to kill this pointer and "struct uprobe_trace_consumer", so we add the new helper, is_trace_uprobe_enabled(), which can rely on TP_FLAG_TRACE/TP_FLAG_PROFILE instead. Note: the current logic doesn't look optimal, it is not clear why TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive, we will probably change this later. Also kill the unused TP_FLAG_UPROBE. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_probe.h | 1 - kernel/trace/trace_uprobe.c | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 93370867781..5c7e09d10d7 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -66,7 +66,6 @@ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 -#define TP_FLAG_UPROBE 8 /* data_rloc: data relative location, compatible with u32 */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f7838cfd61b..d6c6e2a345a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -539,12 +539,17 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } +static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) +{ + return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); +} + static int probe_event_enable(struct trace_uprobe *tu, int flag) { struct uprobe_trace_consumer *utc; int ret = 0; - if (tu->consumer) + if (is_trace_uprobe_enabled(tu)) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +573,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->consumer) + if (!is_trace_uprobe_enabled(tu)) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v1.2.3-70-g09d2 From a932b7381f81235530c3d0acbd3ba2c7537d78e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:47:23 +0100 Subject: uprobes/tracing: Kill uprobe_trace_consumer, embed uprobe_consumer into trace_uprobe trace_uprobe->consumer and "struct uprobe_trace_consumer" add the unnecessary indirection and complicate the code for no reason. This patch simply embeds uprobe_consumer into "struct trace_uprobe", all other changes only fix the compilation errors. 
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d6c6e2a345a..9c8babbfd11 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -31,17 +31,11 @@ /* * uprobe event core functions */ -struct trace_uprobe; -struct uprobe_trace_consumer { - struct uprobe_consumer cons; - struct trace_uprobe *tu; -}; - struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; - struct uprobe_trace_consumer *consumer; + struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; @@ -92,6 +86,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) goto error; INIT_LIST_HEAD(&tu->list); + tu->consumer.handler = uprobe_dispatcher; return tu; error: @@ -546,27 +541,15 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) static int probe_event_enable(struct trace_uprobe *tu, int flag) { - struct uprobe_trace_consumer *utc; int ret = 0; if (is_trace_uprobe_enabled(tu)) return -EINTR; - utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); - if (!utc) - return -EINTR; - - utc->cons.handler = uprobe_dispatcher; - utc->tu = tu; - tu->consumer = utc; tu->flags |= flag; - - ret = uprobe_register(tu->inode, tu->offset, &utc->cons); - if (ret) { - tu->consumer = NULL; + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (ret) tu->flags &= ~flag; - kfree(utc); - } return ret; } @@ -576,10 +559,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; - kfree(tu->consumer); - tu->consumer = NULL; } static int uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -717,13 +698,9 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { - struct uprobe_trace_consumer *utc; struct trace_uprobe *tu; - utc = container_of(con, struct uprobe_trace_consumer, cons); - tu = utc->tu; - if (!tu || tu->consumer != utc) - return 0; + tu = container_of(con, struct trace_uprobe, consumer); if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v1.2.3-70-g09d2 From 1b47aefd9b6bd439a4be43c47acd22987ac22db8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:55:27 +0100 Subject: uprobes/perf: Always increment trace_uprobe->nhit Move tu->nhit++ from uprobe_trace_func() to uprobe_dispatcher(). ->nhit counts how many times we hit the breakpoint inserted by this uprobe; we do not want to lose this info if the uprobe was enabled by sys_perf_event_open().
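As a shape reference only (userspace C, invented names), the idea is that the counter belongs in the shared dispatcher, ahead of the per-mode handlers:

    #include <stdio.h>

    #define TP_FLAG_TRACE   1
    #define TP_FLAG_PROFILE 2

    struct probe {
        unsigned int flags;
        unsigned long nhit;
    };

    static void trace_func(struct probe *p) { printf("ftrace path, hit %lu\n", p->nhit); }
    static void perf_func(struct probe *p)  { printf("perf path, hit %lu\n", p->nhit); }

    static void dispatcher(struct probe *p)
    {
        p->nhit++;                        /* counted whatever the mode is */

        if (p->flags & TP_FLAG_TRACE)
            trace_func(p);
        if (p->flags & TP_FLAG_PROFILE)
            perf_func(p);
    }

    int main(void)
    {
        struct probe p = { .flags = TP_FLAG_PROFILE };

        dispatcher(&p);
        printf("nhit = %lu\n", p.nhit);
        return 0;
    }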
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9c8babbfd11..c4e29e19fdd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -476,8 +476,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tu->call; - tu->nhit++; - local_save_flags(irq_flags); pc = preempt_count(); @@ -701,6 +699,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) struct trace_uprobe *tu; tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v1.2.3-70-g09d2 From f22c1bb6b4706be3502b378cb14564449b15f983 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 2 Feb 2013 16:27:52 +0100 Subject: perf: Introduce hw_perf_event->tp_target and ->tp_list sys_perf_event_open()->perf_init_event(event) is called before find_get_context(event), this means that event->ctx == NULL when class->reg(TRACE_REG_PERF_REGISTER/OPEN) is called and thus it can't know if this event is per-task or system-wide. This patch adds hw_perf_event->tp_target for PERF_TYPE_TRACEPOINT, this is analogous to PERF_TYPE_BREAKPOINT/bp_target we already have. The patch also moves ->bp_target up so that it can overlap with the new member, this can help the compiler to generate the better code. trace_uprobe_register() will use it for prefiltering to avoid the unnecessary breakpoints in mm's we do not want to trace. ->tp_target doesn't have its own reference, but we can rely on the fact that either sys_perf_event_open() holds a reference, or it is equal to event->ctx->task. So this pointer is always valid until free_event(). Also add the "struct list_head tp_list" into this union. It is not strictly necessary, but it can simplify the next changes and we can add it for free. Signed-off-by: Oleg Nesterov --- include/linux/perf_event.h | 9 +++++++-- kernel/events/core.c | 5 ++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 42adf012145..e47ee462c2f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -135,16 +135,21 @@ struct hw_perf_event { struct { /* software */ struct hrtimer hrtimer; }; + struct { /* tracepoint */ + struct task_struct *tp_target; + /* for tp_event->class */ + struct list_head tp_list; + }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ - struct arch_hw_breakpoint info; - struct list_head bp_list; /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct task_struct *bp_target; + struct arch_hw_breakpoint info; + struct list_head bp_list; }; #endif }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 301079d06f2..e2d4323c6ae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6162,11 +6162,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; + + if (attr->type == PERF_TYPE_TRACEPOINT) + event->hw.tp_target = task; #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * hw_breakpoint is a bit difficult here.. 
*/ - if (attr->type == PERF_TYPE_BREAKPOINT) + else if (attr->type == PERF_TYPE_BREAKPOINT) event->hw.bp_target = task; #endif } -- cgit v1.2.3-70-g09d2 From bdf8647c44766590ed02f9a84a450a796558b753 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 Feb 2013 19:21:12 +0100 Subject: uprobes: Introduce uprobe_apply() Currently it is not possible to change the filtering constraints after uprobe_register(), so a consumer can not, say, start to trace a task/mm which was previously filtered out, or remove the no longer needed bp's. Introduce uprobe_apply() which simply does register_for_each_vma() again to consult uprobe_consumer->filter() and install/remove the breakpoints. The only complication is that register_for_each_vma() can no longer assume that uprobe->consumers should be consulter if is_register == T, so we change it to accept "struct uprobe_consumer *new" instead. Unlike uprobe_register(), uprobe_apply(true) doesn't do "unregister" if register_for_each_vma() fails, it is up to caller to handle the error. Note: we probably need to cleanup the current interface, it is strange that uprobe_apply/unregister need inode/offset. We should either change uprobe_register() to return "struct uprobe *", or add a private ->uprobe member in uprobe_consumer. And in the long term uprobe_apply() should take a single argument, uprobe or consumer, even "bool add" should go away. Signed-off-by: Oleg Nesterov --- include/linux/uprobes.h | 6 ++++++ kernel/events/uprobes.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 95d0002efda..02b83db8e2c 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -101,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -124,6 +125,11 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { return -ENOSYS; } +static inline int +uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) +{ + return -ENOSYS; +} static inline void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 221fc58f59e..a567c8c7ef3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -733,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) return curr; } -static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +static int +register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { + bool is_register = !!new; struct map_info *info; int err = 0; @@ -765,7 +767,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { /* consult only the "caller", new consumer. 
*/ - if (consumer_filter(uprobe->consumers, + if (consumer_filter(new, UPROBE_FILTER_REGISTER, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { @@ -788,7 +790,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { consumer_add(uprobe, uc); - return register_for_each_vma(uprobe, true); + return register_for_each_vma(uprobe, uc); } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -798,7 +800,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u if (!consumer_del(uprobe, uc)) /* WARN? */ return; - err = register_for_each_vma(uprobe, false); + err = register_for_each_vma(uprobe, NULL); /* TODO : cant unregister? schedule a worker thread */ if (!uprobe->consumers && !err) delete_uprobe(uprobe); @@ -854,6 +856,35 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * } EXPORT_SYMBOL_GPL(uprobe_register); +/* + * uprobe_apply - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: consumer which wants to add more or remove some breakpoints + * @add: add or remove the breakpoints + */ +int uprobe_apply(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc, bool add) +{ + struct uprobe *uprobe; + struct uprobe_consumer *con; + int ret = -ENOENT; + + uprobe = find_uprobe(inode, offset); + if (!uprobe) + return ret; + + down_write(&uprobe->register_rwsem); + for (con = uprobe->consumers; con && con != uc ; con = con->next) + ; + if (con) + ret = register_for_each_vma(uprobe, add ? uc : NULL); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); + + return ret; +} + /* * uprobe_unregister - unregister a already registered probe. * @inode: the file in which the probe has to be removed. -- cgit v1.2.3-70-g09d2 From 736288ba5016e255869c26296014eeff649971c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 Feb 2013 20:58:35 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to track the active perf_event's Introduce "struct trace_uprobe_filter" which records the "active" perf_event's attached to ftrace_event_call. For the start we simply use list_head, we can optimize this later if needed. For example, we do not really need to record an event with ->parent != NULL, we can rely on parent->child_list. And we can certainly do some optimizations for the case when 2 events have the same ->tp_target or tp_target->mm. Change trace_uprobe_register() to process TRACE_REG_PERF_OPEN/CLOSE and add/del this perf_event to the list. We can probably avoid any locking, but lets start with the "obvioulsy correct" trace_uprobe_filter->rwlock which protects everything. 
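As a rough userspace model of this bookkeeping (a pthread rwlock and a hand-rolled singly linked list stand in for the kernel's rwlock_t and list_head; all names invented), the open/close pair looks like:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct event {                        /* stand-in for a perf_event */
        void *target_task;                /* NULL means a system-wide event */
        struct event *next;
    };

    struct uprobe_filter {
        pthread_rwlock_t rwlock;
        int nr_systemwide;
        struct event *events;             /* per-task events */
    };

    static void filter_init(struct uprobe_filter *f)
    {
        pthread_rwlock_init(&f->rwlock, NULL);
        f->nr_systemwide = 0;
        f->events = NULL;
    }

    static bool filter_is_empty(struct uprobe_filter *f)
    {
        return !f->nr_systemwide && !f->events;
    }

    static void filter_open(struct uprobe_filter *f, struct event *ev)
    {
        pthread_rwlock_wrlock(&f->rwlock);
        if (ev->target_task) {            /* track per-task events on a list */
            ev->next = f->events;
            f->events = ev;
        } else {                          /* just count system-wide ones */
            f->nr_systemwide++;
        }
        pthread_rwlock_unlock(&f->rwlock);
    }

    static void filter_close(struct uprobe_filter *f, struct event *ev)
    {
        pthread_rwlock_wrlock(&f->rwlock);
        if (ev->target_task) {
            struct event **pp = &f->events;

            while (*pp && *pp != ev)      /* unlink this event */
                pp = &(*pp)->next;
            if (*pp)
                *pp = ev->next;
        } else {
            f->nr_systemwide--;
        }
        pthread_rwlock_unlock(&f->rwlock);
    }

    int main(void)
    {
        struct uprobe_filter f;
        struct event ev = { .target_task = (void *)1 };

        filter_init(&f);
        filter_open(&f, &ev);
        filter_close(&f, &ev);
        pthread_rwlock_destroy(&f.rwlock);
        return filter_is_empty(&f) ? 0 : 1;
    }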
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c4e29e19fdd..2a74a93afda 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,12 @@ #define UPROBE_EVENT_SYSTEM "uprobes" +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* * uprobe event core functions */ @@ -35,6 +41,7 @@ struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; + struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct inode *inode; char *filename; @@ -58,6 +65,18 @@ static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). */ @@ -87,6 +106,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) INIT_LIST_HEAD(&tu->list); tu->consumer.handler = uprobe_dispatcher; + init_trace_uprobe_filter(&tu->filter); return tu; error: @@ -544,6 +564,8 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) if (is_trace_uprobe_enabled(tu)) return -EINTR; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + tu->flags |= flag; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) @@ -557,6 +579,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; } @@ -632,6 +656,30 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_add(&event->hw.tp_list, &tu->filter.perf_events); + else + tu->filter.nr_systemwide++; + write_unlock(&tu->filter.rwlock); + + return 0; +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_del(&event->hw.tp_list); + else + tu->filter.nr_systemwide--; + write_unlock(&tu->filter.rwlock); + + return 0; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -687,6 +735,13 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(tu, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(tu, data); + #endif default: return 0; -- cgit v1.2.3-70-g09d2 From 31ba334836c0ac0039084859f14a5b96858493dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:11:58 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to pre-filter Finally implement uprobe_perf_filter() which checks ->nr_systemwide or ->perf_events to figure out whether we need to insert the breakpoint. 
uprobe_perf_open/close are changed to do uprobe_apply(true/false) when the new perf event comes or goes away. Note that currently this is very suboptimal: - uprobe_register() called by TRACE_REG_PERF_REGISTER becomes a heavy nop, consumer->filter() always returns F at this stage. As it was already discussed we need uprobe_register_only() to avoid the costly register_for_each_vma() when possible. - uprobe_apply() is oftenly overkill. Unless "nr_systemwide != 0" changes we need uprobe_apply_mm(), unapply_uprobe() is almost what we need. - uprobe_apply() can be simply avoided sometimes, see the next changes. Testing: # perf probe -x /lib/libc.so.6 syscall # perl -e 'syscall -1 while 1' & [1] 530 # perf record -e probe_libc:syscall perl -e 'syscall -1 for 1..10; sleep 1' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 79291 A huge ->nrhit == 79291 reflects the fact that the background process 530 constantly hits this breakpoint too, even if doesn't contribute to the output. After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 10 This shows that only the target process was punished by int3. Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 46 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a74a93afda..b7850f535ac 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -557,7 +557,12 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); } -static int probe_event_enable(struct trace_uprobe *tu, int flag) +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); + +static int +probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) { int ret = 0; @@ -567,6 +572,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); tu->flags |= flag; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) tu->flags &= ~flag; @@ -656,6 +662,22 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.tp_target->mm == mm) + return true; + } + + return false; +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { write_lock(&tu->filter.rwlock); @@ -665,6 +687,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide++; write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + return 0; } @@ -677,9 +701,25 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide--; write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + return 0; } +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + 
read_lock(&tu->filter.rwlock); + ret = __uprobe_perf_filter(&tu->filter, mm); + read_unlock(&tu->filter.rwlock); + + return ret; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -722,7 +762,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE); + return probe_event_enable(tu, TP_FLAG_TRACE, NULL); case TRACE_REG_UNREGISTER: probe_event_disable(tu, TP_FLAG_TRACE); @@ -730,7 +770,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE); + return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); -- cgit v1.2.3-70-g09d2 From f42d24a1d20d2e72d1e5d48930f18b138dfad117 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:48:34 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to use UPROBE_HANDLER_REMOVE Change uprobe_trace_func() and uprobe_perf_func() to return "int". Change uprobe_dispatcher() to return "trace_ret | perf_ret", although this is not needed; currently TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive. The only functional change is that uprobe_perf_func() checks the filtering too and returns UPROBE_HANDLER_REMOVE if nobody wants to trace current. Testing: # perf probe -x /lib/libc.so.6 syscall # perf record -e probe_libc:syscall -i perl -e 'fork; syscall -1 for 1..10; wait' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 20 A child process doesn't have a counter, but it still hits this breakpoint "copied" by dup_mmap(). After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 11 The child process hits this int3 only once and does unapply_uprobe().
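A compact userspace sketch of the resulting dispatch flow (HANDLER_REMOVE is a stand-in constant and all names are invented, not the kernel definitions):

    #include <stdbool.h>
    #include <stdio.h>

    #define HANDLER_REMOVE  1             /* "caller may remove this breakpoint" */
    #define TP_FLAG_TRACE   1
    #define TP_FLAG_PROFILE 2

    struct probe {
        unsigned int flags;
        bool (*filter)(void);             /* does anyone still want to trace current? */
    };

    static int trace_func(struct probe *p)
    {
        printf("probe %p: record to the ftrace buffer\n", (void *)p);
        return 0;
    }

    static int perf_func(struct probe *p)
    {
        if (!p->filter())                 /* nobody attached to this mm anymore */
            return HANDLER_REMOVE;
        printf("probe %p: record a perf sample\n", (void *)p);
        return 0;
    }

    static int dispatcher(struct probe *p)
    {
        int ret = 0;

        if (p->flags & TP_FLAG_TRACE)
            ret |= trace_func(p);
        if (p->flags & TP_FLAG_PROFILE)
            ret |= perf_func(p);

        return ret;                       /* non-zero asks the caller to unapply */
    }

    static bool nobody(void) { return false; }

    int main(void)
    {
        struct probe p = { .flags = TP_FLAG_PROFILE, .filter = nobody };

        printf("dispatcher returned %d\n", dispatcher(&p));
        return 0;
    }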
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b7850f535ac..2399f141655 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -486,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = { }; /* uprobe handler */ -static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -504,7 +504,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->ip = instruction_pointer(task_pt_regs(current)); @@ -514,6 +514,8 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; } /* Event entry printers */ @@ -721,7 +723,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, } /* uprobe profile handler */ -static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; @@ -730,11 +732,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) int size, __size, i; int rctx; + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + __size = sizeof(*entry) + tu->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; + return 0; preempt_disable(); @@ -752,6 +757,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) out: preempt_enable(); + return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -792,18 +798,19 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; + int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; if (tu->flags & TP_FLAG_TRACE) - uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs); #ifdef CONFIG_PERF_EVENTS if (tu->flags & TP_FLAG_PROFILE) - uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs); #endif - return 0; + return ret; } static struct trace_event_functions uprobe_funcs = { -- cgit v1.2.3-70-g09d2 From b2fe8ba674e8acbb9e8e63510b802c6d054d88a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 19:05:43 +0100 Subject: uprobes/perf: Avoid uprobe_apply() whenever possible uprobe_perf_open/close call the costly uprobe_apply() every time, we can avoid it if: - "nr_systemwide != 0" is not changed. - There is another process/thread with the same ->mm. - copy_proccess() does inherit_event(). dup_mmap() preserves the inserted breakpoints. - event->attr.enable_on_exec == T, we can rely on uprobe_mmap() called by exec/mmap paths. - tp_target is exiting. Only _close() checks PF_EXITING, I don't think TRACE_REG_PERF_OPEN can hit the dying task too often. 
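Stripped of locking and the perf specifics, the added logic amounts to "only pay for uprobe_apply() when this open/close can actually change the filter's answer". A hedged sketch of that predicate (userspace C, field names invented):

    #include <stdbool.h>
    #include <stdio.h>

    struct fake_event {                   /* stand-in for the relevant perf_event bits */
        bool per_task;                    /* has a tp_target task */
        bool inherited;                   /* event->parent != NULL, i.e. copy_process() */
        bool enable_on_exec;              /* exec/mmap will install the breakpoint anyway */
        bool task_exiting;                /* PF_EXITING seen at close time */
        bool mm_already_probed;           /* the filter already matches this mm */
    };

    /* true when open() may skip the expensive re-apply */
    static bool open_can_skip(int nr_systemwide, const struct fake_event *ev)
    {
        if (!ev->per_task)                /* only the 0 -> 1 edge matters */
            return nr_systemwide != 0;
        return nr_systemwide || ev->inherited ||
               ev->enable_on_exec || ev->mm_already_probed;
    }

    /* true when close() may skip the re-apply */
    static bool close_can_skip(int nr_systemwide, const struct fake_event *ev)
    {
        if (!ev->per_task)                /* only the 1 -> 0 edge matters */
            return nr_systemwide - 1 != 0;
        return nr_systemwide || ev->task_exiting || ev->mm_already_probed;
    }

    int main(void)
    {
        struct fake_event ev = { .per_task = true, .inherited = true };

        printf("skip apply on open:  %d\n", open_can_skip(0, &ev));
        printf("skip apply on close: %d\n", close_can_skip(0, &ev));
        return 0;
    }

The counts are passed in as sampled before the update, mirroring where the kernel code reads nr_systemwide relative to the increment/decrement.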
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2399f141655..8dad2a92dee 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -680,30 +680,60 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return false; } +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ + return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. + */ + done = tu->filter.nr_systemwide || + event->parent || event->attr.enable_on_exec || + uprobe_filter_event(tu, event); list_add(&event->hw.tp_list, &tu->filter.perf_events); - else + } else { + done = tu->filter.nr_systemwide; tu->filter.nr_systemwide++; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); return 0; } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { list_del(&event->hw.tp_list); - else + done = tu->filter.nr_systemwide || + (event->hw.tp_target->flags & PF_EXITING) || + uprobe_filter_event(tu, event); + } else { tu->filter.nr_systemwide--; + done = tu->filter.nr_systemwide; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -- cgit v1.2.3-70-g09d2 From 02e176af92f3e2e9ec3a48792036566af2dcd534 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Wed, 6 Feb 2013 23:29:20 +0200 Subject: perf/hwbp: Fix cleanup in case of kzalloc failure Obviously this is a typo and could result in memory leaks if kzalloc fails on a given cpu. Signed-off-by: Daniel Baluta Acked-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1360186160-7566-1-git-send-email-dbaluta@ixiacom.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fe8a916507e..a64f8aeb5c1 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); if (err_cpu == cpu) break; } -- cgit v1.2.3-70-g09d2
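The bug class generalizes beyond hw_breakpoint: an error-unwind loop that indexes with the wrong variable either leaks or touches slots that were never allocated. A small standalone C illustration of the corrected pattern (names invented):

    #include <stdlib.h>

    #define NCPUS 8

    static int *per_cpu_buf[NCPUS];

    static int init_buffers(void)
    {
        int cpu, err_cpu;

        for (cpu = 0; cpu < NCPUS; cpu++) {
            per_cpu_buf[cpu] = calloc(16, sizeof(int));
            if (!per_cpu_buf[cpu])
                goto err_alloc;
        }
        return 0;

    err_alloc:
        /* unwind with err_cpu, not cpu; free(NULL) at the failing slot is harmless */
        for (err_cpu = 0; err_cpu < NCPUS; err_cpu++) {
            free(per_cpu_buf[err_cpu]);
            per_cpu_buf[err_cpu] = NULL;
            if (err_cpu == cpu)           /* stop at the CPU that failed */
                break;
        }
        return -1;
    }

    int main(void)
    {
        return init_buffers() ? 1 : 0;
    }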