From e7e2ee89a9dbf48d70a922d5625cd7320a27cbff Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 10 May 2011 13:27:21 -0700 Subject: tracing: Schedule a delayed work to call wakeup() When syscall tracing is used by concurrent processes, the wakeup() that is called in the event commit function causes contention on the spin lock of the waitqueue. I enabled the sys_enter_getuid and sys_exit_getuid tracepoints, and by running getuid_microbench from autotest in parallel I found that the contention causes an exponential latency increase in the tracing path. The autotest binary getuid_microbench calls getuid() in a tight loop for the given number of iterations and measures the average time required to complete a single syscall invocation. The patch schedules a delayed work 2 ms after an event commit to wake up the trace wait_queue. This removes the delay caused by contention on the spin lock in wakeup() and amortizes the wakeup() calls over the 2 ms period. In the following example, the script enables the sys_enter_getuid and sys_exit_getuid tracepoints and runs getuid_microbench in parallel with the given number of processes. The output clearly shows the latency increase caused by contention. $ ~/getuid.sh 1 1000000 calls in 0.720974253 s (720.974253 ns/call) $ ~/getuid.sh 2 1000000 calls in 1.166457554 s (1166.457554 ns/call) 1000000 calls in 1.168933765 s (1168.933765 ns/call) $ ~/getuid.sh 3 1000000 calls in 1.783827516 s (1783.827516 ns/call) 1000000 calls in 1.795553270 s (1795.553270 ns/call) 1000000 calls in 1.796493376 s (1796.493376 ns/call) $ ~/getuid.sh 4 1000000 calls in 4.483041796 s (4483.041796 ns/call) 1000000 calls in 4.484165388 s (4484.165388 ns/call) 1000000 calls in 4.484850762 s (4484.850762 ns/call) 1000000 calls in 4.485643576 s (4485.643576 ns/call) $ ~/getuid.sh 5 1000000 calls in 6.497521653 s (6497.521653 ns/call) 1000000 calls in 6.502000236 s (6502.000236 ns/call) 1000000 calls in 6.501709115 s (6501.709115 ns/call) 1000000 calls in 6.502124100 s (6502.124100 ns/call) 1000000 calls in 6.502936358 s (6502.936358 ns/call) After the patch, the latencies scale better. 
1000000 calls in 0.728720455 s (728.720455 ns/call) 1000000 calls in 0.842782857 s (842.782857 ns/call) 1000000 calls in 0.883803135 s (883.803135 ns/call) 1000000 calls in 0.902077764 s (902.077764 ns/call) 1000000 calls in 0.902838202 s (902.838202 ns/call) 1000000 calls in 0.908896885 s (908.896885 ns/call) 1000000 calls in 0.932523515 s (932.523515 ns/call) 1000000 calls in 0.958009672 s (958.009672 ns/call) 1000000 calls in 0.986188020 s (986.188020 ns/call) 1000000 calls in 0.989771102 s (989.771102 ns/call) 1000000 calls in 0.933518391 s (933.518391 ns/call) 1000000 calls in 0.958897947 s (958.897947 ns/call) 1000000 calls in 1.031038897 s (1031.038897 ns/call) 1000000 calls in 1.089516025 s (1089.516025 ns/call) 1000000 calls in 1.141998347 s (1141.998347 ns/call) Signed-off-by: Vaibhav Nagarnaik Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Michael Rubin Cc: David Sharp Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1305059241-7629-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee9c921d7f2..71777c8fe36 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); +static void wakeup_work_handler(struct work_struct *work) +{ + wake_up(&trace_wait); +} + +static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); + /** * trace_wake_up - wake up tasks waiting for trace input * - * Simply wakes up any task that is blocked on the trace_wait - * queue. These is used with trace_poll for tasks polling the trace. + * Schedules a delayed work to wake up any task that is blocked on the + * trace_wait queue. These is used with trace_poll for tasks polling the + * trace. */ void trace_wake_up(void) { - int cpu; + const unsigned long delay = msecs_to_jiffies(2); if (trace_flags & TRACE_ITER_BLOCK) return; - /* - * The runqueue_is_locked() can fail, but this is the best we - * have for now: - */ - cpu = get_cpu(); - if (!runqueue_is_locked(cpu)) - wake_up(&trace_wait); - put_cpu(); + schedule_delayed_work(&wakeup_work, delay); } static int __init set_buf_size(char *str) -- cgit v1.2.3-70-g09d2 From 7ea5906405a1f3fc1c0033dfd7e02f2cfd1de5e5 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 3 May 2011 17:56:42 -0700 Subject: tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. 
The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 36 +++++++++++++++++++----------------- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 7 +++---- 4 files changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index ab38ac80b0f..b891de96000 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -169,7 +169,7 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, size_t ring_buffer_page_len(void *page); -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b0c7aa40794..725153d6cf7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -997,13 +997,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct buffer_page *bpage, *tmp; - unsigned long addr; LIST_HEAD(pages); unsigned i; WARN_ON(!nr_pages); for (i = 0; i < nr_pages; i++) { + struct page *page; + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) @@ -1013,10 +1014,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, &pages); - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + GFP_KERNEL, 0); + if (!page) goto free_pages; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); } @@ -1045,7 +1047,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; - unsigned long addr; + struct page *page; int ret; cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), @@ -1067,10 +1069,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) goto fail_free_reader; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -1314,7 +1316,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) unsigned nr_pages, rm_pages, new_pages; struct buffer_page *bpage, *tmp; unsigned long buffer_size; - unsigned long addr; LIST_HEAD(pages); int i, cpu; @@ -1375,16 +1376,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { + struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, 
cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, &pages); - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) goto free_pages; - bpage->page = (void *)addr; + bpage->page = page_address(page); rb_init_page(bpage->page); } } @@ -3730,16 +3732,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or NULL on error. */ -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { struct buffer_data_page *bpage; - unsigned long addr; + struct page *page; - addr = __get_free_page(GFP_KERNEL); - if (!addr) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) return NULL; - bpage = (void *)addr; + bpage = page_address(page); rb_init_page(bpage); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 302f8a61463..a5457d577b9 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -106,7 +106,7 @@ static enum event_status read_page(int cpu) int inc; int i; - bpage = ring_buffer_alloc_read_page(buffer); + bpage = ring_buffer_alloc_read_page(buffer, cpu); if (!bpage) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 71777c8fe36..61fda6b6f1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,7 +3697,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return 0; if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); if (!info->spare) return -ENOMEM; @@ -3854,7 +3854,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer); + ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); if (!ref->page) { kfree(ref); break; @@ -3863,8 +3863,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, r = ring_buffer_read_page(ref->buffer, &ref->page, len, info->cpu, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, - ref->page); + ring_buffer_free_read_page(ref->buffer, ref->page); kfree(ref); break; } -- cgit v1.2.3-70-g09d2 From 4f271a2a60c748599b30bb4dafff30d770439b96 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Mon, 13 Jun 2011 17:51:57 -0700 Subject: tracing: Add a proc file to stop tracing and free buffer The proc file entry buffer_size_kb is used to set the size of tracing buffer. The memory to expand the buffer size is kernel memory. Consider a use case where tracing is handled by a user space utility, which acts as a gate keeper for tracing requests. In an OOM condition, tracing is considered a low priority task and if the utility gets killed the ring buffer memory cannot be released back to the kernel. This patch adds a proc file called "free_buffer" whose purpose is to stop tracing and free up the ring buffer when it is closed. The user space process can then set the desired size in buffer_size_kb file and open the fd to the "free_buffer" file. Under OOM condition, if the process gets killed, the kernel closes the file descriptor. The release handler stops the tracing and releases the kernel memory automatically. 
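To make the intended workflow concrete, here is a hypothetical user-space sketch added for illustration; it is not part of the patch. The debugfs paths match the ones used elsewhere in this series, while the buffer size and error handling are placeholders only.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd;

	/* size the ring buffer first via buffer_size_kb (value is in KB) */
	fd = open("/sys/kernel/debug/tracing/buffer_size_kb", O_WRONLY);
	if (fd < 0 || write(fd, "1408", 4) < 0)
		perror("buffer_size_kb");
	if (fd >= 0)
		close(fd);

	/*
	 * Hold an fd on free_buffer for the whole tracing session. If this
	 * gate keeper process is OOM-killed, the kernel closes the fd and
	 * the release handler stops tracing and shrinks the ring buffer
	 * back to zero.
	 */
	fd = open("/sys/kernel/debug/tracing/free_buffer", O_WRONLY);
	if (fd < 0)
		perror("free_buffer");

	/* ... accept and service tracing requests here ... */
	pause();

	close(fd);	/* a normal exit releases the buffer the same way */
	return 0;
}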
Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Michael Rubin Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Link: http://lkml.kernel.org/r/1308012717-11148-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 108 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 35 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 61fda6b6f1a..9c557ae6a21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2768,7 +2768,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } -static int tracing_resize_ring_buffer(unsigned long size) +static int __tracing_resize_ring_buffer(unsigned long size) { int ret; @@ -2820,6 +2820,41 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } +static ssize_t tracing_resize_ring_buffer(unsigned long size) +{ + int cpu, ret = size; + + mutex_lock(&trace_types_lock); + + tracing_stop(); + + /* disable all cpu buffers */ + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_inc(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_inc(&max_tr.data[cpu]->disabled); + } + + if (size != global_trace.entries) + ret = __tracing_resize_ring_buffer(size); + + if (ret < 0) + ret = -ENOMEM; + + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_dec(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_dec(&max_tr.data[cpu]->disabled); + } + + tracing_start(); + mutex_unlock(&trace_types_lock); + + return ret; +} + /** * tracing_update_buffers - used by tracing facility to expand ring buffers @@ -2837,7 +2872,7 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size); mutex_unlock(&trace_types_lock); return ret; @@ -2861,7 +2896,7 @@ static int tracing_set_tracer(const char *buf) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) goto out; ret = 0; @@ -3436,7 +3471,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret, cpu; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -3454,48 +3489,43 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (!val) return -EINVAL; - mutex_lock(&trace_types_lock); - - tracing_stop(); - - /* disable all cpu buffers */ - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_inc(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_inc(&max_tr.data[cpu]->disabled); - } - /* value is in KB */ val <<= 10; - if (val != global_trace.entries) { - ret = tracing_resize_ring_buffer(val); - if (ret < 0) { - cnt = ret; - goto out; - } - } + ret = tracing_resize_ring_buffer(val); + if (ret < 0) + return ret; *ppos += cnt; - /* If check pages failed, return ENOMEM */ - if (tracing_disabled) - cnt = -ENOMEM; - out: - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_dec(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_dec(&max_tr.data[cpu]->disabled); - } + return cnt; +} - tracing_start(); - mutex_unlock(&trace_types_lock); +static ssize_t +tracing_free_buffer_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* + * There is no need to read what the user has written, this 
function + * is just to make sure that there is no error when "echo" is used + */ + + *ppos += cnt; return cnt; } +static int +tracing_free_buffer_release(struct inode *inode, struct file *filp) +{ + /* disable tracing */ + tracing_off(); + /* resize the ring buffer to 0 */ + tracing_resize_ring_buffer(0); + + return 0; +} + static int mark_printk(const char *fmt, ...) { int ret; @@ -3641,6 +3671,11 @@ static const struct file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_free_buffer_fops = { + .write = tracing_free_buffer_write, + .release = tracing_free_buffer_release, +}; + static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, @@ -4365,6 +4400,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, + &global_trace, &tracing_free_buffer_fops); + trace_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); -- cgit v1.2.3-70-g09d2 From cf30cf67d6c7592c670ec946d89fc15ee0deb0eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jun 2011 22:44:07 -0400 Subject: tracing: Add disable_on_free option Add a trace option to disable tracing on free. When this option is set, a write into the free_buffer file will not only shrink the ring buffer down to zero, but it will also disable tracing. Cc: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++-- kernel/trace/trace.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9c557ae6a21..42fdf3adff3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -425,6 +425,7 @@ static const char *trace_options[] = { "graph-time", "record-cmd", "overwrite", + "disable_on_free", NULL }; @@ -3518,8 +3519,9 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { - /* disable tracing */ - tracing_off(); + /* disable tracing ? */ + if (trace_flags & TRACE_ITER_STOP_ON_FREE) + tracing_off(); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(0); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f8591f61..742f545ae18 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -609,6 +609,7 @@ enum trace_iterator_flags { TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, + TRACE_ITER_STOP_ON_FREE = 0x400000, }; /* -- cgit v1.2.3-70-g09d2 From bd38c0e6f98326132a691d73b2056b426423c638 Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Tue, 31 May 2011 20:51:55 +0100 Subject: ftrace: Fixed an include coding style issue Removed because was already declared. Braces of struct's coding style fixed. 
Cc: Frederic Weisbecker Signed-off-by: Paul McQuade Link: http://lkml.kernel.org/r/4DE59711.3090900@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1ee417fcbfa..e1538071660 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,7 +32,6 @@ #include -#include #include #include "trace_output.h" @@ -82,8 +81,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = -{ +static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, }; @@ -785,8 +783,7 @@ static void unregister_ftrace_profiler(void) unregister_ftrace_graph(); } #else -static struct ftrace_ops ftrace_profile_ops __read_mostly = -{ +static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, }; -- cgit v1.2.3-70-g09d2 From 321e68b095addc473ca52ced9acfa59be88f76e6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:47 +0200 Subject: tracing, function_graph: Remove dependency of abstime and duration fields on latency The display of absolute time and duration fields is based on the latency field. This was added during the irqsoff/wakeup tracers graph support changes. It's causing confusion about which fields will be displayed for the function_graph tracer itself. So I'm removing this dependency, and adding absolute time and duration fields to the preemptirqsoff, preemptoff, irqsoff and wakeup tracers. With the following commands: # echo function_graph > ./current_tracer # cat trace This is what it looked like before: # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 0) 0.068 us | } /* page_add_file_rmap */ 0) | _raw_spin_unlock() { ... This is what it looks like now: # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 0) 0.068 us | } /* add_preempt_count */ 0) 0.993 us | } /* vfsmount_lock_local_lock */ ... For the preemptirqsoff, preemptoff, irqsoff and wakeup tracers, this is what it looked like before: SNIP # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> lock-depth # |||| / # CPU TASK/PID ||||| DURATION FUNCTION CALLS # | | | ||||| | | | | | | 1) -0 | d..1 0.000 us | acpi_idle_enter_simple(); ... This is what it looks like now: SNIP # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # TIME CPU TASK/PID |||| DURATION FUNCTION CALLS # | | | | |||| | | | | | | 19.847735 | 1) -0 | d..1 0.000 us | acpi_idle_enter_simple(); ... 
Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-2-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 19 +++---------------- kernel/trace/trace_irqsoff.c | 4 +++- kernel/trace/trace_sched_wakeup.c | 4 +++- 3 files changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 962cdb24ed8..352652eb9aa 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1207,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -__print_graph_function_flags(struct trace_iterator *iter, u32 flags) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1270,18 +1270,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return __print_graph_function_flags(iter, tracer_flags.val); -} - -enum print_line_t print_graph_function_flags(struct trace_iterator *iter, - u32 flags) -{ - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - return __print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, tracer_flags.val); } static enum print_line_t @@ -1364,9 +1353,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) return; print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; + } __print_graph_headers_flags(s, flags); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c77424be284..667aa8cc0cf 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ - TRACE_GRAPH_PRINT_PROC) + TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f029dd4fd2c..e4a70c0c71b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter) graph_trace_close(iter); } -#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_DURATION) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { -- cgit v1.2.3-70-g09d2 From ffeb80fc30acbf6bd51cb47a1815f621a9d017dc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:48 +0200 Subject: tracing, function_graph: Merge overhead and duration display functions Functions print_graph_overhead() and print_graph_duration() displays data for one field - DURATION. I merged them into single function print_graph_duration(), and added a way to display the empty parts of the field. This way the print_graph_irq() function can use this column to display the IRQ signs if needed and the DURATION field details stays inside the print_graph_duration() function. 
Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-3-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 148 +++++++++++++++++------------------ 1 file changed, 74 insertions(+), 74 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 352652eb9aa..44b955fd87b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = { static struct trace_array *graph_array; +/* + * DURATION column is being also used to display IRQ signs, + * following values are used by print_graph_irq and others + * to fill in space into DURATION column. + */ +enum { + DURATION_FILL_FULL = -1, + DURATION_FILL_START = -2, + DURATION_FILL_END = -3, +}; + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags); /* Add a function return address to the trace stack on thread info.*/ int @@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter, return next; } -/* Signal a overhead of time execution to the output */ -static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s, - u32 flags) -{ - /* If duration disappear, we don't need anything */ - if (!(flags & TRACE_GRAPH_PRINT_DURATION)) - return 1; - - /* Non nested entry or return */ - if (duration == -1) - return trace_seq_printf(s, " "); - - if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - return trace_seq_printf(s, "! "); - - /* Duration exceeded 10 msecs */ - if (duration > 10000ULL) - return trace_seq_printf(s, "+ "); - } - - return trace_seq_printf(s, " "); -} - static int print_graph_abs_time(u64 t, struct trace_seq *s) { unsigned long usecs_rem; @@ -650,9 +638,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + ret = print_graph_duration(DURATION_FILL_START, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; if (type == TRACE_GRAPH_ENT) ret = trace_seq_printf(s, "==========>"); @@ -662,9 +650,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Don't close the duration column if haven't one */ - if (flags & TRACE_GRAPH_PRINT_DURATION) - trace_seq_printf(s, " |"); + ret = print_graph_duration(DURATION_FILL_END, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; + ret = trace_seq_printf(s, "\n"); if (!ret) @@ -716,9 +705,48 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) } static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +print_graph_duration(unsigned long long duration, struct trace_seq *s, + u32 flags) { - int ret; + int ret = -1; + + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) + return TRACE_TYPE_HANDLED; + + /* No real adata, just filling the column with spaces */ + switch (duration) { + case DURATION_FILL_FULL: + ret = trace_seq_printf(s, " | "); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + case DURATION_FILL_START: + ret = trace_seq_printf(s, " "); + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + case DURATION_FILL_END: + ret = trace_seq_printf(s, " |"); + return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + } + + /* Signal a overhead of time execution to the output */ + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + ret = trace_seq_printf(s, "! "); + /* Duration exceeded 10 msecs */ + else if (duration > 10000ULL) + ret = trace_seq_printf(s, "+ "); + } + + /* + * The -1 means we either did not exceed the duration tresholds + * or we dont want to print out the overhead. Either way we need + * to fill out the space. + */ + if (ret == -1) + ret = trace_seq_printf(s, " "); + + /* Catching here any failure happenned above */ + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; ret = trace_print_graph_duration(duration, s); if (ret != TRACE_TYPE_HANDLED) @@ -767,18 +795,11 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->enter_funcs[call->depth] = 0; } - /* Overhead */ - ret = print_graph_overhead(duration, s, flags); - if (!ret) + /* Overhead and duration */ + ret = print_graph_duration(duration, s, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Duration */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -815,17 +836,10 @@ print_graph_entry_nested(struct trace_iterator *iter, cpu_data->enter_funcs[call->depth] = call->func; } - /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* No time */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -1078,18 +1092,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; - /* Overhead */ - ret = print_graph_overhead(duration, s, flags); - if (!ret) + /* Overhead and duration */ + ret = print_graph_duration(duration, s, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Duration */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -1146,17 +1153,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; - /* No overhead */ - ret = print_graph_overhead(-1, s, flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* No time */ - if (flags & TRACE_GRAPH_PRINT_DURATION) { - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; /* Indentation */ if (depth > 0) -- cgit v1.2.3-70-g09d2 From f56e7f8efb4ec200364f690a9902713410e24d47 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:49 +0200 Subject: tracing, function: Fix trace header to follow context-info option The header display of function tracer does not follow the context-info option, so field names are displayed even if 
this option is off. Added check for TRACE_ITER_CONTEXT_INFO trace_flags. With following commands: # echo function > ./current_tracer # echo 0 > options/context-info # cat trace This is what it looked like before: # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | add_preempt_count <-schedule rcu_note_context_switch <-schedule ... This is what it looks like now: # tracer: function # _raw_spin_unlock_irqrestore <-hrtimer_try_to_cancel ... Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-4-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42fdf3adff3..cf22b4bf989 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2053,6 +2053,9 @@ void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) -- cgit v1.2.3-70-g09d2 From 199abfab40389963b397c2982222e68ea782b2cf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:50 +0200 Subject: tracing, function_graph: Remove lock-depth from latency trace The lock_depth was removed in commit e6e1e25 tracing: Remove lock_depth from event entry Removing the lock_depth info from function_graph latency header. With following commands: # echo function_graph > ./current_tracer # echo 1 > options/latency-format # cat trace This is what it looked like before: # tracer: function_graph # # function_graph latency trace v1.1.5 on 3.0.0-rc1-tip+ # -------------------------------------------------------------------- # latency: 0 us, #59756/311298, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> lock-depth # |||| / # CPU||||| DURATION FUNCTION CALLS # | ||||| | | | | | | 0) .... 0.068 us | } /* __rcu_read_unlock */ ... This is what it looks like now: # tracer: function_graph # # function_graph latency trace v1.1.5 on 3.0.0-rc1-tip+ # -------------------------------------------------------------------- # latency: 0 us, #59747/1744610, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # CPU|||| DURATION FUNCTION CALLS # | |||| | | | | | | 0) ..s. 1.641 us | } /* __rcu_process_callbacks */ ... 
Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-5-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 44b955fd87b..1da5b97d228 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1298,8 +1298,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); - seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); - seq_printf(s, "#%.*s|||| / \n", size, spaces); + seq_printf(s, "#%.*s||| / \n", size, spaces); } static void __print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1318,7 +1317,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " TASK/PID "); if (lat) - seq_printf(s, "|||||"); + seq_printf(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); @@ -1332,7 +1331,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " | | "); if (lat) - seq_printf(s, "|||||"); + seq_printf(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); -- cgit v1.2.3-70-g09d2 From 749230b06a753a22f6ed96e5dd60815d6ab12865 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 3 Jun 2011 16:58:51 +0200 Subject: tracing, function_graph: Add context-info support for function_graph tracer The function_graph tracer does not follow global context-info option. Adding TRACE_ITER_CONTEXT_INFO trace_flags check to enable it. With following commands: # echo function_graph > ./current_tracer # echo 0 > options/context-info # cat trace This is what it looked like before: # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 1) 0.079 us | } /* __vma_link_rb */ 1) 0.056 us | copy_page_range(); 1) | security_vm_enough_memory() { ... This is what it looks like now: # tracer: function_graph # } /* update_ts_time_stats */ timekeeping_max_deferment(); ... 
Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/r/1307113131-10045-6-git-send-email-jolsa@redhat.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 53 +++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 1da5b97d228..e8d6bb55d71 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -613,28 +613,30 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; - /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } - /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } - /* Proc */ - if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } } /* No overhead */ @@ -710,8 +712,9 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, { int ret = -1; - if (!(flags & TRACE_GRAPH_PRINT_DURATION)) - return TRACE_TYPE_HANDLED; + if (!(flags & TRACE_GRAPH_PRINT_DURATION) || + !(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return TRACE_TYPE_HANDLED; /* No real adata, just filling the column with spaces */ switch (duration) { @@ -879,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return 0; + /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); @@ -1346,6 +1352,9 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) { struct trace_iterator *iter = s->private; + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + if (trace_flags & TRACE_ITER_LATENCY_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) -- cgit v1.2.3-70-g09d2 From 22fe9b54d859e53bfbbbdc1a0a77a82bc453927c Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Tue, 7 Jun 2011 21:58:27 +0200 Subject: tracing: Convert to kstrtoul_from_user This patch replaces the code for getting an unsigned long from a userspace buffer by a simple call to kstroul_from_user. This makes it easier to read and less error prone. 
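For illustration, here is a minimal, editor-added sketch of the pattern this conversion produces (the handler and variable names are hypothetical, not taken from the patch): the open-coded bounds check, copy_from_user() and strict_strtoul() sequence collapses into a single kstrtoul_from_user() call.

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static unsigned long example_val;

static ssize_t
example_write(struct file *filp, const char __user *ubuf,
	      size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/* parse a decimal value directly from the user buffer */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	example_val = val;
	*ppos += cnt;

	return cnt;
}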
Signed-off-by: Peter Huewe Link: http://lkml.kernel.org/r/1307476707-14762-1-git-send-email-peterhuewe@gmx.de Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 13 ++------- kernel/trace/ring_buffer.c | 13 ++------- kernel/trace/trace.c | 65 +++++++-------------------------------------- kernel/trace/trace_events.c | 26 +++--------------- kernel/trace/trace_stack.c | 13 ++------- 5 files changed, 20 insertions(+), 110 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e1538071660..458018a1ac9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -803,19 +803,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; /* big enough to hold a number */ int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; val = !!val; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 725153d6cf7..f00ede314eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3980,20 +3980,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *p = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cf22b4bf989..c977018e87c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2706,20 +2706,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; val = !!val; @@ -3006,20 +2997,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *ptr = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; *ptr = val * 1000; @@ -3474,19 +3456,10 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; /* must have at least 1 entry */ @@ -4139,19 +4112,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, { struct trace_option_dentry *topt = filp->private_data; unsigned long val; - char buf[64]; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; 
- - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val != 0 && val != 1) @@ -4199,20 +4163,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { long index = (long)filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; if (val != 0 && val != 1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec399f2a..4d7e1498ae9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -486,20 +486,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64]; unsigned long val; int ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; ret = tracing_update_buffers(); @@ -571,19 +562,10 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, { const char *system = filp->private_data; unsigned long val; - char buf[64]; ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) return ret; ret = tracing_update_buffers(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b0b53b8e4c2..77575b386d9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, { long *ptr = filp->private_data; unsigned long val, flags; - char buf[64]; int ret; int cpu; - if (count >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) + ret = kstrtoul_from_user(ubuf, count, 10, &val); + if (ret) return ret; local_irq_save(flags); -- cgit v1.2.3-70-g09d2 From d7ec4bfed6c97405c6417970ba06c439e08ab8e3 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Tue, 7 Jun 2011 17:01:42 -0700 Subject: ring-buffer: Set __GFP_NORETRY flag for ring buffer allocating process The tracing ring buffer is allocated from kernel memory. While allocating a large chunk of memory, OOM might happen which destabilizes the system. Thus random processes might get killed during the allocation. This patch adds __GFP_NORETRY flag to the ring buffer allocation calls to make it fail more gracefully if the system will not be able to complete the allocation request. 
Acked-by: David Rientjes Signed-off-by: Vaibhav Nagarnaik Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Michael Rubin Cc: David Sharp Link: http://lkml.kernel.org/r/1307491302-9236-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f00ede314eb..731201bf4ac 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1004,9 +1004,14 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { struct page *page; - + /* + * __GFP_NORETRY flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is + * not destabilized. + */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; @@ -1015,7 +1020,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, &pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - GFP_KERNEL, 0); + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -1377,13 +1382,20 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { struct page *page; + /* + * __GFP_NORETRY flag makes sure that the allocation + * fails gracefully without invoking oom-killer and + * the system is not destabilized. + */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + GFP_KERNEL | __GFP_NORETRY, + cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, &pages); - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -3737,7 +3749,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) struct buffer_data_page *bpage; struct page *page; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); if (!page) return NULL; -- cgit v1.2.3-70-g09d2 From 1fd8df2c3970c9e7e4e262354154ee39e58bdd7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:34 +0900 Subject: tracing/kprobes: Fix kprobe-tracer to support stack trace Fix to support kernel stack trace correctly on kprobe-tracer. Since the execution path of kprobe-based dynamic events is different from other tracepoint-based events, normal ftrace_trace_stack() doesn't work correctly. To fix that, this introduces ftrace_trace_stack_regs() which traces stack via pt_regs instead of current stack register. e.g. 
# echo p schedule+4 > /sys/kernel/debug/tracing/kprobe_events # echo 1 > /sys/kernel/debug/tracing/options/stacktrace # echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable # head -n 20 /sys/kernel/debug/tracing/trace bash-2968 [000] 10297.050245: p_schedule_4: (schedule+0x4/0x4ca) bash-2968 [000] 10297.050247: => schedule_timeout => n_tty_read => tty_read => vfs_read => sys_read => system_call_fastpath kworker/0:1-2940 [000] 10297.050265: p_schedule_4: (schedule+0x4/0x4ca) kworker/0:1-2940 [000] 10297.050266: => worker_thread => kthread => kernel_thread_helper sshd-1132 [000] 10297.050365: p_schedule_4: (schedule+0x4/0x4ca) sshd-1132 [000] 10297.050365: => sysret_careful Note: Even with this fix, the first entry will be skipped if the probe is put on the function entry area before the frame pointer is set up (usually, that is 4 bytes (push %bp; mov %sp %bp) on x86), because stack unwinder depends on the frame pointer. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: yrl.pp-manager.tt@hitachi.com Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070934.17777.17116.stgit@fedora15 Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ++++ kernel/trace/trace.c | 34 +++++++++++++++++++++++++++++----- kernel/trace/trace.h | 9 +++++++++ kernel/trace/trace_kprobe.c | 6 ++++-- 4 files changed, 46 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 59d3ef100eb..b1e69eefc20 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -129,6 +129,10 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c977018e87c..d9c16123f6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs) +{ + ring_buffer_unlock_commit(buffer, event); + + ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); + ftrace_trace_userstack(buffer, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); + void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { @@ -1238,7 +1250,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) + int skip, int pc, struct pt_regs *regs) { struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; @@ -1257,24 +1269,36 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; trace.entries = entry->caller; - save_stack_trace(&trace); + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); } 
+void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(buffer, flags, skip, pc, regs); +} + void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, flags, skip, pc); + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); } void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc); + __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); } /** @@ -1290,7 +1314,7 @@ void trace_dump_stack(void) local_save_flags(flags); /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 742f545ae18..a3e2db70807 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -389,6 +389,9 @@ void update_max_tr_single(struct trace_array *tr, void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc); +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc, struct pt_regs *regs); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); @@ -400,6 +403,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, { } +static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, + unsigned long flags, int skip, + int pc, struct pt_regs *regs) +{ +} + static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f925c45f0af..7053ef3d73d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1397,7 +1397,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_nowake_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Kretprobe handler */ @@ -1429,7 +1430,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_nowake_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Event entry printers */ -- cgit v1.2.3-70-g09d2