From a2023556409cf7fec5d67a26f7fcfa57c5a4086d Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Tue, 2 Jun 2009 17:06:54 -0700 Subject: ring-buffer: fix bug in ring_buffer_discard_commit There's a bug in ring_buffer_discard_commit. The wrong pointer is being compared in order to check if the event can be freed from the buffer rather than discarded (i.e. marked as PAD). I noticed this when I was working on duration filtering. The bug is not deadly - it just results in lots of wasted space in the buffer. All filtered events are left in the buffer and marked as discarded, rather than being removed from the buffer to make space for other events. Unfortunately, when I fixed this bug, I got errors doing a filtered function trace. Multiple TIME_EXTEND events pile up in the buffer, and trigger the following loop overage warning in rb_iter_peek(): again: ... if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; I'm not sure what the best way is to fix this. I don't know if I should extend the loop threshold, or if I should make the test more complex (ignore TIME_EXTEND events), or just get rid of this loop check completely. Note that if I implement a workaround for this, then I see another problem from rb_advance_iter(). I haven't tracked that one down yet. In general, it seems like the case of removing filtered events has not been working properly, and so some assumptions about buffer invariant conditions need to be revisited. Here's the patch for the simple fix: Compare correct pointer for checking if an event can be freed rather than left as discarded in the buffer. 
Signed-off-by: Tim Bird LKML-Reference: <4A25BE9E.5090909@am.sony.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 16b24d49604..94530236869 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1708,7 +1708,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, bpage = cpu_buffer->tail_page; - if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { /* * This is on the tail page. It is possible that * a write could come in and move the tail page -- cgit v1.2.3-70-g09d2 From edd813bffc62a980bb4fb9b1243f31c1cce78da3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 23:00:53 -0400 Subject: ring-buffer: try to discard unneeded timestamps There are times that a race may happen that we add a timestamp in a nested write. This timestamp would just contain a zero delta and serves no purpose. Now that we have a way to discard events, this patch will try to discard the timestamp instead of just wasting the space in the ring buffer. 
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 67 ++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 94530236869..50926601a28 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1335,6 +1335,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. 
+ */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + return 1; + } + + /* could not discard */ + return 0; +} + static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) @@ -1384,10 +1416,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, /* let the caller know this was the commit */ ret = 1; } else { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - ret = 0; + /* Try to discard the event */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + ret = 0; + } } *delta = 0; @@ -1682,10 +1717,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; int cpu; /* The event is discarded regardless */ @@ -1701,24 +1732,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. 
- */ - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) - goto out; - } + if (!rb_try_to_discard(cpu_buffer, event)) + goto out; /* * The commit is still visible by the reader, so we -- cgit v1.2.3-70-g09d2 From ea05b57cc19234d8de9887c8a32c2e58e84b56ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 09:30:10 -0400 Subject: ring-buffer: discard timestamps that are at the start of the buffer Every buffer page in the ring buffer includes its own time stamp. When an event is recorded to the ring buffer with a delta time greater than what can be held in the event header, a time stamp event is created. If the created timestamp falls over to the next buffer page, it is redundant because the buffer page holds a full time stamp. This patch will try to discard the time stamp when it falls to the start of the next page. This change also fixes an issue with discarding events. If most events are discarded, timestamps will start to creep into the ring buffer. If we do not discard the timestamps then they can fill up the ring buffer over time and waste space. This change will keep time stamps from filling up over another page. If something is recorded in the buffer page, and the rest is filtered, then the time stamps can only fill up to the end of the page. 
[ Impact: prevent time stamps from filling ring buffer ] Reported-by: Tim Bird Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 50926601a28..7102d7a2fad 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -370,6 +370,9 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) +/* Max number of timestamps that can fit on a page */ +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) + int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1409,8 +1412,12 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->array[0] = *delta >> TS_SHIFT; } else { cpu_buffer->commit_page->page->time_stamp = *ts; - event->time_delta = 0; - event->array[0] = 0; + /* try to discard, since we do not need this */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } } cpu_buffer->write_stamp = *ts; /* let the caller know this was the commit */ @@ -2268,8 +2275,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - if (RB_WARN_ON(buffer, - iter->head_page == cpu_buffer->commit_page)) + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; @@ -2312,12 +2319,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) /* * We repeat when a timestamp is encountered. It is possible * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. 
The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * as one timestamp is about to be written, or from discarded + * commits. The most that we can have is the number on a single page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -2383,14 +2388,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * We repeat when a timestamp is encountered. + * We can get multiple timestamps by nested interrupts or also + * if filtering is on (discarding commits). Since discarding + * commits can be frequent we can get a lot of timestamps. + * But we limit them by not adding timestamps if they begin + * at the start of a page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) -- cgit v1.2.3-70-g09d2 From 083a63b48e4dd0a6a2d44216720076dc81ebb255 Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:28 +0800 Subject: tracing/trace_stack: fix the number of entries in the header The last entry in the stack_dump_trace is ULONG_MAX, which is not a valid entry, but max_stack_trace.nr_entries has accounted for it. So when printing the header, we should decrease it by one. Before fix, print as following, for example: Depth Size Location (53 entries) <--- should be 52 ----- ---- -------- 0) 3264 108 update_wall_time+0x4d5/0x9a0 ... 51) 80 80 syscall_call+0x7/0xb ^^^ it's correct. 
Signed-off-by: walimis LKML-Reference: <1244016090-7814-1-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1796f00524e..2d7aebd71db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries); + max_stack_trace.nr_entries - 1); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); -- cgit v1.2.3-70-g09d2 From f11b3f4e2932bfdcfc458ab8d1ece62724ceabfc Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:29 +0800 Subject: tracing/events: fix output format of kernel stack According to "events/ftrace/kernel_stack/format", output format of kernel stack should use "=>" instead of "<=". The second problem is that we shouldn't skip the first entry in the stack, although it seems to be duplicated when used in the "function" tracer, but events also use it. If we skip the first one, we will drop the topmost entry of the stack. The last problem is that if the last entry is ULONG_MAX(0xffffffff), we should drop it, otherwise it will print a NULL name line. 
before fix: sh-1072 [000] 26.957239: sched_process_fork: parent sh:1072 child sh:1073 sh-1072 [000] 26.957262: <= syscall_call <= sh-1072 [000] 26.957744: sched_switch: task sh:1072 [120] (R) ==> sh:1073 [120] sh-1072 [000] 26.957752: <= preempt_schedule <= wake_up_new_task <= do_fork <= sys_clone <= syscall_call <= After fix: sh-1075 [000] 39.791848: sched_process_fork: parent sh:1075 child sh:1076 sh-1075 [000] 39.791871: => sys_clone => syscall_call sh-1075 [000] 39.792713: sched_switch: task sh:1075 [120] (R) ==> sh:1076 [120] sh-1075 [000] 39.792722: => schedule => preempt_schedule => wake_up_new_task => do_fork => sys_clone => syscall_call Signed-off-by: walimis LKML-Reference: <1244016090-7814-2-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0fe3b223f7e..64596a57160 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -975,16 +975,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); + if (!trace_seq_puts(s, "\n")) + goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (!field->caller[i]) + if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) break; - if (i) { - if (!trace_seq_puts(s, " <= ")) - goto partial; + if (!trace_seq_puts(s, " => ")) + goto partial; - if (!seq_print_ip_sym(s, field->caller[i], flags)) - goto partial; - } + if (!seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; if (!trace_seq_puts(s, "\n")) goto partial; } -- cgit v1.2.3-70-g09d2 From 048dc50c5e7eada19ebabbad70b7966d14283d41 Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:30 +0800 Subject: tracing/events: fix output format of user stack According to "events/ftrace/user_stack/format", fix the output of user stack. 
before fix: sh-1073 [000] 31.137561: <- <0804e33c> <- <080835c1> after fix: sh-1072 [000] 37.039329: => => <0804e33c> => <080835c1> Signed-off-by: walimis LKML-Reference: <1244016090-7814-3-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 64596a57160..8dadbbbd2d5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,17 +389,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ip == ULONG_MAX || !ret) break; - if (i && ret) - ret = trace_seq_puts(s, " <- "); + if (ret) + ret = trace_seq_puts(s, " => "); if (!ip) { if (ret) ret = trace_seq_puts(s, "??"); + if (ret) + ret = trace_seq_puts(s, "\n"); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); + ret = trace_seq_puts(s, "\n"); } if (mm) @@ -1012,10 +1015,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!seq_print_userip_objs(field, s, flags)) + if (!trace_seq_putc(s, '\n')) goto partial; - if (!trace_seq_putc(s, '\n')) + if (!seq_print_userip_objs(field, s, flags)) goto partial; return TRACE_TYPE_HANDLED; -- cgit v1.2.3-70-g09d2 From 56d8bd3f0b98972312cad683947ec90b21011199 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 3 Jun 2009 14:52:03 +0100 Subject: tracing: fix multiple use of __print_flags and __print_symbolic Here is an updated patch to include the extra call to trace_seq_init() as requested. This is vs. the latest -tip tree and fixes the use of multiple __print_flags and __print_symbolic in a single tracer. 
Also tested to ensure its working now: mount.gfs2-2534 [000] 235.850587: gfs2_glock_queue: 8.7 glock 1:2 dequeue PR mount.gfs2-2534 [000] 235.850591: gfs2_demote_rq: 8.7 glock 1:0 demote EX to NL flags:DI mount.gfs2-2534 [000] 235.850591: gfs2_glock_queue: 8.7 glock 1:0 dequeue EX glock_workqueue-2529 [000] 235.850666: gfs2_glock_state_change: 8.7 glock 1:0 state EX => NL tgt:NL dmt:NL flags:lDpI glock_workqueue-2529 [000] 235.850672: gfs2_glock_put: 8.7 glock 1:0 state NL => IV flags:I Signed-off-by: Steven Whitehouse LKML-Reference: <1244037123.29604.603.camel@localhost.localdomain> Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 2 ++ kernel/trace/trace_output.c | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b5478dab579..40ede4db4d8 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -104,6 +104,7 @@ * field = (typeof(field))entry; * * p = get_cpu_var(ftrace_event_seq); + * trace_seq_init(p); * ret = trace_seq_printf(s, "\n"); * put_cpu(); * if (!ret) @@ -167,6 +168,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ field = (typeof(field))entry; \ \ p = &get_cpu_var(ftrace_event_seq); \ + trace_seq_init(p); \ ret = trace_seq_printf(s, #call ": " print); \ put_cpu(); \ if (!ret) \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8dadbbbd2d5..8afeea412e7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -223,10 +223,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, { unsigned long mask; const char *str; + const char *ret = p->buffer + p->len; int i; - trace_seq_init(p); - for (i = 0; flag_array[i].name && flags; i++) { mask = flag_array[i].mask; @@ -249,7 +248,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, trace_seq_putc(p, 0); - return p->buffer; + return ret; } EXPORT_SYMBOL(ftrace_print_flags_seq); @@ -258,8 +257,7 @@ 
ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array) { int i; - - trace_seq_init(p); + const char *ret = p->buffer + p->len; for (i = 0; symbol_array[i].name; i++) { @@ -275,7 +273,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, trace_seq_putc(p, 0); - return p->buffer; + return ret; } EXPORT_SYMBOL(ftrace_print_symbols_seq); -- cgit v1.2.3-70-g09d2 From 563af16c30ede41eda2d614195d88e07f7c7103d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 11:10:44 -0400 Subject: tracing: add annotation to what type of stack trace is recorded The current method of printing out a stack trace is to add a new line and print out the trace: yum-updatesd-3120 [002] 573.691303: => do_softirq => irq_exit => smp_apic_timer_interrupt => apic_timer_interrupt This looks a bit awkward, and if we have both stack and user stack traces running, it would be nice to have a title to tell them apart, although it is easy to tell by the output. 
This patch adds an annotation to the start of the stack traces: init-1 [003] 929.304979: <stack trace> => user_path_at => vfs_fstatat => vfs_stat => sys_newstat => system_call_fastpath cat-3459 [002] 1016.824040: <user stack trace> => <0000003aae6c0250> => <00007ffff4b06ae4> => <69636172742f6775> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8afeea412e7..425725c1622 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -976,7 +976,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_puts(s, "\n")) + if (!trace_seq_puts(s, "<stack trace>\n")) goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) @@ -1013,7 +1013,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_putc(s, '\n')) + if (!trace_seq_puts(s, "<user stack trace>\n")) goto partial; if (!seq_print_userip_objs(field, s, flags)) -- cgit v1.2.3-70-g09d2