From d8ec91850efaf6cee9234c80260fe03881242374 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 19 Aug 2009 21:13:57 +0200 Subject: tracing: Add kprobe-based event tracer documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the documentation to use the kprobe based event tracer. [fweisbec@gmail.com: Split tracer and its Documentation in two patchs] Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203510.31965.29123.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 139 ++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 Documentation/trace/kprobetrace.txt (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt new file mode 100644 index 00000000000..efff6eb1b3d --- /dev/null +++ b/Documentation/trace/kprobetrace.txt @@ -0,0 +1,139 @@ + Kprobe-based Event Tracer + ========================= + + Documentation is written by Masami Hiramatsu + + +Overview +-------- +This tracer is similar to the events tracer which is based on Tracepoint +infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe +and kretprobe). It probes anywhere where kprobes can probe(this means, all +functions body except for __kprobes functions). + +Unlike the function tracer, this tracer can probe instructions inside of +kernel functions. It allows you to check which instruction has been executed. + +Unlike the Tracepoint based events tracer, this tracer can add and remove +probe points on the fly. + +Similar to the events tracer, this tracer doesn't need to be activated via +current_tracer, instead of that, just set probe points via +/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each +probe events via /sys/kernel/debug/tracing/events/kprobes//filter. + + +Synopsis of kprobe_events +------------------------- + p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : Set a probe + r[:EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe + + EVENT : Event name. + SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted. + MEMADDR : Address where the probe is inserted. + + FETCHARGS : Arguments. + %REG : Fetch register REG + sN : Fetch Nth entry of stack (N >= 0) + sa : Fetch stack address. + @ADDR : Fetch memory at ADDR (ADDR should be in kernel) + @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) + aN : Fetch function argument. (N >= 0)(*) + rv : Fetch return value.(**) + ra : Fetch return address.(**) + +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***) + + (*) aN may not correct on asmlinkaged functions and at the middle of + function body. + (**) only for return probe. + (***) this is useful for fetching a field of data structures. + + +Per-Probe Event Filtering +------------------------- + Per-probe event filtering feature allows you to set different filter on each +probe and gives you what arguments will be shown in trace buffer. If an event +name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds +an event under tracing/events/kprobes/, at the directory you can see +'id', 'enabled', 'format' and 'filter'. + +enabled: + You can enable/disable the probe by writing 1 or 0 on it. + +format: + It shows the format of this probe event. It also shows aliases of arguments + which you specified to kprobe_events. + +filter: + You can write filtering rules of this event. And you can use both of aliase + names and field names for describing filters. + + +Usage examples +-------------- +To add a probe as a new event, write a new definition to kprobe_events +as below. + + echo p:myprobe do_sys_open a0 a1 a2 a3 > /sys/kernel/debug/tracing/kprobe_events + + This sets a kprobe on the top of do_sys_open() function with recording +1st to 4th arguments as "myprobe" event. + + echo r:myretprobe do_sys_open rv ra >> /sys/kernel/debug/tracing/kprobe_events + + This sets a kretprobe on the return point of do_sys_open() function with +recording return value and return address as "myretprobe" event. + You can see the format of these events via +/sys/kernel/debug/tracing/events/kprobes//format. + + cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format +name: myprobe +ID: 23 +format: + field:unsigned short common_type; offset:0; size:2; + field:unsigned char common_flags; offset:2; size:1; + field:unsigned char common_preempt_count; offset:3; size:1; + field:int common_pid; offset:4; size:4; + field:int common_tgid; offset:8; size:4; + + field: unsigned long ip; offset:16;tsize:8; + field: int nargs; offset:24;tsize:4; + field: unsigned long arg0; offset:32;tsize:8; + field: unsigned long arg1; offset:40;tsize:8; + field: unsigned long arg2; offset:48;tsize:8; + field: unsigned long arg3; offset:56;tsize:8; + + alias: a0; original: arg0; + alias: a1; original: arg1; + alias: a2; original: arg2; + alias: a3; original: arg3; + +print fmt: "%lx: 0x%lx 0x%lx 0x%lx 0x%lx", ip, arg0, arg1, arg2, arg3 + + + You can see that the event has 4 arguments and alias expressions +corresponding to it. + + echo > /sys/kernel/debug/tracing/kprobe_events + + This clears all probe points. and you can see the traced information via +/sys/kernel/debug/tracing/trace. + + cat /sys/kernel/debug/tracing/trace +# tracer: nop +# +# TASK-PID CPU# TIMESTAMP FUNCTION +# | | | | | + <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0 + <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a + <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6 + <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a + <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10 + <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a + + + Each line shows when the kernel hits a probe, and <- SYMBOL means kernel +returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel +returns from do_sys_open to sys_open+0x1b). + + -- cgit v1.2.3-70-g09d2 From a82378d8802717b9776a7d9b54422f65c414d6cc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:35:18 -0400 Subject: tracing: Kprobe-tracer supports more than 6 arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support up to 128 arguments to fetch for each kprobes event. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203518.31965.96979.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 2 +- kernel/trace/trace_kprobe.c | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index efff6eb1b3d..c9c09b45038 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -32,7 +32,7 @@ Synopsis of kprobe_events SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. - FETCHARGS : Arguments. + FETCHARGS : Arguments. Each probe can have up to 128 args. %REG : Fetch register REG sN : Fetch Nth entry of stack (N >= 0) sa : Fetch stack address. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0c4f00aafb9..6d488efd16b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -32,7 +32,7 @@ #include "trace.h" #include "trace_output.h" -#define TRACE_KPROBE_ARGS 6 +#define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 /* currently, trace_kprobe only supports X86. */ @@ -184,11 +184,15 @@ struct trace_probe { struct kretprobe rp; }; const char *symbol; /* symbol name */ - unsigned int nr_args; - struct fetch_func args[TRACE_KPROBE_ARGS]; struct ftrace_event_call call; + unsigned int nr_args; + struct fetch_func args[]; }; +#define SIZEOF_TRACE_PROBE(n) \ + (offsetof(struct trace_probe, args) + \ + (sizeof(struct fetch_func) * (n))) + static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_trace_func(struct kretprobe_instance *ri, struct pt_regs *regs); @@ -263,11 +267,11 @@ static DEFINE_MUTEX(probe_lock); static LIST_HEAD(probe_list); static struct trace_probe *alloc_trace_probe(const char *symbol, - const char *event) + const char *event, int nargs) { struct trace_probe *tp; - tp = kzalloc(sizeof(struct trace_probe), GFP_KERNEL); + tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); if (!tp) return ERR_PTR(-ENOMEM); @@ -573,9 +577,10 @@ static int create_trace_probe(int argc, char **argv) if (offset && is_return) return -EINVAL; } + argc -= 2; argv += 2; /* setup a probe */ - tp = alloc_trace_probe(symbol, event); + tp = alloc_trace_probe(symbol, event, argc); if (IS_ERR(tp)) return PTR_ERR(tp); @@ -594,8 +599,8 @@ static int create_trace_probe(int argc, char **argv) kp->addr = addr; /* parse arguments */ - argc -= 2; argv += 2; ret = 0; - for (i = 0; i < argc && i < TRACE_KPROBE_ARGS; i++) { + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { if (strlen(argv[i]) > MAX_ARGSTR_LEN) { pr_info("Argument%d(%s) is too long.\n", i, argv[i]); ret = -ENOSPC; -- cgit v1.2.3-70-g09d2 From 4263565d491145b57621a761714f2ca6f1293a45 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:35:26 -0400 Subject: tracing: Generate names for each kprobe event automatically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generate names for each kprobe event based on the probe point. (SYMBOL+offs or MEMADDR). Also remove generic k*probe event types because there is no user of those types. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203526.31965.56672.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 3 +- kernel/trace/trace_event_types.h | 18 ----------- kernel/trace/trace_kprobe.c | 64 +++++++++++++++++++------------------ 3 files changed, 35 insertions(+), 50 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index c9c09b45038..5e59e854e71 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -28,7 +28,8 @@ Synopsis of kprobe_events p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : Set a probe r[:EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe - EVENT : Event name. + EVENT : Event name. If omitted, the event name is generated + based on SYMBOL+offs or MEMADDR. SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 186b598a1f1..e74f0906ab1 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -175,22 +175,4 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ); -TRACE_EVENT_FORMAT(kprobe, TRACE_KPROBE, kprobe_trace_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(int, nargs, nargs) - TRACE_FIELD_ZERO(unsigned long, args) - ), - TP_RAW_FMT("%08lx: args:0x%lx ...") -); - -TRACE_EVENT_FORMAT(kretprobe, TRACE_KRETPROBE, kretprobe_trace_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, func, func) - TRACE_FIELD(unsigned long, ret_ip, ret_ip) - TRACE_FIELD(int, nargs, nargs) - TRACE_FIELD_ZERO(unsigned long, args) - ), - TP_RAW_FMT("%08lx <- %08lx: args:0x%lx ...") -); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6d488efd16b..8aeb24cc295 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -34,6 +34,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 /* currently, trace_kprobe only supports X86. */ @@ -280,11 +281,11 @@ static struct trace_probe *alloc_trace_probe(const char *symbol, if (!tp->symbol) goto error; } - if (event) { - tp->call.name = kstrdup(event, GFP_KERNEL); - if (!tp->call.name) - goto error; - } + if (!event) + goto error; + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + goto error; INIT_LIST_HEAD(&tp->list); return tp; @@ -314,7 +315,7 @@ static struct trace_probe *find_probe_event(const char *event) struct trace_probe *tp; list_for_each_entry(tp, &probe_list, list) - if (tp->call.name && !strcmp(tp->call.name, event)) + if (!strcmp(tp->call.name, event)) return tp; return NULL; } @@ -330,8 +331,7 @@ static void __unregister_trace_probe(struct trace_probe *tp) /* Unregister a trace_probe and probe_event: call with locking probe_lock */ static void unregister_trace_probe(struct trace_probe *tp) { - if (tp->call.name) - unregister_probe_event(tp); + unregister_probe_event(tp); __unregister_trace_probe(tp); list_del(&tp->list); } @@ -360,18 +360,16 @@ static int register_trace_probe(struct trace_probe *tp) goto end; } /* register as an event */ - if (tp->call.name) { - old_tp = find_probe_event(tp->call.name); - if (old_tp) { - /* delete old event */ - unregister_trace_probe(old_tp); - free_trace_probe(old_tp); - } - ret = register_probe_event(tp); - if (ret) { - pr_warning("Faild to register probe event(%d)\n", ret); - __unregister_trace_probe(tp); - } + old_tp = find_probe_event(tp->call.name); + if (old_tp) { + /* delete old event */ + unregister_trace_probe(old_tp); + free_trace_probe(old_tp); + } + ret = register_probe_event(tp); + if (ret) { + pr_warning("Faild to register probe event(%d)\n", ret); + __unregister_trace_probe(tp); } list_add_tail(&tp->list, &probe_list); end: @@ -580,7 +578,18 @@ static int create_trace_probe(int argc, char **argv) argc -= 2; argv += 2; /* setup a probe */ - tp = alloc_trace_probe(symbol, event, argc); + if (!event) { + /* Make a new event name */ + char buf[MAX_EVENT_NAME_LEN]; + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", + is_return ? 'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", + is_return ? 'r' : 'p', addr); + tp = alloc_trace_probe(symbol, buf, argc); + } else + tp = alloc_trace_probe(symbol, event, argc); if (IS_ERR(tp)) return PTR_ERR(tp); @@ -661,8 +670,7 @@ static int probes_seq_show(struct seq_file *m, void *v) char buf[MAX_ARGSTR_LEN + 1]; seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); - if (tp->call.name) - seq_printf(m, ":%s", tp->call.name); + seq_printf(m, ":%s", tp->call.name); if (tp->symbol) seq_printf(m, " %s%+ld", probe_symbol(tp), probe_offset(tp)); @@ -780,10 +788,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) struct ring_buffer_event *event; int size, i, pc; unsigned long irq_flags; - struct ftrace_event_call *call = &event_kprobe; - - if (&tp->call.name) - call = &tp->call; + struct ftrace_event_call *call = &tp->call; local_save_flags(irq_flags); pc = preempt_count(); @@ -815,10 +820,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, struct ring_buffer_event *event; int size, i, pc; unsigned long irq_flags; - struct ftrace_event_call *call = &event_kretprobe; - - if (&tp->call.name) - call = &tp->call; + struct ftrace_event_call *call = &tp->call; local_save_flags(irq_flags); pc = preempt_count(); -- cgit v1.2.3-70-g09d2 From cd7e7bd5e44718c7625ce1e1f0fda53d77cd3797 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:35:42 -0400 Subject: tracing: Add kprobes event profiling interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add profiling interfaces for each kprobes event. This interface provides how many times each probe hit or missed. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203541.31965.8452.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 8 +++++++ kernel/trace/trace_kprobe.c | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 5e59e854e71..3de75174716 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -70,6 +70,14 @@ filter: names and field names for describing filters. +Event Profiling +--------------- + You can check the total number of probe hits and probe miss-hits via +/sys/kernel/debug/tracing/kprobe_profile. + The first column is event name, the second is the number of probe hits, +the third is the number of probe miss-hits. + + Usage examples -------------- To add a probe as a new event, write a new definition to kprobe_events diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9c067bf47d5..ce68197767d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -184,6 +184,7 @@ struct trace_probe { struct kprobe kp; struct kretprobe rp; }; + unsigned long nhit; const char *symbol; /* symbol name */ struct ftrace_event_call call; struct trace_event event; @@ -781,6 +782,37 @@ static const struct file_operations kprobe_events_ops = { .write = probes_write, }; +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + + seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, + probe_is_return(tp) ? tp->rp.kp.nmissed : tp->kp.nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + /* Kprobe handler */ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { @@ -791,6 +823,8 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; + tp->nhit++; + local_save_flags(irq_flags); pc = preempt_count(); @@ -1140,9 +1174,18 @@ static __init int init_kprobe_trace(void) entry = debugfs_create_file("kprobe_events", 0644, d_tracer, NULL, &kprobe_events_ops); + /* Event list interface */ if (!entry) pr_warning("Could not create debugfs " "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_profile' entry\n"); return 0; } fs_initcall(init_kprobe_trace); -- cgit v1.2.3-70-g09d2 From 2fba0c8867af47f6455490e7b59e512dd180c027 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2009 19:53:14 -0400 Subject: tracing/kprobes: Fix probe offset to be unsigned Prohibit user to specify negative offset from symbols. Since kprobe.offset is unsigned int, the offset must be always positive value. Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <20090910235314.22412.64631.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 14 +++++++------- kernel/trace/trace_kprobe.c | 19 +++++++------------ 2 files changed, 14 insertions(+), 19 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 3de75174716..db553186564 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -25,15 +25,15 @@ probe events via /sys/kernel/debug/tracing/events/kprobes//filter. Synopsis of kprobe_events ------------------------- - p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : Set a probe - r[:EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe + p[:EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe + r[:EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe - EVENT : Event name. If omitted, the event name is generated - based on SYMBOL+offs or MEMADDR. - SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted. - MEMADDR : Address where the probe is inserted. + EVENT : Event name. If omitted, the event name is generated + based on SYMBOL+offs or MEMADDR. + SYMBOL[+offs] : Symbol+offset where the probe is inserted. + MEMADDR : Address where the probe is inserted. - FETCHARGS : Arguments. Each probe can have up to 128 args. + FETCHARGS : Arguments. Each probe can have up to 128 args. %REG : Fetch register REG sN : Fetch Nth entry of stack (N >= 0) sa : Fetch stack address. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 19a6de63b44..c24b7e9d97c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -210,7 +210,7 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } -static __kprobes long probe_offset(struct trace_probe *tp) +static __kprobes unsigned int probe_offset(struct trace_probe *tp) { return (probe_is_return(tp)) ? tp->rp.kp.offset : tp->kp.offset; } @@ -380,7 +380,7 @@ end: } /* Split symbol and offset. */ -static int split_symbol_offset(char *symbol, long *offset) +static int split_symbol_offset(char *symbol, unsigned long *offset) { char *tmp; int ret; @@ -389,16 +389,11 @@ static int split_symbol_offset(char *symbol, long *offset) return -EINVAL; tmp = strchr(symbol, '+'); - if (!tmp) - tmp = strchr(symbol, '-'); - if (tmp) { /* skip sign because strict_strtol doesn't accept '+' */ - ret = strict_strtol(tmp + 1, 0, offset); + ret = strict_strtoul(tmp + 1, 0, offset); if (ret) return ret; - if (*tmp == '-') - *offset = -(*offset); *tmp = '\0'; } else *offset = 0; @@ -520,7 +515,7 @@ static int create_trace_probe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:EVENT] SYMBOL[+OFFS|-OFFS]|ADDRESS [FETCHARGS] + * - Add kprobe: p[:EVENT] SYMBOL[+OFFS]|ADDRESS [FETCHARGS] * - Add kretprobe: r[:EVENT] SYMBOL[+0] [FETCHARGS] * Fetch args: * aN : fetch Nth of function argument. (N:0-) @@ -539,7 +534,7 @@ static int create_trace_probe(int argc, char **argv) int i, ret = 0; int is_return = 0; char *symbol = NULL, *event = NULL; - long offset = 0; + unsigned long offset = 0; void *addr = NULL; if (argc < 2) @@ -605,7 +600,7 @@ static int create_trace_probe(int argc, char **argv) if (tp->symbol) { kp->symbol_name = tp->symbol; - kp->offset = offset; + kp->offset = (unsigned int)offset; } else kp->addr = addr; @@ -675,7 +670,7 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, ":%s", tp->call.name); if (tp->symbol) - seq_printf(m, " %s%+ld", probe_symbol(tp), probe_offset(tp)); + seq_printf(m, " %s+%u", probe_symbol(tp), probe_offset(tp)); else seq_printf(m, " 0x%p", probe_address(tp)); -- cgit v1.2.3-70-g09d2 From e08d1c657f70bcaca11401cd6ac5c8fe59bd2bb7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2009 19:53:30 -0400 Subject: tracing/kprobes: Add event profiling support Add *probe_profile_enable/disable to support kprobes raw events sampling from perf counters, like other ftrace events, when CONFIG_PROFILE_EVENT=y. Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <20090910235329.22412.94731.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 4 +- kernel/trace/trace_kprobe.c | 110 +++++++++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index db553186564..8f882ebd136 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -62,13 +62,15 @@ enabled: You can enable/disable the probe by writing 1 or 0 on it. format: - It shows the format of this probe event. It also shows aliases of arguments + This shows the format of this probe event. It also shows aliases of arguments which you specified to kprobe_events. filter: You can write filtering rules of this event. And you can use both of aliase names and field names for describing filters. +id: + This shows the id of this probe event. Event Profiling --------------- diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4ce728ca1b1..730e992d28d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -280,6 +281,7 @@ static struct trace_probe *alloc_trace_probe(const char *event, } else tp->rp.kp.addr = addr; + /* Set handler here for checking whether this probe is return or not. */ if (is_return) tp->rp.handler = kretprobe_trace_func; else @@ -929,10 +931,13 @@ static int probe_event_enable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; - if (probe_is_return(tp)) + if (probe_is_return(tp)) { + tp->rp.handler = kretprobe_trace_func; return enable_kretprobe(&tp->rp); - else + } else { + tp->rp.kp.pre_handler = kprobe_trace_func; return enable_kprobe(&tp->rp.kp); + } } static void probe_event_disable(struct ftrace_event_call *call) @@ -1105,6 +1110,101 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, "func, ret_ip"); } +#ifdef CONFIG_EVENT_PROFILE + +/* Kprobe profile handler */ +static __kprobes int kprobe_profile_func(struct kprobe *kp, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct ftrace_event_call *call = &tp->call; + struct kprobe_trace_entry *entry; + int size, i, pc; + unsigned long irq_flags; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + + do { + char raw_data[size]; + struct trace_entry *ent; + + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i], regs); + perf_tpcounter_event(call->id, entry->ip, 1, entry, size); + } while (0); + return 0; +} + +/* Kretprobe profile handler */ +static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct ftrace_event_call *call = &tp->call; + struct kretprobe_trace_entry *entry; + int size, i, pc; + unsigned long irq_flags; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + + do { + char raw_data[size]; + struct trace_entry *ent; + + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kretprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i], regs); + perf_tpcounter_event(call->id, entry->ret_ip, 1, entry, size); + } while (0); + return 0; +} + +static int probe_profile_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + if (atomic_inc_return(&call->profile_count)) + return 0; + + if (probe_is_return(tp)) { + tp->rp.handler = kretprobe_profile_func; + return enable_kretprobe(&tp->rp); + } else { + tp->rp.kp.pre_handler = kprobe_profile_func; + return enable_kprobe(&tp->rp.kp); + } +} + +static void probe_profile_disable(struct ftrace_event_call *call) +{ + if (atomic_add_negative(-1, &call->profile_count)) + probe_event_disable(call); +} + +#endif /* CONFIG_EVENT_PROFILE */ + static int register_probe_event(struct trace_probe *tp) { struct ftrace_event_call *call = &tp->call; @@ -1130,6 +1230,12 @@ static int register_probe_event(struct trace_probe *tp) call->enabled = 1; call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; + +#ifdef CONFIG_EVENT_PROFILE + atomic_set(&call->profile_count, -1); + call->profile_enable = probe_profile_enable; + call->profile_disable = probe_profile_disable; +#endif call->data = tp; ret = trace_add_event_call(call); if (ret) { -- cgit v1.2.3-70-g09d2 From eca0d916f6429785bbc88db3ff66631cde62b432 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2009 19:53:38 -0400 Subject: tracing/kprobes: Add argument name support Add argument name assignment support and remove "alias" lines from format. This allows user to assign unique name to each argument. For example, $ echo p do_sys_open dfd=a0 filename=a1 flags=a2 mode=a3 > kprobe_events This assigns dfd, filename, flags, and mode to 1st - 4th arguments respectively. Trace buffer shows those names too. <...>-1439 [000] 1200885.933147: do_sys_open+0x0/0xdf: dfd=ffffff9c filename=bfa898ac flags=8000 mode=0 This helps users to know what each value means. Users can filter each events by these names too. Note that you can not filter by argN anymore. Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <20090910235337.22412.77383.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 46 ++++++------- kernel/trace/trace_kprobe.c | 128 ++++++++++++++++++------------------ 2 files changed, 84 insertions(+), 90 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 8f882ebd136..aaa6c1067c7 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -42,7 +42,8 @@ Synopsis of kprobe_events aN : Fetch function argument. (N >= 0)(*) rv : Fetch return value.(**) ra : Fetch return address.(**) - +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***) + +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***) + NAME=FETCHARG: Set NAME as the argument name of FETCHARG. (*) aN may not correct on asmlinkaged functions and at the middle of function body. @@ -62,12 +63,10 @@ enabled: You can enable/disable the probe by writing 1 or 0 on it. format: - This shows the format of this probe event. It also shows aliases of arguments - which you specified to kprobe_events. + This shows the format of this probe event. filter: - You can write filtering rules of this event. And you can use both of aliase - names and field names for describing filters. + You can write filtering rules of this event. id: This shows the id of this probe event. @@ -85,10 +84,11 @@ Usage examples To add a probe as a new event, write a new definition to kprobe_events as below. - echo p:myprobe do_sys_open a0 a1 a2 a3 > /sys/kernel/debug/tracing/kprobe_events + echo p:myprobe do_sys_open dfd=a0 filename=a1 flags=a2 mode=a3 > /sys/kernel/debug/tracing/kprobe_events This sets a kprobe on the top of do_sys_open() function with recording -1st to 4th arguments as "myprobe" event. +1st to 4th arguments as "myprobe" event. As this example shows, users can +choose more familiar names for each arguments. echo r:myretprobe do_sys_open rv ra >> /sys/kernel/debug/tracing/kprobe_events @@ -99,7 +99,7 @@ recording return value and return address as "myretprobe" event. cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format name: myprobe -ID: 23 +ID: 75 format: field:unsigned short common_type; offset:0; size:2; field:unsigned char common_flags; offset:2; size:1; @@ -109,21 +109,15 @@ format: field: unsigned long ip; offset:16;tsize:8; field: int nargs; offset:24;tsize:4; - field: unsigned long arg0; offset:32;tsize:8; - field: unsigned long arg1; offset:40;tsize:8; - field: unsigned long arg2; offset:48;tsize:8; - field: unsigned long arg3; offset:56;tsize:8; + field: unsigned long dfd; offset:32;tsize:8; + field: unsigned long filename; offset:40;tsize:8; + field: unsigned long flags; offset:48;tsize:8; + field: unsigned long mode; offset:56;tsize:8; - alias: a0; original: arg0; - alias: a1; original: arg1; - alias: a2; original: arg2; - alias: a3; original: arg3; +print fmt: "%lx: dfd=%lx filename=%lx flags=%lx mode=%lx", ip, REC->dfd, REC->filename, REC->flags, REC->mode -print fmt: "%lx: 0x%lx 0x%lx 0x%lx 0x%lx", ip, arg0, arg1, arg2, arg3 - - You can see that the event has 4 arguments and alias expressions -corresponding to it. + You can see that the event has 4 arguments as in the expressions you specified. echo > /sys/kernel/debug/tracing/kprobe_events @@ -135,12 +129,12 @@ corresponding to it. # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | - <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0 - <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a - <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6 - <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a - <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10 - <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a + <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: dfd=3 filename=7fffd1ec4440 flags=8000 mode=0 + <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: rv=fffffffffffffffe ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: dfd=ffffff9c filename=40413c flags=8000 mode=1b6 + <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: rv=3 ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: dfd=ffffff9c filename=4041c6 flags=98800 mode=10 + <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: rv=3 ra=ffffffff81367a3a Each line shows when the kernel hits a probe, and <- SYMBOL means kernel diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 730e992d28d..44dad1aa95d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -176,9 +176,14 @@ static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) } /** - * kprobe_trace_core + * Kprobe tracer core functions */ +struct probe_arg { + struct fetch_func fetch; + const char *name; +}; + struct trace_probe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ @@ -187,12 +192,12 @@ struct trace_probe { struct ftrace_event_call call; struct trace_event event; unsigned int nr_args; - struct fetch_func args[]; + struct probe_arg args[]; }; #define SIZEOF_TRACE_PROBE(n) \ (offsetof(struct trace_probe, args) + \ - (sizeof(struct fetch_func) * (n))) + (sizeof(struct probe_arg) * (n))) static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_trace_func(struct kretprobe_instance *ri, @@ -301,15 +306,21 @@ error: return ERR_PTR(-ENOMEM); } +static void free_probe_arg(struct probe_arg *arg) +{ + if (arg->fetch.func == fetch_symbol) + free_symbol_cache(arg->fetch.data); + else if (arg->fetch.func == fetch_indirect) + free_indirect_fetch_data(arg->fetch.data); + kfree(arg->name); +} + static void free_trace_probe(struct trace_probe *tp) { int i; for (i = 0; i < tp->nr_args; i++) - if (tp->args[i].func == fetch_symbol) - free_symbol_cache(tp->args[i].data); - else if (tp->args[i].func == fetch_indirect) - free_indirect_fetch_data(tp->args[i].data); + free_probe_arg(&tp->args[i]); kfree(tp->call.name); kfree(tp->symbol); @@ -532,11 +543,13 @@ static int create_trace_probe(int argc, char **argv) * %REG : fetch register REG * Indirect memory fetch: * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. */ struct trace_probe *tp; int i, ret = 0; int is_return = 0; - char *symbol = NULL, *event = NULL; + char *symbol = NULL, *event = NULL, *arg = NULL; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -596,12 +609,21 @@ static int create_trace_probe(int argc, char **argv) /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - if (strlen(argv[i]) > MAX_ARGSTR_LEN) { - pr_info("Argument%d(%s) is too long.\n", i, argv[i]); + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) + *arg++ = '\0'; + else + arg = argv[i]; + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + + /* Parse fetch argument */ + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument%d(%s) is too long.\n", i, arg); ret = -ENOSPC; goto error; } - ret = parse_probe_arg(argv[i], &tp->args[i], is_return); + ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); if (ret) goto error; } @@ -664,12 +686,12 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, " 0x%p", tp->rp.kp.addr); for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]); + ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); if (ret < 0) { pr_warning("Argument%d decoding error(%d).\n", i, ret); return ret; } - seq_printf(m, " %s", buf); + seq_printf(m, " %s=%s", tp->args[i].name, buf); } seq_printf(m, "\n"); return 0; @@ -824,7 +846,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i], regs); + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -858,7 +880,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i], regs); + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -872,9 +894,13 @@ print_kprobe_event(struct trace_iterator *iter, int flags) { struct kprobe_trace_entry *field; struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; int i; field = (struct kprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) goto partial; @@ -883,7 +909,8 @@ print_kprobe_event(struct trace_iterator *iter, int flags) goto partial; for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " 0x%lx", field->args[i])) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) goto partial; if (!trace_seq_puts(s, "\n")) @@ -899,9 +926,13 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) { struct kretprobe_trace_entry *field; struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; int i; field = (struct kretprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) goto partial; @@ -916,7 +947,8 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) goto partial; for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " 0x%lx", field->args[i])) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) goto partial; if (!trace_seq_puts(s, "\n")) @@ -972,7 +1004,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kprobe_trace_entry field; - char buf[MAX_ARGSTR_LEN + 1]; struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); @@ -981,16 +1012,9 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ip, "ip", 0); DEFINE_FIELD(int, nargs, "nargs", 1); - for (i = 0; i < tp->nr_args; i++) { - /* Set argN as a field */ - sprintf(buf, "arg%d", i); - DEFINE_FIELD(unsigned long, args[i], buf, 0); - /* Set argument string as an alias field */ - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]); - if (ret < 0) - return ret; - DEFINE_FIELD(unsigned long, args[i], buf, 0); - } + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); return 0; } @@ -998,7 +1022,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kretprobe_trace_entry field; - char buf[MAX_ARGSTR_LEN + 1]; struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); @@ -1008,16 +1031,9 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, func, "func", 0); DEFINE_FIELD(unsigned long, ret_ip, "ret_ip", 0); DEFINE_FIELD(int, nargs, "nargs", 1); - for (i = 0; i < tp->nr_args; i++) { - /* Set argN as a field */ - sprintf(buf, "arg%d", i); - DEFINE_FIELD(unsigned long, args[i], buf, 0); - /* Set argument string as an alias field */ - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]); - if (ret < 0) - return ret; - DEFINE_FIELD(unsigned long, args[i], buf, 0); - } + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); return 0; } @@ -1025,31 +1041,21 @@ static int __probe_event_show_format(struct trace_seq *s, struct trace_probe *tp, const char *fmt, const char *arg) { - int i, ret; - char buf[MAX_ARGSTR_LEN + 1]; + int i; - /* Show aliases */ - for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]); - if (ret < 0) - return ret; - if (!trace_seq_printf(s, "\talias: %s;\toriginal: arg%d;\n", - buf, i)) - return 0; - } /* Show format */ if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) return 0; for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_puts(s, " 0x%lx")) + if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) return 0; if (!trace_seq_printf(s, "\", %s", arg)) return 0; for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, ", arg%d", i)) + if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) return 0; return trace_seq_puts(s, "\n"); @@ -1071,17 +1077,14 @@ static int kprobe_event_show_format(struct ftrace_event_call *call, { struct kprobe_trace_entry field __attribute__((unused)); int ret, i; - char buf[8]; struct trace_probe *tp = (struct trace_probe *)call->data; SHOW_FIELD(unsigned long, ip, "ip"); SHOW_FIELD(int, nargs, "nargs"); /* Show fields */ - for (i = 0; i < tp->nr_args; i++) { - sprintf(buf, "arg%d", i); - SHOW_FIELD(unsigned long, args[i], buf); - } + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); trace_seq_puts(s, "\n"); return __probe_event_show_format(s, tp, "%lx:", "ip"); @@ -1092,7 +1095,6 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, { struct kretprobe_trace_entry field __attribute__((unused)); int ret, i; - char buf[8]; struct trace_probe *tp = (struct trace_probe *)call->data; SHOW_FIELD(unsigned long, func, "func"); @@ -1100,10 +1102,8 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, SHOW_FIELD(int, nargs, "nargs"); /* Show fields */ - for (i = 0; i < tp->nr_args; i++) { - sprintf(buf, "arg%d", i); - SHOW_FIELD(unsigned long, args[i], buf); - } + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); trace_seq_puts(s, "\n"); return __probe_event_show_format(s, tp, "%lx <- %lx:", @@ -1140,7 +1140,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i], regs); + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); perf_tpcounter_event(call->id, entry->ip, 1, entry, size); } while (0); return 0; @@ -1175,7 +1175,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i], regs); + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); perf_tpcounter_event(call->id, entry->ret_ip, 1, entry, size); } while (0); return 0; -- cgit v1.2.3-70-g09d2 From 6e9f23d1619f7badaf9090dac09e86a22d6061d8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2009 19:53:45 -0400 Subject: tracing/kprobes: Show event name in trace output Show event name in tracing/trace output. This also fixes kprobes events format to comply with other tracepoint events formats. Before patching: <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: ... <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: ... After patching: <...>-1447 [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) ... <...>-1447 [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) ... Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <20090910235345.22412.76527.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 16 ++++++++-------- kernel/trace/trace_kprobe.c | 16 +++++++++++----- 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index aaa6c1067c7..a849889e609 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -114,7 +114,7 @@ format: field: unsigned long flags; offset:48;tsize:8; field: unsigned long mode; offset:56;tsize:8; -print fmt: "%lx: dfd=%lx filename=%lx flags=%lx mode=%lx", ip, REC->dfd, REC->filename, REC->flags, REC->mode +print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode You can see that the event has 4 arguments as in the expressions you specified. @@ -129,15 +129,15 @@ print fmt: "%lx: dfd=%lx filename=%lx flags=%lx mode=%lx", ip, REC->dfd, REC->fi # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | - <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: dfd=3 filename=7fffd1ec4440 flags=8000 mode=0 - <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: rv=fffffffffffffffe ra=ffffffff81367a3a - <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: dfd=ffffff9c filename=40413c flags=8000 mode=1b6 - <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: rv=3 ra=ffffffff81367a3a - <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: dfd=ffffff9c filename=4041c6 flags=98800 mode=10 - <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: rv=3 ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0 + <...>-1447 [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) rv=fffffffffffffffe ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6 + <...>-1447 [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) rv=3 ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10 + <...>-1447 [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) rv=3 ra=ffffffff81367a3a - Each line shows when the kernel hits a probe, and <- SYMBOL means kernel + Each line shows when the kernel hits an event, and <- SYMBOL means kernel returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel returns from do_sys_open to sys_open+0x1b). diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 44dad1aa95d..1746afeaabf 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -902,10 +902,13 @@ print_kprobe_event(struct trace_iterator *iter, int flags) event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) goto partial; - if (!trace_seq_puts(s, ":")) + if (!trace_seq_puts(s, ")")) goto partial; for (i = 0; i < field->nargs; i++) @@ -934,6 +937,9 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) goto partial; @@ -943,7 +949,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) goto partial; - if (!trace_seq_puts(s, ":")) + if (!trace_seq_puts(s, ")")) goto partial; for (i = 0; i < field->nargs; i++) @@ -1087,7 +1093,7 @@ static int kprobe_event_show_format(struct ftrace_event_call *call, SHOW_FIELD(unsigned long, args[i], tp->args[i].name); trace_seq_puts(s, "\n"); - return __probe_event_show_format(s, tp, "%lx:", "ip"); + return __probe_event_show_format(s, tp, "(%lx)", "REC->ip"); } static int kretprobe_event_show_format(struct ftrace_event_call *call, @@ -1106,8 +1112,8 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, SHOW_FIELD(unsigned long, args[i], tp->args[i].name); trace_seq_puts(s, "\n"); - return __probe_event_show_format(s, tp, "%lx <- %lx:", - "func, ret_ip"); + return __probe_event_show_format(s, tp, "(%lx <- %lx)", + "REC->func, REC->ret_ip"); } #ifdef CONFIG_EVENT_PROFILE -- cgit v1.2.3-70-g09d2 From f52487e9c0041842eeb77c6c48774414b1cede08 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2009 19:53:53 -0400 Subject: tracing/kprobes: Support custom subsystem for each kprobe event Support specifying a custom subsystem(group) for each kprobe event. This allows users to create new group to control several probes at once, or add events to existing groups as additional tracepoints. New synopsis: p[:[subsys/]event-name] KADDR|KSYM[+offs] [ARGS] Signed-off-by: Masami Hiramatsu Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <20090910235353.22412.15149.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 5 +++-- kernel/trace/trace_kprobe.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index a849889e609..6521681e783 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -25,9 +25,10 @@ probe events via /sys/kernel/debug/tracing/events/kprobes//filter. Synopsis of kprobe_events ------------------------- - p[:EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[:EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe + p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe + r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe + GRP : Group name. If omitted, use "kprobes" for it. EVENT : Event name. If omitted, the event name is generated based on SYMBOL+offs or MEMADDR. SYMBOL[+offs] : Symbol+offset where the probe is inserted. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1746afeaabf..cbc0870dcf5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -36,6 +36,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_EVENT_NAME_LEN 64 +#define KPROBE_EVENT_SYSTEM "kprobes" /* currently, trace_kprobe only supports X86. */ @@ -265,7 +266,8 @@ static LIST_HEAD(probe_list); /* * Allocate new trace_probe and initialize it (including kprobes). */ -static struct trace_probe *alloc_trace_probe(const char *event, +static struct trace_probe *alloc_trace_probe(const char *group, + const char *event, void *addr, const char *symbol, unsigned long offs, @@ -298,9 +300,16 @@ static struct trace_probe *alloc_trace_probe(const char *event, if (!tp->call.name) goto error; + if (!group) + goto error; + tp->call.system = kstrdup(group, GFP_KERNEL); + if (!tp->call.system) + goto error; + INIT_LIST_HEAD(&tp->list); return tp; error: + kfree(tp->call.name); kfree(tp->symbol); kfree(tp); return ERR_PTR(-ENOMEM); @@ -322,6 +331,7 @@ static void free_trace_probe(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) free_probe_arg(&tp->args[i]); + kfree(tp->call.system); kfree(tp->call.name); kfree(tp->symbol); kfree(tp); @@ -530,8 +540,8 @@ static int create_trace_probe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:EVENT] SYMBOL[+OFFS]|ADDRESS [FETCHARGS] - * - Add kretprobe: r[:EVENT] SYMBOL[+0] [FETCHARGS] + * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] * Fetch args: * aN : fetch Nth of function argument. (N:0-) * rv : fetch return value @@ -549,7 +559,7 @@ static int create_trace_probe(int argc, char **argv) struct trace_probe *tp; int i, ret = 0; int is_return = 0; - char *symbol = NULL, *event = NULL, *arg = NULL; + char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -566,6 +576,15 @@ static int create_trace_probe(int argc, char **argv) if (argv[0][1] == ':') { event = &argv[0][2]; + if (strchr(event, '/')) { + group = event; + event = strchr(group, '/') + 1; + event[-1] = '\0'; + if (strlen(group) == 0) { + pr_info("Group name is not specifiled\n"); + return -EINVAL; + } + } if (strlen(event) == 0) { pr_info("Event name is not specifiled\n"); return -EINVAL; @@ -592,6 +611,8 @@ static int create_trace_probe(int argc, char **argv) argc -= 2; argv += 2; /* setup a probe */ + if (!group) + group = KPROBE_EVENT_SYSTEM; if (!event) { /* Make a new event name */ if (symbol) @@ -602,7 +623,8 @@ static int create_trace_probe(int argc, char **argv) is_return ? 'r' : 'p', addr); event = buf; } - tp = alloc_trace_probe(event, addr, symbol, offset, argc, is_return); + tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + is_return); if (IS_ERR(tp)) return PTR_ERR(tp); @@ -1217,7 +1239,6 @@ static int register_probe_event(struct trace_probe *tp) int ret; /* Initialize ftrace_event_call */ - call->system = "kprobes"; if (probe_is_return(tp)) { tp->event.trace = print_kretprobe_event; call->raw_init = probe_event_raw_init; -- cgit v1.2.3-70-g09d2 From 5a0d9050db4d1147722b42afef9011251b2651ee Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 14 Sep 2009 16:49:37 -0400 Subject: tracing/kprobes: Disable kprobe events by default after creation Disable newly created kprobe events by default, not to disturb another user using ftrace. "Disturb" means when someone is using ftrace and another user tries to use perf-tools, (in near future) if he defines new kprobe event via perf-tools, then new events will mess up the frace buffer. Fix this to allow proper and transparent kprobes events concurrent usage between ftrace users and perf users. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Tom Zanussi LKML-Reference: <20090914204937.18779.59422.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 11 +++++++++-- kernel/trace/trace_kprobe.c | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 6521681e783..9b8f7c6040a 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -122,8 +122,15 @@ print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, R echo > /sys/kernel/debug/tracing/kprobe_events - This clears all probe points. and you can see the traced information via -/sys/kernel/debug/tracing/trace. + This clears all probe points. + + Right after definition, each event is disabled by default. For tracing these +events, you need to enable it. + + echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable + echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable + + And you can see the traced information via /sys/kernel/debug/tracing/trace. cat /sys/kernel/debug/tracing/trace # tracer: nop diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d8db9357489..f6821f16227 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -383,7 +383,7 @@ static int register_trace_probe(struct trace_probe *tp) goto end; } - tp->flags = TP_FLAG_TRACE; + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; if (probe_is_return(tp)) ret = register_kretprobe(&tp->rp); else @@ -1298,7 +1298,7 @@ static int register_probe_event(struct trace_probe *tp) call->id = register_ftrace_event(&tp->event); if (!call->id) return -ENODEV; - call->enabled = 1; + call->enabled = 0; call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; -- cgit v1.2.3-70-g09d2 From 405b2651e4bedf8d3932b64cad649b4d26b067f5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 7 Oct 2009 18:27:40 -0400 Subject: tracing/kprobes: Add $ prefix to special variables Add $ prefix to the special variables(e.g. sa, rv) of kprobe-tracer. This resolves consistency issues between kprobe_events and perf-kprobe. The main goal is to avoid conflicts between local variable names of probed functions, used by perf probe, and special variables used in the kprobe event creation interface (stack values, etc...) and also available from perf probe. ie: we don't want rv (return value) to conflict with a local variable named rv in a probed function. Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Frank Ch. Eigler LKML-Reference: <20091007222740.1684.91170.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 20 ++++++------- kernel/trace/trace_kprobe.c | 60 +++++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 33 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 9b8f7c6040a..33f531858c5 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -36,13 +36,13 @@ Synopsis of kprobe_events FETCHARGS : Arguments. Each probe can have up to 128 args. %REG : Fetch register REG - sN : Fetch Nth entry of stack (N >= 0) - sa : Fetch stack address. @ADDR : Fetch memory at ADDR (ADDR should be in kernel) @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) - aN : Fetch function argument. (N >= 0)(*) - rv : Fetch return value.(**) - ra : Fetch return address.(**) + $sN : Fetch Nth entry of stack (N >= 0) + $sa : Fetch stack address. + $aN : Fetch function argument. (N >= 0)(*) + $rv : Fetch return value.(**) + $ra : Fetch return address.(**) +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***) NAME=FETCHARG: Set NAME as the argument name of FETCHARG. @@ -85,13 +85,13 @@ Usage examples To add a probe as a new event, write a new definition to kprobe_events as below. - echo p:myprobe do_sys_open dfd=a0 filename=a1 flags=a2 mode=a3 > /sys/kernel/debug/tracing/kprobe_events + echo p:myprobe do_sys_open dfd=$a0 filename=$a1 flags=$a2 mode=$a3 > /sys/kernel/debug/tracing/kprobe_events This sets a kprobe on the top of do_sys_open() function with recording 1st to 4th arguments as "myprobe" event. As this example shows, users can choose more familiar names for each arguments. - echo r:myretprobe do_sys_open rv ra >> /sys/kernel/debug/tracing/kprobe_events + echo r:myretprobe do_sys_open $rv $ra >> /sys/kernel/debug/tracing/kprobe_events This sets a kretprobe on the return point of do_sys_open() function with recording return value and return address as "myretprobe" event. @@ -138,11 +138,11 @@ events, you need to enable it. # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-1447 [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0 - <...>-1447 [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) rv=fffffffffffffffe ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $rv=fffffffffffffffe $ra=ffffffff81367a3a <...>-1447 [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6 - <...>-1447 [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) rv=3 ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 $ra=ffffffff81367a3a <...>-1447 [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10 - <...>-1447 [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) rv=3 ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 $ra=ffffffff81367a3a Each line shows when the kernel hits an event, and <- SYMBOL means kernel diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 97309d4714f..f63ead0cc5c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -220,24 +220,24 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) int ret = -EINVAL; if (ff->func == fetch_argument) - ret = snprintf(buf, n, "a%lu", (unsigned long)ff->data); + ret = snprintf(buf, n, "$a%lu", (unsigned long)ff->data); else if (ff->func == fetch_register) { const char *name; name = regs_query_register_name((unsigned int)((long)ff->data)); ret = snprintf(buf, n, "%%%s", name); } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "s%lu", (unsigned long)ff->data); + ret = snprintf(buf, n, "$s%lu", (unsigned long)ff->data); else if (ff->func == fetch_memory) ret = snprintf(buf, n, "@0x%p", ff->data); else if (ff->func == fetch_symbol) { struct symbol_cache *sc = ff->data; ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset); } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "rv"); + ret = snprintf(buf, n, "$rv"); else if (ff->func == fetch_ip) - ret = snprintf(buf, n, "ra"); + ret = snprintf(buf, n, "$ra"); else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "sa"); + ret = snprintf(buf, n, "$sa"); else if (ff->func == fetch_indirect) { struct indirect_fetch_data *id = ff->data; size_t l = 0; @@ -429,12 +429,10 @@ static int split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_ARGS 16 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) { int ret = 0; unsigned long param; - long offset; - char *tmp; switch (arg[0]) { case 'a': /* argument */ @@ -456,14 +454,6 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) } else ret = -EINVAL; break; - case '%': /* named register */ - ret = regs_query_register_offset(arg + 1); - if (ret >= 0) { - ff->func = fetch_register; - ff->data = (void *)(unsigned long)ret; - ret = 0; - } - break; case 's': /* stack */ if (arg[1] == 'a') { ff->func = fetch_stack_address; @@ -478,6 +468,31 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) } } break; + default: + ret = -EINVAL; + } + return ret; +} + +static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, ff, is_return); + break; + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + ff->func = fetch_register; + ff->data = (void *)(unsigned long)ret; + ret = 0; + } + break; case '@': /* memory or symbol */ if (isdigit(arg[1])) { ret = strict_strtoul(arg + 1, 0, ¶m); @@ -489,8 +504,7 @@ static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) ret = split_symbol_offset(arg + 1, &offset); if (ret) break; - ff->data = alloc_symbol_cache(arg + 1, - offset); + ff->data = alloc_symbol_cache(arg + 1, offset); if (ff->data) ff->func = fetch_symbol; else @@ -544,11 +558,11 @@ static int create_trace_probe(int argc, char **argv) * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] * Fetch args: - * aN : fetch Nth of function argument. (N:0-) - * rv : fetch return value - * ra : fetch return address - * sa : fetch stack address - * sN : fetch Nth of stack (N:0-) + * $aN : fetch Nth of function argument. (N:0-) + * $rv : fetch return value + * $ra : fetch return address + * $sa : fetch stack address + * $sN : fetch Nth of stack (N:0-) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG -- cgit v1.2.3-70-g09d2 From 99329c44f28a1b7ac83beebfb4319e612042e319 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 7 Oct 2009 18:27:48 -0400 Subject: tracing/kprobes: Remove '$ra' special variable Remove '$ra' (return address) because it is already shown at the head of each entry. Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Frank Ch. Eigler LKML-Reference: <20091007222748.1684.12711.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 13 ++++++------- kernel/trace/trace_kprobe.c | 11 ----------- 2 files changed, 6 insertions(+), 18 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 33f531858c5..4208253b5a5 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -1,4 +1,4 @@ - Kprobe-based Event Tracer + Kprobe-based Event Tracer ========================= Documentation is written by Masami Hiramatsu @@ -42,7 +42,6 @@ Synopsis of kprobe_events $sa : Fetch stack address. $aN : Fetch function argument. (N >= 0)(*) $rv : Fetch return value.(**) - $ra : Fetch return address.(**) +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***) NAME=FETCHARG: Set NAME as the argument name of FETCHARG. @@ -91,10 +90,10 @@ as below. 1st to 4th arguments as "myprobe" event. As this example shows, users can choose more familiar names for each arguments. - echo r:myretprobe do_sys_open $rv $ra >> /sys/kernel/debug/tracing/kprobe_events + echo r:myretprobe do_sys_open $rv >> /sys/kernel/debug/tracing/kprobe_events This sets a kretprobe on the return point of do_sys_open() function with -recording return value and return address as "myretprobe" event. +recording return value as "myretprobe" event. You can see the format of these events via /sys/kernel/debug/tracing/events/kprobes//format. @@ -138,11 +137,11 @@ events, you need to enable it. # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-1447 [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0 - <...>-1447 [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $rv=fffffffffffffffe $ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $rv=fffffffffffffffe <...>-1447 [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6 - <...>-1447 [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 $ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 <...>-1447 [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10 - <...>-1447 [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 $ra=ffffffff81367a3a + <...>-1447 [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 Each line shows when the kernel hits an event, and <- SYMBOL means kernel diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f63ead0cc5c..ba6d3bd4888 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -85,11 +85,6 @@ static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, return regs_return_value(regs); } -static __kprobes unsigned long fetch_ip(struct pt_regs *regs, void *dummy) -{ - return instruction_pointer(regs); -} - static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, void *dummy) { @@ -234,8 +229,6 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset); } else if (ff->func == fetch_retvalue) ret = snprintf(buf, n, "$rv"); - else if (ff->func == fetch_ip) - ret = snprintf(buf, n, "$ra"); else if (ff->func == fetch_stack_address) ret = snprintf(buf, n, "$sa"); else if (ff->func == fetch_indirect) { @@ -448,9 +441,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) if (is_return && arg[1] == 'v') { ff->func = fetch_retvalue; ff->data = NULL; - } else if (is_return && arg[1] == 'a') { - ff->func = fetch_ip; - ff->data = NULL; } else ret = -EINVAL; break; @@ -560,7 +550,6 @@ static int create_trace_probe(int argc, char **argv) * Fetch args: * $aN : fetch Nth of function argument. (N:0-) * $rv : fetch return value - * $ra : fetch return address * $sa : fetch stack address * $sN : fetch Nth of stack (N:0-) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) -- cgit v1.2.3-70-g09d2 From 369bc18f9a6c4e2686204c1d7476ab684a720968 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Mon, 12 Oct 2009 22:17:21 +0200 Subject: ftrace: add kernel command line graph function filtering Add a command line parameter to allow limiting the function graphs that are traced on boot up from the given top-level callers , when ftrace=function_graph is specified. This patch adds the following command line option: ftrace_graph_filter=function-list Where function-list is a comma separated list of functions to filter. [fweisbec@gmail.com: picked the documentation changes from the v2 patch] Signed-off-by: Stefan Assmann Acked-by: Steven Rostedt LKML-Reference: <4AD2DEB9.2@redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/kernel-parameters.txt | 7 +++++++ kernel/trace/ftrace.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6fa7292947e..1dc4b9cc20e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -778,6 +778,13 @@ and is between 256 and 4096 characters. It is defined in the file by the set_ftrace_notrace file in the debugfs tracing directory. + ftrace_graph_filter=[function-list] + [FTRACE] Limit the top level callers functions traced + by the function graph tracer at boot up. + function-list is a comma separated list of functions + that can be changed at run time by the + set_graph_function file in the debugfs tracing directory. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9a72853a8f0..91283d40821 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -78,6 +78,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +#endif + static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -2248,6 +2252,7 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { @@ -2263,6 +2268,31 @@ static int __init set_ftrace_filter(char *str) } __setup("ftrace_filter=", set_ftrace_filter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int __init set_graph_function(char *str) +{ + strncpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static void __init set_ftrace_early_filter(char *buf, int enable) { char *func; @@ -2279,6 +2309,10 @@ static void __init set_ftrace_early_filters(void) set_ftrace_early_filter(ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) set_ftrace_early_filter(ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } static int -- cgit v1.2.3-70-g09d2 From 2e06ff6389aedafc4a3a374344ac70672252f9b5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 7 Oct 2009 18:27:59 -0400 Subject: tracing/kprobes: Make special variable names more self-explainable Rename special variables to more self-explainable names as below: - $rv to $retval - $sa to $stack - $aN to $argN - $sN to $stackN Signed-off-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Frank Ch. Eigler LKML-Reference: <20091007222759.1684.3319.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/trace/kprobetrace.txt | 22 ++++++++-------- kernel/trace/trace_kprobe.c | 52 +++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 39 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 4208253b5a5..15415243a9a 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -35,13 +35,13 @@ Synopsis of kprobe_events MEMADDR : Address where the probe is inserted. FETCHARGS : Arguments. Each probe can have up to 128 args. - %REG : Fetch register REG - @ADDR : Fetch memory at ADDR (ADDR should be in kernel) + %REG : Fetch register REG + @ADDR : Fetch memory at ADDR (ADDR should be in kernel) @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) - $sN : Fetch Nth entry of stack (N >= 0) - $sa : Fetch stack address. - $aN : Fetch function argument. (N >= 0)(*) - $rv : Fetch return value.(**) + $stackN : Fetch Nth entry of stack (N >= 0) + $stack : Fetch stack address. + $argN : Fetch function argument. (N >= 0)(*) + $retval : Fetch return value.(**) +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***) NAME=FETCHARG: Set NAME as the argument name of FETCHARG. @@ -84,13 +84,13 @@ Usage examples To add a probe as a new event, write a new definition to kprobe_events as below. - echo p:myprobe do_sys_open dfd=$a0 filename=$a1 flags=$a2 mode=$a3 > /sys/kernel/debug/tracing/kprobe_events + echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events This sets a kprobe on the top of do_sys_open() function with recording 1st to 4th arguments as "myprobe" event. As this example shows, users can choose more familiar names for each arguments. - echo r:myretprobe do_sys_open $rv >> /sys/kernel/debug/tracing/kprobe_events + echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events This sets a kretprobe on the return point of do_sys_open() function with recording return value as "myretprobe" event. @@ -137,11 +137,11 @@ events, you need to enable it. # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-1447 [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0 - <...>-1447 [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $rv=fffffffffffffffe + <...>-1447 [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $retval=fffffffffffffffe <...>-1447 [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6 - <...>-1447 [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 + <...>-1447 [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3 <...>-1447 [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10 - <...>-1447 [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $rv=3 + <...>-1447 [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3 Each line shows when the kernel hits an event, and <- SYMBOL means kernel diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ba6d3bd4888..3313fa74ce5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -215,22 +215,22 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) int ret = -EINVAL; if (ff->func == fetch_argument) - ret = snprintf(buf, n, "$a%lu", (unsigned long)ff->data); + ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); else if (ff->func == fetch_register) { const char *name; name = regs_query_register_name((unsigned int)((long)ff->data)); ret = snprintf(buf, n, "%%%s", name); } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "$s%lu", (unsigned long)ff->data); + ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); else if (ff->func == fetch_memory) ret = snprintf(buf, n, "@0x%p", ff->data); else if (ff->func == fetch_symbol) { struct symbol_cache *sc = ff->data; ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset); } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "$rv"); + ret = snprintf(buf, n, "$retval"); else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "$sa"); + ret = snprintf(buf, n, "$stack"); else if (ff->func == fetch_indirect) { struct indirect_fetch_data *id = ff->data; size_t l = 0; @@ -427,40 +427,36 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) int ret = 0; unsigned long param; - switch (arg[0]) { - case 'a': /* argument */ - ret = strict_strtoul(arg + 1, 10, ¶m); - if (ret || param > PARAM_MAX_ARGS) - ret = -EINVAL; - else { - ff->func = fetch_argument; - ff->data = (void *)param; - } - break; - case 'r': /* retval or retaddr */ - if (is_return && arg[1] == 'v') { + if (strcmp(arg, "retval") == 0) { + if (is_return) { ff->func = fetch_retvalue; ff->data = NULL; } else ret = -EINVAL; - break; - case 's': /* stack */ - if (arg[1] == 'a') { + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { ff->func = fetch_stack_address; ff->data = NULL; - } else { - ret = strict_strtoul(arg + 1, 10, ¶m); + } else if (isdigit(arg[5])) { + ret = strict_strtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { ff->func = fetch_stack; ff->data = (void *)param; } + } else + ret = -EINVAL; + } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { + ret = strict_strtoul(arg + 3, 10, ¶m); + if (ret || param > PARAM_MAX_ARGS) + ret = -EINVAL; + else { + ff->func = fetch_argument; + ff->data = (void *)param; } - break; - default: + } else ret = -EINVAL; - } return ret; } @@ -548,10 +544,10 @@ static int create_trace_probe(int argc, char **argv) * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] * Fetch args: - * $aN : fetch Nth of function argument. (N:0-) - * $rv : fetch return value - * $sa : fetch stack address - * $sN : fetch Nth of stack (N:0-) + * $argN : fetch Nth of function argument. (N:0-) + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG -- cgit v1.2.3-70-g09d2 From 459c6d15a0c52bae43842ff2cd0dd41aa7de9b7f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 19 Sep 2009 07:14:15 +0200 Subject: tracing: Document HAVE_SYSCALL_TRACEPOINTS needs Document the arch needed requirements to get the support for syscalls tracing. v2: HAVE_FTRACE_SYSCALLS have been changed to HAVE_SYSCALL_TRACEPOINTS recently. Update this config name in the documentation then. Signed-off-by: Frederic Weisbecker Acked-by: Heiko Carstens Cc: Ingo Molnar Cc: Steven Rostedt Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Paul Mundt --- Documentation/trace/ftrace-design.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 7003e10f10f..641a1ef2a7f 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -213,10 +213,19 @@ If you can't trace NMI functions, then skip this option.
-HAVE_FTRACE_SYSCALLS +HAVE_SYSCALL_TRACEPOINTS --------------------- -
+You need very few things to get the syscalls tracing in an arch. + +- Have a NR_syscalls variable in that provides the number + of syscalls supported by the arch. +- Implement arch_syscall_addr() that resolves a syscall address from a + syscall number. +- Support the TIF_SYSCALL_TRACEPOINT thread flags +- Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace + in the ptrace syscalls tracing path. +- Tag this arch as HAVE_SYSCALL_TRACEPOINTS. HAVE_FTRACE_MCOUNT_RECORD -- cgit v1.2.3-70-g09d2 From 9636bc0555e3f383c120ddcffe4b7c5c58a10b1a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 14 Oct 2009 19:09:04 +0400 Subject: x86, apic: Explain show_lapic= in kernel parameters list Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: macro@linux-mips.org LKML-Reference: <20091014150904.GA5259@lenovo> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9107b387e91..465a786a378 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -345,6 +345,15 @@ and is between 256 and 4096 characters. It is defined in the file Change the amount of debugging information output when initialising the APIC and IO-APIC components. + show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller + Limit apic dumping. The parameter defines the maximal + number of local apics being dumped. Also it is possible + to set it to "all" by meaning -- no limit here. + Format: { 1 (default) | 2 | ... | all }. + The parameter valid if only apic=debug or + apic=verbose is specified. + Example: apic=debug show_lapic=all + apm= [APM] Advanced Power Management See header of arch/x86/kernel/apm_32.c. -- cgit v1.2.3-70-g09d2 From bd58b430039435e4c981cf802b5b11d511d73abd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Oct 2009 10:15:54 -0700 Subject: rcu: Update trace.txt documentation to reflect recent changes o Remove the CONFIG_PREEMPT_RCU documentation since this config option has now been removed. o Change the now-incorrect references to "rcu" labels to instead be "rcu_sched". o Add notes stating that CONFIG_TREE_PREEMPT_RCU kernels will have additional "rcu_preempt" output. o Note the new "oqlen" field in the rcuhier output (for RCU callbacks orphaned by an offlined CPU). Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: npiggin@suse.de Cc: jens.axboe@oracle.com LKML-Reference: <1255540559799-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/trace.txt | 231 +++++++------------------------------------- 1 file changed, 33 insertions(+), 198 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 187bbf10c92..c1a95507d10 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -1,185 +1,10 @@ CONFIG_RCU_TRACE debugfs Files and Formats -The rcupreempt and rcutree implementations of RCU provide debugfs trace -output that summarizes counters and state. This information is useful for -debugging RCU itself, and can sometimes also help to debug abuses of RCU. -Note that the rcuclassic implementation of RCU does not provide debugfs -trace output. - -The following sections describe the debugfs files and formats for -preemptable RCU (rcupreempt) and hierarchical RCU (rcutree). - - -Preemptable RCU debugfs Files and Formats - -This implementation of RCU provides three debugfs files under the -top-level directory RCU: rcu/rcuctrs (which displays the per-CPU -counters used by preemptable RCU) rcu/rcugp (which displays grace-period -counters), and rcu/rcustats (which internal counters for debugging RCU). - -The output of "cat rcu/rcuctrs" looks as follows: - -CPU last cur F M - 0 5 -5 0 0 - 1 -1 0 0 0 - 2 0 1 0 0 - 3 0 1 0 0 - 4 0 1 0 0 - 5 0 1 0 0 - 6 0 2 0 0 - 7 0 -1 0 0 - 8 0 1 0 0 -ggp = 26226, state = waitzero - -The per-CPU fields are as follows: - -o "CPU" gives the CPU number. Offline CPUs are not displayed. - -o "last" gives the value of the counter that is being decremented - for the current grace period phase. In the example above, - the counters sum to 4, indicating that there are still four - RCU read-side critical sections still running that started - before the last counter flip. - -o "cur" gives the value of the counter that is currently being - both incremented (by rcu_read_lock()) and decremented (by - rcu_read_unlock()). In the example above, the counters sum to - 1, indicating that there is only one RCU read-side critical section - still running that started after the last counter flip. - -o "F" indicates whether RCU is waiting for this CPU to acknowledge - a counter flip. In the above example, RCU is not waiting on any, - which is consistent with the state being "waitzero" rather than - "waitack". - -o "M" indicates whether RCU is waiting for this CPU to execute a - memory barrier. In the above example, RCU is not waiting on any, - which is consistent with the state being "waitzero" rather than - "waitmb". - -o "ggp" is the global grace-period counter. - -o "state" is the RCU state, which can be one of the following: - - o "idle": there is no grace period in progress. - - o "waitack": RCU just incremented the global grace-period - counter, which has the effect of reversing the roles of - the "last" and "cur" counters above, and is waiting for - all the CPUs to acknowledge the flip. Once the flip has - been acknowledged, CPUs will no longer be incrementing - what are now the "last" counters, so that their sum will - decrease monotonically down to zero. - - o "waitzero": RCU is waiting for the sum of the "last" counters - to decrease to zero. - - o "waitmb": RCU is waiting for each CPU to execute a memory - barrier, which ensures that instructions from a given CPU's - last RCU read-side critical section cannot be reordered - with instructions following the memory-barrier instruction. - -The output of "cat rcu/rcugp" looks as follows: - -oldggp=48870 newggp=48873 - -Note that reading from this file provokes a synchronize_rcu(). The -"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before -executing the synchronize_rcu(), and the "newggp" value is also the -"ggp" value, but taken after the synchronize_rcu() command returns. - - -The output of "cat rcu/rcugp" looks as follows: - -na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871 -1=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640 -z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639 - -These are counters tracking internal preemptable-RCU events, however, -some of them may be useful for debugging algorithms using RCU. In -particular, the "nl", "wl", and "dl" values track the number of RCU -callbacks in various states. The fields are as follows: - -o "na" is the total number of RCU callbacks that have been enqueued - since boot. - -o "nl" is the number of RCU callbacks waiting for the previous - grace period to end so that they can start waiting on the next - grace period. - -o "wa" is the total number of RCU callbacks that have started waiting - for a grace period since boot. "na" should be roughly equal to - "nl" plus "wa". - -o "wl" is the number of RCU callbacks currently waiting for their - grace period to end. - -o "da" is the total number of RCU callbacks whose grace periods - have completed since boot. "wa" should be roughly equal to - "wl" plus "da". - -o "dr" is the total number of RCU callbacks that have been removed - from the list of callbacks ready to invoke. "dr" should be roughly - equal to "da". - -o "di" is the total number of RCU callbacks that have been invoked - since boot. "di" should be roughly equal to "da", though some - early versions of preemptable RCU had a bug so that only the - last CPU's count of invocations was displayed, rather than the - sum of all CPU's counts. - -o "1" is the number of calls to rcu_try_flip(). This should be - roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1" - described below. In other words, the number of times that - the state machine is visited should be equal to the sum of the - number of times that each state is visited plus the number of - times that the state-machine lock acquisition failed. - -o "e1" is the number of times that rcu_try_flip() was unable to - acquire the fliplock. - -o "i1" is the number of calls to rcu_try_flip_idle(). - -o "ie1" is the number of times rcu_try_flip_idle() exited early - due to the calling CPU having no work for RCU. - -o "g1" is the number of times that rcu_try_flip_idle() decided - to start a new grace period. "i1" should be roughly equal to - "ie1" plus "g1". - -o "a1" is the number of calls to rcu_try_flip_waitack(). - -o "ae1" is the number of times that rcu_try_flip_waitack() found - that at least one CPU had not yet acknowledge the new grace period - (AKA "counter flip"). - -o "a2" is the number of time rcu_try_flip_waitack() found that - all CPUs had acknowledged. "a1" should be roughly equal to - "ae1" plus "a2". (This particular output was collected on - a 128-CPU machine, hence the smaller-than-usual fraction of - calls to rcu_try_flip_waitack() finding all CPUs having already - acknowledged.) - -o "z1" is the number of calls to rcu_try_flip_waitzero(). - -o "ze1" is the number of times that rcu_try_flip_waitzero() found - that not all of the old RCU read-side critical sections had - completed. - -o "z2" is the number of times that rcu_try_flip_waitzero() finds - the sum of the counters equal to zero, in other words, that - all of the old RCU read-side critical sections had completed. - The value of "z1" should be roughly equal to "ze1" plus - "z2". - -o "m1" is the number of calls to rcu_try_flip_waitmb(). - -o "me1" is the number of times that rcu_try_flip_waitmb() finds - that at least one CPU has not yet executed a memory barrier. - -o "m2" is the number of times that rcu_try_flip_waitmb() finds that - all CPUs have executed a memory barrier. +The rcutree implementation of RCU provides debugfs trace output that +summarizes counters and state. This information is useful for debugging +RCU itself, and can sometimes also help to debug abuses of RCU. +The following sections describe the debugfs files and formats. Hierarchical RCU debugfs Files and Formats @@ -210,9 +35,10 @@ rcu_bh: 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 -The first section lists the rcu_data structures for rcu, the second for -rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. -The fields are as follows: +The first section lists the rcu_data structures for rcu_sched, the second +for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an +additional section for rcu_preempt. Each section has one line per CPU, +or eight for this 8-CPU system. The fields are as follows: o The number at the beginning of each line is the CPU number. CPUs numbers followed by an exclamation mark are offline, @@ -223,9 +49,9 @@ o The number at the beginning of each line is the CPU number. o "c" is the count of grace periods that this CPU believes have completed. CPUs in dynticks idle mode may lag quite a ways - behind, for example, CPU 4 under "rcu" above, which has slept - through the past 25 RCU grace periods. It is not unusual to - see CPUs lagging by thousands of grace periods. + behind, for example, CPU 4 under "rcu_sched" above, which has + slept through the past 25 RCU grace periods. It is not unusual + to see CPUs lagging by thousands of grace periods. o "g" is the count of grace periods that this CPU believes have started. Again, CPUs in dynticks idle mode may lag behind. @@ -308,8 +134,10 @@ The output of "cat rcu/rcugp" looks as follows: rcu_sched: completed=33062 gpnum=33063 rcu_bh: completed=464 gpnum=464 -Again, this output is for both "rcu" and "rcu_bh". The fields are -taken from the rcu_state structure, and are as follows: +Again, this output is for both "rcu_sched" and "rcu_bh". Note that +kernels built with CONFIG_TREE_PREEMPT_RCU will have an additional +"rcu_preempt" line. The fields are taken from the rcu_state structure, +and are as follows: o "completed" is the number of grace periods that have completed. It is comparable to the "c" field from rcu/rcudata in that a @@ -324,23 +152,24 @@ o "gpnum" is the number of grace periods that have started. It is If these two fields are equal (as they are for "rcu_bh" above), then there is no grace period in progress, in other words, RCU is idle. On the other hand, if the two fields differ (as they - do for "rcu" above), then an RCU grace period is in progress. + do for "rcu_sched" above), then an RCU grace period is in progress. The output of "cat rcu/rcuhier" looks as follows, with very long lines: -c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0 1/1 0:127 ^0 3/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 3/3f 0:5 ^0 2/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 rcu_bh: -c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0 0/1 0:127 ^0 0/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 0/3f 0:5 ^0 0/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 -This is once again split into "rcu" and "rcu_bh" portions. The fields are -as follows: +This is once again split into "rcu_sched" and "rcu_bh" portions, +and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional +"rcu_preempt" section. The fields are as follows: o "c" is exactly the same as "completed" under rcu/rcugp. @@ -372,6 +201,11 @@ o "fqlh" is the number of calls to force_quiescent_state() that exited immediately (without even being counted in nfqs above) due to contention on ->fqslock. +o "oqlen" is the number of callbacks on the "orphan" callback + list. RCU callbacks are placed on this list by CPUs going + offline, and are "adopted" either by the CPU helping the outgoing + CPU or by the next rcu_barrier*() call, whichever comes first. + o Each element of the form "1/1 0:127 ^0" represents one struct rcu_node. Each line represents one level of the hierarchy, from root to leaves. It is best to think of the rcu_data structures @@ -389,10 +223,10 @@ o Each element of the form "1/1 0:127 ^0" represents one struct The value of qsmaskinit is assigned to that of qsmask at the beginning of each grace period. - For example, for "rcu", the qsmask of the first entry - of the lowest level is 0x14, meaning that we are still - waiting for CPUs 2 and 4 to check in for the current - grace period. + For example, for "rcu_sched", the qsmask of the first + entry of the lowest level is 0x14, meaning that we + are still waiting for CPUs 2 and 4 to check in for the + current grace period. o The numbers separated by the ":" are the range of CPUs served by this struct rcu_node. This can be helpful @@ -431,8 +265,9 @@ rcu_bh: 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 -As always, this is once again split into "rcu" and "rcu_bh" portions. -The fields are as follows: +As always, this is once again split into "rcu_sched" and "rcu_bh" +portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional +"rcu_preempt" section. The fields are as follows: o "np" is the number of times that __rcu_pending() has been invoked for the corresponding flavor of RCU. -- cgit v1.2.3-70-g09d2 From 0edf1a683e499191b27a067956ae9f5fa6e046c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Oct 2009 10:15:59 -0700 Subject: rcu: Update trace.txt documentation for blocked-tasks lists Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: npiggin@suse.de Cc: jens.axboe@oracle.com LKML-Reference: <12555405592804-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/trace.txt | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index c1a95507d10..8608fd85e92 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -158,14 +158,14 @@ o "gpnum" is the number of grace periods that have started. It is The output of "cat rcu/rcuhier" looks as follows, with very long lines: c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0 -1/1 0:127 ^0 -3/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 -3/3f 0:5 ^0 2/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 +1/1 .>. 0:127 ^0 +3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 +3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 rcu_bh: c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0 -0/1 0:127 ^0 -0/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 -0/3f 0:5 ^0 0/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 +0/1 .>. 0:127 ^0 +0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 +0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 This is once again split into "rcu_sched" and "rcu_bh" portions, and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional @@ -213,7 +213,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct might be either one, two, or three levels of rcu_node structures, depending on the relationship between CONFIG_RCU_FANOUT and CONFIG_NR_CPUS. - + o The numbers separated by the "/" are the qsmask followed by the qsmaskinit. The qsmask will have one bit set for each entity in the next lower level that @@ -228,6 +228,15 @@ o Each element of the form "1/1 0:127 ^0" represents one struct are still waiting for CPUs 2 and 4 to check in for the current grace period. + o The characters separated by the ">" indicate the state + of the blocked-tasks lists. A "T" preceding the ">" + indicates that at least one task blocked in an RCU + read-side critical section blocks the current grace + period, while a "." preceding the ">" indicates otherwise. + The character following the ">" indicates similarly for + the next grace period. A "T" should appear in this + field only for rcu-preempt. + o The numbers separated by the ":" are the range of CPUs served by this struct rcu_node. This can be helpful in working out how the hierarchy is wired together. -- cgit v1.2.3-70-g09d2 From 3e1c2515acf70448cad1ae3ab835ca80be043d33 Mon Sep 17 00:00:00 2001 From: James Morris Date: Tue, 20 Oct 2009 13:48:33 +0900 Subject: security: remove root_plug Remove the root_plug example LSM code. It's unmaintained and increasingly broken in various ways. Made at the 2009 Kernel Summit in Tokyo! Acked-by: Greg Kroah-Hartman Signed-off-by: James Morris --- Documentation/kernel-parameters.txt | 10 ----- security/Kconfig | 13 ------ security/Makefile | 1 - security/commoncap.c | 2 +- security/root_plug.c | 90 ------------------------------------- 5 files changed, 1 insertion(+), 115 deletions(-) delete mode 100644 security/root_plug.c (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6fa7292947e..5d386b4ff6a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -85,7 +85,6 @@ parameter is applicable: PPT Parallel port support is enabled. PS2 Appropriate PS/2 support is enabled. RAM RAM disk support is enabled. - ROOTPLUG The example Root Plug LSM is enabled. S390 S390 architecture is enabled. SCSI Appropriate SCSI support is enabled. A lot of drivers has their options described inside of @@ -2163,15 +2162,6 @@ and is between 256 and 4096 characters. It is defined in the file Useful for devices that are detected asynchronously (e.g. USB and MMC devices). - root_plug.vendor_id= - [ROOTPLUG] Override the default vendor ID - - root_plug.product_id= - [ROOTPLUG] Override the default product ID - - root_plug.debug= - [ROOTPLUG] Enable debugging output - rw [KNL] Mount root device read-write on boot S [KNL] Run init in single mode diff --git a/security/Kconfig b/security/Kconfig index fb363cd81cf..aeea8c2bb59 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -100,19 +100,6 @@ config SECURITY_FILE_CAPABILITIES If in doubt, answer N. -config SECURITY_ROOTPLUG - bool "Root Plug Support" - depends on USB=y && SECURITY - help - This is a sample LSM module that should only be used as such. - It prevents any programs running with egid == 0 if a specific - USB device is not present in the system. - - See for - more information about this module. - - If you are unsure how to answer this question, answer N. - config INTEL_TXT bool "Enable Intel(R) Trusted Execution Technology (Intel(R) TXT)" depends on HAVE_INTEL_TXT diff --git a/security/Makefile b/security/Makefile index 95ecc06392d..bb44e350c61 100644 --- a/security/Makefile +++ b/security/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o obj-$(CONFIG_SECURITY_SMACK) += smack/built-in.o obj-$(CONFIG_AUDIT) += lsm_audit.o obj-$(CONFIG_SECURITY_TOMOYO) += tomoyo/built-in.o -obj-$(CONFIG_SECURITY_ROOTPLUG) += root_plug.o obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o # Object integrity file lists diff --git a/security/commoncap.c b/security/commoncap.c index fe30751a6cd..45b87af4ae5 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1,4 +1,4 @@ -/* Common capabilities, needed by capability.o and root_plug.o +/* Common capabilities, needed by capability.o. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/security/root_plug.c b/security/root_plug.c deleted file mode 100644 index 2f7ffa67c4d..00000000000 --- a/security/root_plug.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Root Plug sample LSM module - * - * Originally written for a Linux Journal. - * - * Copyright (C) 2002 Greg Kroah-Hartman - * - * Prevents any programs running with egid == 0 if a specific USB device - * is not present in the system. Yes, it can be gotten around, but is a - * nice starting point for people to play with, and learn the LSM - * interface. - * - * If you want to turn this into something with a semblance of security, - * you need to hook the task_* functions also. - * - * See http://www.linuxjournal.com/article.php?sid=6279 for more information - * about this code. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ - -#include -#include -#include -#include -#include - -/* default is a generic type of usb to serial converter */ -static int vendor_id = 0x0557; -static int product_id = 0x2008; - -module_param(vendor_id, uint, 0400); -module_param(product_id, uint, 0400); - -/* should we print out debug messages */ -static int debug = 0; - -module_param(debug, bool, 0600); - -#define MY_NAME "root_plug" - -#define root_dbg(fmt, arg...) \ - do { \ - if (debug) \ - printk(KERN_DEBUG "%s: %s: " fmt , \ - MY_NAME , __func__ , \ - ## arg); \ - } while (0) - -static int rootplug_bprm_check_security (struct linux_binprm *bprm) -{ - struct usb_device *dev; - - root_dbg("file %s, e_uid = %d, e_gid = %d\n", - bprm->filename, bprm->cred->euid, bprm->cred->egid); - - if (bprm->cred->egid == 0) { - dev = usb_find_device(vendor_id, product_id); - if (!dev) { - root_dbg("e_gid = 0, and device not found, " - "task not allowed to run...\n"); - return -EPERM; - } - usb_put_dev(dev); - } - - return 0; -} - -static struct security_operations rootplug_security_ops = { - .bprm_check_security = rootplug_bprm_check_security, -}; - -static int __init rootplug_init (void) -{ - /* register ourselves with the security framework */ - if (register_security (&rootplug_security_ops)) { - printk (KERN_INFO - "Failure registering Root Plug module with the kernel\n"); - return -EINVAL; - } - printk (KERN_INFO "Root Plug module initialized, " - "vendor_id = %4.4x, product id = %4.4x\n", vendor_id, product_id); - return 0; -} - -security_initcall (rootplug_init); -- cgit v1.2.3-70-g09d2 From 6e8e16c7bc298d7887584c3d027e05db3e86eed9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 22 Oct 2009 15:38:26 -0400 Subject: SELinux: add .gitignore files for dynamic classes The SELinux dynamic class work in c6d3aaa4e35c71a32a86ececacd4eea7ecfc316c creates a number of dynamic header files and scripts. Add .gitignore files so git doesn't complain about these. Signed-off-by: Eric Paris Acked-by: Stephen D. Smalley Signed-off-by: James Morris --- Documentation/dontdiff | 3 +++ scripts/selinux/genheaders/.gitignore | 1 + security/selinux/.gitignore | 2 ++ 3 files changed, 6 insertions(+) create mode 100644 scripts/selinux/genheaders/.gitignore create mode 100644 security/selinux/.gitignore (limited to 'Documentation') diff --git a/Documentation/dontdiff b/Documentation/dontdiff index e1efc400bed..e151b2a3626 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -65,6 +65,7 @@ aicdb.h* asm-offsets.h asm_offsets.h autoconf.h* +av_permissions.h bbootsect bin2c binkernel.spec @@ -95,12 +96,14 @@ docproc elf2ecoff elfconfig.h* fixdep +flask.h fore200e_mkfirm fore200e_pca_fw.c* gconf gen-devlist gen_crc32table gen_init_cpio +genheaders genksyms *_gray256.c ihex2fw diff --git a/scripts/selinux/genheaders/.gitignore b/scripts/selinux/genheaders/.gitignore new file mode 100644 index 00000000000..4c0b646ff8d --- /dev/null +++ b/scripts/selinux/genheaders/.gitignore @@ -0,0 +1 @@ +genheaders diff --git a/security/selinux/.gitignore b/security/selinux/.gitignore new file mode 100644 index 00000000000..2e5040a3d48 --- /dev/null +++ b/security/selinux/.gitignore @@ -0,0 +1,2 @@ +av_permissions.h +flask.h -- cgit v1.2.3-70-g09d2 From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001 From: Ryota Ozaki Date: Sat, 24 Oct 2009 01:20:10 +0900 Subject: sched, cpuacct: Fix niced guest time accounting CPU time of a guest is always accounted in 'user' time without concern for the nice value of its counterpart process although the guest is scheduled under the nice value. This patch fixes the defect and accounts cpu time of a niced guest in 'nice' time as same as a niced process. And also the patch adds 'guest_nice' to cpuacct. The value provides niced guest cpu time which is like 'nice' to 'user'. The original discussions can be found here: http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html Signed-off-by: Ryota Ozaki Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com> Signed-off-by: Ingo Molnar --- Documentation/filesystems/proc.txt | 3 ++- fs/proc/stat.c | 19 +++++++++++++------ include/linux/kernel_stat.h | 1 + kernel/sched.c | 9 +++++++-- 4 files changed, 23 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 2c48f945546..4af0018533f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1072,7 +1072,8 @@ second). The meanings of the columns are as follows, from left to right: - irq: servicing interrupts - softirq: servicing softirqs - steal: involuntary wait -- guest: running a guest +- guest: running a normal guest +- guest_nice: running a niced guest The "intr" line gives counts of interrupts serviced since boot time, for each of the possible system interrupts. The first column is the total of all diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 7cc726c6d70..b9b7aad2003 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v) int i, j; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest; + cputime64_t guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; @@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v) user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; - guest = cputime64_zero; + guest = guest_nice = cputime64_zero; getboottime(&boottime); jif = boottime.tv_sec; @@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + guest_nice = cputime64_add(guest_nice, + kstat_cpu(i).cpustat.guest_nice); for_each_irq_nr(j) { sum += kstat_irqs_cpu(j, i); } @@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest)); + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ @@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v) softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; guest = kstat_cpu(i).cpustat.guest; + guest_nice = kstat_cpu(i).cpustat.guest_nice; seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest)); + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); } seq_printf(p, "intr %llu", (unsigned long long)sum); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 348fa8874b5..c059044bc6d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -25,6 +25,7 @@ struct cpu_usage_stat { cputime64_t iowait; cputime64_t steal; cputime64_t guest; + cputime64_t guest_nice; }; struct kernel_stat { diff --git a/kernel/sched.c b/kernel/sched.c index e5205811c19..67be4d0ddda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5017,8 +5017,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } } /* -- cgit v1.2.3-70-g09d2 From 64179861cb801eac4f00c79f39a29ea5ac9470d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 25 Oct 2009 19:03:53 -0700 Subject: rcu: Add synchronize_srcu_expedited() to the documentation Signed-off-by: Paul E. McKenney Acked-by: Josh Triplett Reviewed-by: Lai Jiangshan Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: avi@redhat.com Cc: mtosatti@redhat.com LKML-Reference: <12565226354176-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/whatisRCU.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index e41a7fecf0d..d542ca243b8 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -830,7 +830,7 @@ sched: Critical sections Grace period Barrier SRCU: Critical sections Grace period Barrier srcu_read_lock synchronize_srcu N/A - srcu_read_unlock + srcu_read_unlock synchronize_srcu_expedited SRCU: Initialization/cleanup init_srcu_struct -- cgit v1.2.3-70-g09d2 From 45a5c8bad827ebb9c9798becc15bce2e804d49e0 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Wed, 4 Nov 2009 03:15:44 +0530 Subject: sched: Add USER_SCHED to feature removal list Peter Zijlstra suggested that we remove USER_SCHED at: http://lkml.org/lkml/2009/3/21/67 Removing USER_SCHED removes a lot of code from the scheduler and simplifies the code. We already have the ability to do user based classification which is tightened using PAM in userspace. Schedule USER_SCHED for removal in 2.6.34 Signed-off-by: Dhaval Giani Acked-by: Peter Zijlstra Cc: Balbir Singh Cc: Bharata B Rao Cc: Serge E. Hallyn Cc: Srivatsa Vaddagiri LKML-Reference: <20091103214544.GI5495@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- Documentation/feature-removal-schedule.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'Documentation') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index bc693fffabe..f613df8ec7b 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,6 +6,21 @@ be removed from this file. --------------------------- +What: USER_SCHED +When: 2.6.34 + +Why: USER_SCHED was implemented as a proof of concept for group scheduling. + The effect of USER_SCHED can already be achieved from userspace with + the help of libcgroup. The removal of USER_SCHED will also simplify + the scheduler code with the removal of one major ifdef. There are also + issues USER_SCHED has with USER_NS. A decision was taken not to fix + those and instead remove USER_SCHED. Also new group scheduling + features will not be implemented for USER_SCHED. + +Who: Dhaval Giani + +--------------------------- + What: PRISM54 When: 2.6.34 -- cgit v1.2.3-70-g09d2 From 77b44d1b7c28360910cdbd427fb62d485c08674c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 3 Nov 2009 19:12:47 -0500 Subject: tracing/kprobes: Rename Kprobe-tracer to kprobe-event Rename Kprobes-based event tracer to kprobes-based tracing event (kprobe-event), since it is not a tracer but an extensible tracing event interface. This also changes CONFIG_KPROBE_TRACER to CONFIG_KPROBE_EVENT and sets it y by default. Signed-off-by: Masami Hiramatsu Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju LKML-Reference: <20091104001247.3454.14131.stgit@harusame> Signed-off-by: Ingo Molnar --- Documentation/trace/kprobetrace.txt | 34 ++++++++++++++++------------------ kernel/trace/Kconfig | 19 ++++++++++++------- kernel/trace/Makefile | 2 +- kernel/trace/trace_kprobe.c | 6 ++---- 4 files changed, 31 insertions(+), 30 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 15415243a9a..47aabeebbdf 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -1,26 +1,23 @@ - Kprobe-based Event Tracer - ========================= + Kprobe-based Event Tracing + ========================== Documentation is written by Masami Hiramatsu Overview -------- -This tracer is similar to the events tracer which is based on Tracepoint -infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe -and kretprobe). It probes anywhere where kprobes can probe(this means, all -functions body except for __kprobes functions). +These events are similar to tracepoint based events. Instead of Tracepoint, +this is based on kprobes (kprobe and kretprobe). So it can probe wherever +kprobes can probe (this means, all functions body except for __kprobes +functions). Unlike the Tracepoint based event, this can be added and removed +dynamically, on the fly. -Unlike the function tracer, this tracer can probe instructions inside of -kernel functions. It allows you to check which instruction has been executed. +To enable this feature, build your kernel with CONFIG_KPROBE_TRACING=y. -Unlike the Tracepoint based events tracer, this tracer can add and remove -probe points on the fly. - -Similar to the events tracer, this tracer doesn't need to be activated via -current_tracer, instead of that, just set probe points via -/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each -probe events via /sys/kernel/debug/tracing/events/kprobes//filter. +Similar to the events tracer, this doesn't need to be activated via +current_tracer. Instead of that, add probe points via +/sys/kernel/debug/tracing/kprobe_events, and enable it via +/sys/kernel/debug/tracing/events/kprobes//enabled. Synopsis of kprobe_events @@ -55,9 +52,9 @@ Per-Probe Event Filtering ------------------------- Per-probe event filtering feature allows you to set different filter on each probe and gives you what arguments will be shown in trace buffer. If an event -name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds -an event under tracing/events/kprobes/, at the directory you can see -'id', 'enabled', 'format' and 'filter'. +name is specified right after 'p:' or 'r:' in kprobe_events, it adds an event +under tracing/events/kprobes/, at the directory you can see 'id', +'enabled', 'format' and 'filter'. enabled: You can enable/disable the probe by writing 1 or 0 on it. @@ -71,6 +68,7 @@ filter: id: This shows the id of this probe event. + Event Profiling --------------- You can check the total number of probe hits and probe miss-hits via diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 15372a9f239..f05671609a8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -428,17 +428,22 @@ config BLK_DEV_IO_TRACE If unsure, say N. -config KPROBE_TRACER +config KPROBE_EVENT depends on KPROBES depends on X86 - bool "Trace kprobes" + bool "Enable kprobes-based dynamic events" select TRACING - select GENERIC_TRACER + default y help - This tracer probes everywhere where kprobes can probe it, and - records various registers and memories specified by user. - This also allows you to trace kprobe probe points as a dynamic - defined events. It provides per-probe event filtering interface. + This allows the user to add tracing events (similar to tracepoints) on the fly + via the ftrace interface. See Documentation/trace/kprobetrace.txt + for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. If + you want to use perf tools, this option is strongly recommended. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c8cb75d7f28..edc3a3cca1a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o -obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a86c3ac0df2..cf17a6694f3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1,5 +1,5 @@ /* - * kprobe based kernel tracer + * Kprobes-based tracing events * * Created by Masami Hiramatsu * @@ -57,8 +57,6 @@ const char *reserved_field_names[] = { FIELD_STRING_FUNC, }; -/* currently, trace_kprobe only supports X86. */ - struct fetch_func { unsigned long (*func)(struct pt_regs *, void *); void *data; @@ -191,7 +189,7 @@ static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) } /** - * Kprobe tracer core functions + * Kprobe event core functions */ struct probe_arg { -- cgit v1.2.3-70-g09d2 From 91284224da5b15ec6c2b45e10fa5eccd1c92a204 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 18 Oct 2009 23:32:33 +0200 Subject: pcmcia: add new CIS access helpers As a replacement to pcmcia_get_{first,next}_tuple() and pcmcia_get_tuple_data(), three new -- and easier to use -- functions are added: - pcmcia_get_tuple() to get the very first CIS entry of one type. - pcmcia_loop_tuple() to loop over all CIS entries of one type. - pcmcia_get_mac_from_cis() to read out the hardware MAC address from CISTPL_FUNCE. Only a handful of drivers need these functions anyway, as most CIS access is already handled by pcmcia_loop_config(), which now shares the same backed (pccard_loop_tuple()) with pcmcia_loop_tuple(). A pcmcia_get_mac_from_cis() bug noted by Komuro has been fixed in this revision. Signed-off-by: Dominik Brodowski --- Documentation/pcmcia/driver-changes.txt | 7 ++ drivers/pcmcia/cistpl.c | 61 +++++++++ drivers/pcmcia/cs_internal.h | 7 ++ drivers/pcmcia/pcmcia_resource.c | 217 +++++++++++++++++++++++++++----- drivers/pcmcia/rsrc_mgr.c | 1 + include/pcmcia/ds.h | 57 ++++++--- 6 files changed, 304 insertions(+), 46 deletions(-) (limited to 'Documentation') diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt index 059934363ca..adfb83e5867 100644 --- a/Documentation/pcmcia/driver-changes.txt +++ b/Documentation/pcmcia/driver-changes.txt @@ -1,5 +1,12 @@ This file details changes in 2.6 which affect PCMCIA card driver authors: +* New CIS tuple access (as of 2.6.33) + Instead of pcmcia_get_{first,next}_tuple(), pcmcia_get_tuple_data() and + pcmcia_parse_tuple(), a driver shall use "pcmcia_get_tuple()" if it is + only interested in one (raw) tuple, or "pcmcia_loop_tuple()" if it is + interested in all tuples of one type. To decode the MAC from CISTPL_FUNCE, + a new helper "pcmcia_get_mac_from_cis()" was added. + * New configuration loop helper (as of 2.6.28) By calling pcmcia_loop_config(), a driver can iterate over all available configuration options. During a driver's probe() phase, one doesn't need diff --git a/drivers/pcmcia/cistpl.c b/drivers/pcmcia/cistpl.c index 6c4a4fc8363..24dd3c1eac0 100644 --- a/drivers/pcmcia/cistpl.c +++ b/drivers/pcmcia/cistpl.c @@ -1482,6 +1482,67 @@ done: } EXPORT_SYMBOL(pccard_read_tuple); + +/** + * pccard_loop_tuple() - loop over tuples in the CIS + * @s: the struct pcmcia_socket where the card is inserted + * @function: the device function we loop for + * @code: which CIS code shall we look for? + * @parse: buffer where the tuple shall be parsed (or NULL, if no parse) + * @priv_data: private data to be passed to the loop_tuple function. + * @loop_tuple: function to call for each CIS entry of type @function. IT + * gets passed the raw tuple, the paresed tuple (if @parse is + * set) and @priv_data. + * + * pccard_loop_tuple() loops over all CIS entries of type @function, and + * calls the @loop_tuple function for each entry. If the call to @loop_tuple + * returns 0, the loop exits. Returns 0 on success or errorcode otherwise. + */ +int pccard_loop_tuple(struct pcmcia_socket *s, unsigned int function, + cisdata_t code, cisparse_t *parse, void *priv_data, + int (*loop_tuple) (tuple_t *tuple, + cisparse_t *parse, + void *priv_data)) +{ + tuple_t tuple; + cisdata_t *buf; + int ret; + + buf = kzalloc(256, GFP_KERNEL); + if (buf == NULL) { + dev_printk(KERN_WARNING, &s->dev, "no memory to read tuple\n"); + return -ENOMEM; + } + + tuple.TupleData = buf; + tuple.TupleDataMax = 255; + tuple.TupleOffset = 0; + tuple.DesiredTuple = code; + tuple.Attributes = 0; + + ret = pccard_get_first_tuple(s, function, &tuple); + while (!ret) { + if (pccard_get_tuple_data(s, &tuple)) + goto next_entry; + + if (parse) + if (pcmcia_parse_tuple(&tuple, parse)) + goto next_entry; + + ret = loop_tuple(&tuple, parse, priv_data); + if (!ret) + break; + +next_entry: + ret = pccard_get_next_tuple(s, function, &tuple); + } + + kfree(buf); + return ret; +} +EXPORT_SYMBOL(pccard_loop_tuple); + + /*====================================================================== This tries to determine if a card has a sensible CIS. It returns diff --git a/drivers/pcmcia/cs_internal.h b/drivers/pcmcia/cs_internal.h index 1f4098f1354..06a14c951e9 100644 --- a/drivers/pcmcia/cs_internal.h +++ b/drivers/pcmcia/cs_internal.h @@ -199,6 +199,13 @@ int pcmcia_replace_cis(struct pcmcia_socket *s, const u8 *data, const size_t len); int pccard_validate_cis(struct pcmcia_socket *s, unsigned int *count); +/* loop over CIS entries */ +int pccard_loop_tuple(struct pcmcia_socket *s, unsigned int function, + cisdata_t code, cisparse_t *parse, void *priv_data, + int (*loop_tuple) (tuple_t *tuple, + cisparse_t *parse, + void *priv_data)); + /* rsrc_mgr.c */ int pcmcia_validate_mem(struct pcmcia_socket *s); struct resource *pcmcia_find_io_region(unsigned long base, diff --git a/drivers/pcmcia/pcmcia_resource.c b/drivers/pcmcia/pcmcia_resource.c index d919e96c0af..0bfb05aa8f8 100644 --- a/drivers/pcmcia/pcmcia_resource.c +++ b/drivers/pcmcia/pcmcia_resource.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -885,12 +886,39 @@ EXPORT_SYMBOL(pcmcia_disable_device); struct pcmcia_cfg_mem { - tuple_t tuple; + struct pcmcia_device *p_dev; + void *priv_data; + int (*conf_check) (struct pcmcia_device *p_dev, + cistpl_cftable_entry_t *cfg, + cistpl_cftable_entry_t *dflt, + unsigned int vcc, + void *priv_data); cisparse_t parse; - u8 buf[256]; cistpl_cftable_entry_t dflt; }; +/** + * pcmcia_do_loop_config() - internal helper for pcmcia_loop_config() + * + * pcmcia_do_loop_config() is the internal callback for the call from + * pcmcia_loop_config() to pccard_loop_tuple(). Data is transferred + * by a struct pcmcia_cfg_mem. + */ +static int pcmcia_do_loop_config(tuple_t *tuple, cisparse_t *parse, void *priv) +{ + cistpl_cftable_entry_t *cfg = &parse->cftable_entry; + struct pcmcia_cfg_mem *cfg_mem = priv; + + /* default values */ + cfg_mem->p_dev->conf.ConfigIndex = cfg->index; + if (cfg->flags & CISTPL_CFTABLE_DEFAULT) + cfg_mem->dflt = *cfg; + + return cfg_mem->conf_check(cfg_mem->p_dev, cfg, &cfg_mem->dflt, + cfg_mem->p_dev->socket->socket.Vcc, + cfg_mem->priv_data); +} + /** * pcmcia_loop_config() - loop over configuration options * @p_dev: the struct pcmcia_device which we need to loop for. @@ -913,48 +941,173 @@ int pcmcia_loop_config(struct pcmcia_device *p_dev, void *priv_data) { struct pcmcia_cfg_mem *cfg_mem; - - tuple_t *tuple; int ret; - unsigned int vcc; cfg_mem = kzalloc(sizeof(struct pcmcia_cfg_mem), GFP_KERNEL); if (cfg_mem == NULL) return -ENOMEM; - /* get the current Vcc setting */ - vcc = p_dev->socket->socket.Vcc; + cfg_mem->p_dev = p_dev; + cfg_mem->conf_check = conf_check; + cfg_mem->priv_data = priv_data; - tuple = &cfg_mem->tuple; - tuple->TupleData = cfg_mem->buf; - tuple->TupleDataMax = 255; - tuple->TupleOffset = 0; - tuple->DesiredTuple = CISTPL_CFTABLE_ENTRY; - tuple->Attributes = 0; + ret = pccard_loop_tuple(p_dev->socket, p_dev->func, + CISTPL_CFTABLE_ENTRY, &cfg_mem->parse, + cfg_mem, pcmcia_do_loop_config); - ret = pcmcia_get_first_tuple(p_dev, tuple); - while (!ret) { - cistpl_cftable_entry_t *cfg = &cfg_mem->parse.cftable_entry; + kfree(cfg_mem); + return ret; +} +EXPORT_SYMBOL(pcmcia_loop_config); - if (pcmcia_get_tuple_data(p_dev, tuple)) - goto next_entry; - if (pcmcia_parse_tuple(tuple, &cfg_mem->parse)) - goto next_entry; +struct pcmcia_loop_mem { + struct pcmcia_device *p_dev; + void *priv_data; + int (*loop_tuple) (struct pcmcia_device *p_dev, + tuple_t *tuple, + void *priv_data); +}; - /* default values */ - p_dev->conf.ConfigIndex = cfg->index; - if (cfg->flags & CISTPL_CFTABLE_DEFAULT) - cfg_mem->dflt = *cfg; +/** + * pcmcia_do_loop_tuple() - internal helper for pcmcia_loop_config() + * + * pcmcia_do_loop_tuple() is the internal callback for the call from + * pcmcia_loop_tuple() to pccard_loop_tuple(). Data is transferred + * by a struct pcmcia_cfg_mem. + */ +static int pcmcia_do_loop_tuple(tuple_t *tuple, cisparse_t *parse, void *priv) +{ + struct pcmcia_loop_mem *loop = priv; - ret = conf_check(p_dev, cfg, &cfg_mem->dflt, vcc, priv_data); - if (!ret) - break; + return loop->loop_tuple(loop->p_dev, tuple, loop->priv_data); +}; + +/** + * pcmcia_loop_tuple() - loop over tuples in the CIS + * @p_dev: the struct pcmcia_device which we need to loop for. + * @code: which CIS code shall we look for? + * @priv_data: private data to be passed to the loop_tuple function. + * @loop_tuple: function to call for each CIS entry of type @function. IT + * gets passed the raw tuple and @priv_data. + * + * pcmcia_loop_tuple() loops over all CIS entries of type @function, and + * calls the @loop_tuple function for each entry. If the call to @loop_tuple + * returns 0, the loop exits. Returns 0 on success or errorcode otherwise. + */ +int pcmcia_loop_tuple(struct pcmcia_device *p_dev, cisdata_t code, + int (*loop_tuple) (struct pcmcia_device *p_dev, + tuple_t *tuple, + void *priv_data), + void *priv_data) +{ + struct pcmcia_loop_mem loop = { + .p_dev = p_dev, + .loop_tuple = loop_tuple, + .priv_data = priv_data}; + + return pccard_loop_tuple(p_dev->socket, p_dev->func, code, NULL, + &loop, pcmcia_do_loop_tuple); +}; +EXPORT_SYMBOL(pcmcia_loop_tuple); -next_entry: - ret = pcmcia_get_next_tuple(p_dev, tuple); + +struct pcmcia_loop_get { + size_t len; + cisdata_t **buf; +}; + +/** + * pcmcia_do_get_tuple() - internal helper for pcmcia_get_tuple() + * + * pcmcia_do_get_tuple() is the internal callback for the call from + * pcmcia_get_tuple() to pcmcia_loop_tuple(). As we're only interested in + * the first tuple, return 0 unconditionally. Create a memory buffer large + * enough to hold the content of the tuple, and fill it with the tuple data. + * The caller is responsible to free the buffer. + */ +static int pcmcia_do_get_tuple(struct pcmcia_device *p_dev, tuple_t *tuple, + void *priv) +{ + struct pcmcia_loop_get *get = priv; + + *get->buf = kzalloc(tuple->TupleDataLen, GFP_KERNEL); + if (*get->buf) { + get->len = tuple->TupleDataLen; + memcpy(*get->buf, tuple->TupleData, tuple->TupleDataLen); } + return 0; +}; + +/** + * pcmcia_get_tuple() - get first tuple from CIS + * @p_dev: the struct pcmcia_device which we need to loop for. + * @code: which CIS code shall we look for? + * @buf: pointer to store the buffer to. + * + * pcmcia_get_tuple() gets the content of the first CIS entry of type @code. + * It returns the buffer length (or zero). The caller is responsible to free + * the buffer passed in @buf. + */ +size_t pcmcia_get_tuple(struct pcmcia_device *p_dev, cisdata_t code, + unsigned char **buf) +{ + struct pcmcia_loop_get get = { + .len = 0, + .buf = buf, + }; + + *get.buf = NULL; + pcmcia_loop_tuple(p_dev, code, pcmcia_do_get_tuple, &get); + + return get.len; +}; +EXPORT_SYMBOL(pcmcia_get_tuple); + + +/** + * pcmcia_do_get_mac() - internal helper for pcmcia_get_mac_from_cis() + * + * pcmcia_do_get_mac() is the internal callback for the call from + * pcmcia_get_mac_from_cis() to pcmcia_loop_tuple(). We check whether the + * tuple contains a proper LAN_NODE_ID of length 6, and copy the data + * to struct net_device->dev_addr[i]. + */ +static int pcmcia_do_get_mac(struct pcmcia_device *p_dev, tuple_t *tuple, + void *priv) +{ + struct net_device *dev = priv; + int i; + + if (tuple->TupleData[0] != CISTPL_FUNCE_LAN_NODE_ID) + return -EINVAL; + if (tuple->TupleDataLen < ETH_ALEN + 2) { + dev_warn(&p_dev->dev, "Invalid CIS tuple length for " + "LAN_NODE_ID\n"); + return -EINVAL; + } + + if (tuple->TupleData[1] != ETH_ALEN) { + dev_warn(&p_dev->dev, "Invalid header for LAN_NODE_ID\n"); + return -EINVAL; + } + for (i = 0; i < 6; i++) + dev->dev_addr[i] = tuple->TupleData[i+2]; + return 0; +}; + +/** + * pcmcia_get_mac_from_cis() - read out MAC address from CISTPL_FUNCE + * @p_dev: the struct pcmcia_device for which we want the address. + * @dev: a properly prepared struct net_device to store the info to. + * + * pcmcia_get_mac_from_cis() reads out the hardware MAC address from + * CISTPL_FUNCE and stores it into struct net_device *dev->dev_addr which + * must be set up properly by the driver (see examples!). + */ +int pcmcia_get_mac_from_cis(struct pcmcia_device *p_dev, struct net_device *dev) +{ + return pcmcia_loop_tuple(p_dev, CISTPL_FUNCE, pcmcia_do_get_mac, dev); +}; +EXPORT_SYMBOL(pcmcia_get_mac_from_cis); - return ret; -} -EXPORT_SYMBOL(pcmcia_loop_config); diff --git a/drivers/pcmcia/rsrc_mgr.c b/drivers/pcmcia/rsrc_mgr.c index e592e0e0d7e..de0e770ce6a 100644 --- a/drivers/pcmcia/rsrc_mgr.c +++ b/drivers/pcmcia/rsrc_mgr.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cs_internal.h" diff --git a/include/pcmcia/ds.h b/include/pcmcia/ds.h index a2be80b9a09..2eb6e24d1a6 100644 --- a/include/pcmcia/ds.h +++ b/include/pcmcia/ds.h @@ -34,6 +34,7 @@ struct pcmcia_socket; struct pcmcia_device; struct config_t; +struct net_device; /* dynamic device IDs for PCMCIA device drivers. See * Documentation/pcmcia/driver.txt for details. @@ -176,26 +177,39 @@ const char *pcmcia_error_ret(int ret); pcmcia_error_ret(ret)); \ } -/* CIS access. - * Use the pcmcia_* versions in PCMCIA drivers + +/* + * CIS access. + * + * Please use the following functions to access CIS tuples: + * - pcmcia_get_tuple() + * - pcmcia_loop_tuple() + * - pcmcia_get_mac_from_cis() + * + * To parse a tuple_t, pcmcia_parse_tuple() exists. Its interface + * might change in future. */ -int pcmcia_parse_tuple(tuple_t *tuple, cisparse_t *parse); -int pccard_get_first_tuple(struct pcmcia_socket *s, unsigned int function, - tuple_t *tuple); -#define pcmcia_get_first_tuple(p_dev, tuple) \ - pccard_get_first_tuple(p_dev->socket, p_dev->func, tuple) +/* get the very first CIS entry of type @code. Note that buf is pointer + * to u8 *buf; and that you need to kfree(buf) afterwards. */ +size_t pcmcia_get_tuple(struct pcmcia_device *p_dev, cisdata_t code, + u8 **buf); -int pccard_get_next_tuple(struct pcmcia_socket *s, unsigned int function, - tuple_t *tuple); -#define pcmcia_get_next_tuple(p_dev, tuple) \ - pccard_get_next_tuple(p_dev->socket, p_dev->func, tuple) +/* loop over CIS entries */ +int pcmcia_loop_tuple(struct pcmcia_device *p_dev, cisdata_t code, + int (*loop_tuple) (struct pcmcia_device *p_dev, + tuple_t *tuple, + void *priv_data), + void *priv_data); -int pccard_get_tuple_data(struct pcmcia_socket *s, tuple_t *tuple); -#define pcmcia_get_tuple_data(p_dev, tuple) \ - pccard_get_tuple_data(p_dev->socket, tuple) +/* get the MAC address from CISTPL_FUNCE */ +int pcmcia_get_mac_from_cis(struct pcmcia_device *p_dev, + struct net_device *dev); +/* parse a tuple_t */ +int pcmcia_parse_tuple(tuple_t *tuple, cisparse_t *parse); + /* loop CIS entries for valid configuration */ int pcmcia_loop_config(struct pcmcia_device *p_dev, int (*conf_check) (struct pcmcia_device *p_dev, @@ -215,6 +229,21 @@ int pcmcia_reset_card(struct pcmcia_socket *skt); int pcmcia_access_configuration_register(struct pcmcia_device *p_dev, conf_reg_t *reg); +/* deprecated -- do not use in drivers. */ +int pccard_get_first_tuple(struct pcmcia_socket *s, unsigned int function, + tuple_t *tuple); +#define pcmcia_get_first_tuple(p_dev, tuple) \ + pccard_get_first_tuple(p_dev->socket, p_dev->func, tuple) + +int pccard_get_next_tuple(struct pcmcia_socket *s, unsigned int function, + tuple_t *tuple); +#define pcmcia_get_next_tuple(p_dev, tuple) \ + pccard_get_next_tuple(p_dev->socket, p_dev->func, tuple) + +int pccard_get_tuple_data(struct pcmcia_socket *s, tuple_t *tuple); +#define pcmcia_get_tuple_data(p_dev, tuple) \ + pccard_get_tuple_data(p_dev->socket, tuple) + /* device configuration */ int pcmcia_request_io(struct pcmcia_device *p_dev, io_req_t *req); int pcmcia_request_irq(struct pcmcia_device *p_dev, irq_req_t *req); -- cgit v1.2.3-70-g09d2 From 9cb495bb41f07a3ebfc60d3b9d26017a1fd7050c Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 24 Oct 2009 15:57:22 +0200 Subject: pcmcia: remove now-defunct cs_error, pcmcia_error_{func,ret} As all in-tree drivers have been converted to not use cs_error() any more, drop these functions and definitions, and update the Documentation. Signed-off-by: Dominik Brodowski --- Documentation/pcmcia/driver-changes.txt | 5 ++ drivers/pcmcia/ds.c | 101 -------------------------------- include/pcmcia/ds.h | 36 ------------ 3 files changed, 5 insertions(+), 137 deletions(-) (limited to 'Documentation') diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt index adfb83e5867..446f43b309d 100644 --- a/Documentation/pcmcia/driver-changes.txt +++ b/Documentation/pcmcia/driver-changes.txt @@ -1,5 +1,10 @@ This file details changes in 2.6 which affect PCMCIA card driver authors: +* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33) + Instead of the cs_error() callback or the CS_CHECK() macro, please use + Linux-style checking of return values, and -- if necessary -- debug + messages using "dev_dbg()" or "pr_debug()". + * New CIS tuple access (as of 2.6.33) Instead of pcmcia_get_{first,next}_tuple(), pcmcia_get_tuple_data() and pcmcia_parse_tuple(), a driver shall use "pcmcia_get_tuple()" if it is diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c index 5b069aeaf17..05893d41dd4 100644 --- a/drivers/pcmcia/ds.c +++ b/drivers/pcmcia/ds.c @@ -46,107 +46,6 @@ spinlock_t pcmcia_dev_list_lock; /*====================================================================*/ -/* code which was in cs.c before */ - -/* String tables for error messages */ - -typedef struct lookup_t { - const int key; - const char *msg; -} lookup_t; - -static const lookup_t error_table[] = { - { 0, "Operation succeeded" }, - { -EIO, "Input/Output error" }, - { -ENODEV, "No card present" }, - { -EINVAL, "Bad parameter" }, - { -EACCES, "Configuration locked" }, - { -EBUSY, "Resource in use" }, - { -ENOSPC, "No more items" }, - { -ENOMEM, "Out of resource" }, -}; - - -static const lookup_t service_table[] = { - { AccessConfigurationRegister, "AccessConfigurationRegister" }, - { AddSocketServices, "AddSocketServices" }, - { AdjustResourceInfo, "AdjustResourceInfo" }, - { CheckEraseQueue, "CheckEraseQueue" }, - { CloseMemory, "CloseMemory" }, - { DeregisterClient, "DeregisterClient" }, - { DeregisterEraseQueue, "DeregisterEraseQueue" }, - { GetCardServicesInfo, "GetCardServicesInfo" }, - { GetClientInfo, "GetClientInfo" }, - { GetConfigurationInfo, "GetConfigurationInfo" }, - { GetEventMask, "GetEventMask" }, - { GetFirstClient, "GetFirstClient" }, - { GetFirstRegion, "GetFirstRegion" }, - { GetFirstTuple, "GetFirstTuple" }, - { GetNextClient, "GetNextClient" }, - { GetNextRegion, "GetNextRegion" }, - { GetNextTuple, "GetNextTuple" }, - { GetStatus, "GetStatus" }, - { GetTupleData, "GetTupleData" }, - { MapMemPage, "MapMemPage" }, - { ModifyConfiguration, "ModifyConfiguration" }, - { ModifyWindow, "ModifyWindow" }, - { OpenMemory, "OpenMemory" }, - { ParseTuple, "ParseTuple" }, - { ReadMemory, "ReadMemory" }, - { RegisterClient, "RegisterClient" }, - { RegisterEraseQueue, "RegisterEraseQueue" }, - { RegisterMTD, "RegisterMTD" }, - { ReleaseConfiguration, "ReleaseConfiguration" }, - { ReleaseIO, "ReleaseIO" }, - { ReleaseIRQ, "ReleaseIRQ" }, - { ReleaseWindow, "ReleaseWindow" }, - { RequestConfiguration, "RequestConfiguration" }, - { RequestIO, "RequestIO" }, - { RequestIRQ, "RequestIRQ" }, - { RequestSocketMask, "RequestSocketMask" }, - { RequestWindow, "RequestWindow" }, - { ResetCard, "ResetCard" }, - { SetEventMask, "SetEventMask" }, - { ValidateCIS, "ValidateCIS" }, - { WriteMemory, "WriteMemory" }, - { BindDevice, "BindDevice" }, - { BindMTD, "BindMTD" }, - { ReportError, "ReportError" }, - { SuspendCard, "SuspendCard" }, - { ResumeCard, "ResumeCard" }, - { EjectCard, "EjectCard" }, - { InsertCard, "InsertCard" }, - { ReplaceCIS, "ReplaceCIS" } -}; - -const char *pcmcia_error_func(int func) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(service_table); i++) - if (service_table[i].key == func) - return service_table[i].msg; - - return "Unknown service number"; -} -EXPORT_SYMBOL(pcmcia_error_func); - -const char *pcmcia_error_ret(int ret) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(error_table); i++) - if (error_table[i].key == ret) - return error_table[i].msg; - - return "unknown"; -} -EXPORT_SYMBOL(pcmcia_error_ret); - -/*======================================================================*/ - - - static void pcmcia_check_driver(struct pcmcia_driver *p_drv) { struct pcmcia_device_id *did = p_drv->id_table; diff --git a/include/pcmcia/ds.h b/include/pcmcia/ds.h index 6c37d4ed783..d82392de4e9 100644 --- a/include/pcmcia/ds.h +++ b/include/pcmcia/ds.h @@ -142,42 +142,6 @@ struct pcmcia_device { #define handle_to_dev(handle) (handle->dev) -/* (deprecated) error reporting by PCMCIA devices. Use dev_printk() - * or dev_dbg() directly in the driver, without referring to pcmcia_error_func() - * and/or pcmcia_error_ret() for those functions will go away soon. - */ -enum service { - AccessConfigurationRegister, AddSocketServices, - AdjustResourceInfo, CheckEraseQueue, CloseMemory, CopyMemory, - DeregisterClient, DeregisterEraseQueue, GetCardServicesInfo, - GetClientInfo, GetConfigurationInfo, GetEventMask, - GetFirstClient, GetFirstPartion, GetFirstRegion, GetFirstTuple, - GetNextClient, GetNextPartition, GetNextRegion, GetNextTuple, - GetStatus, GetTupleData, MapLogSocket, MapLogWindow, MapMemPage, - MapPhySocket, MapPhyWindow, ModifyConfiguration, ModifyWindow, - OpenMemory, ParseTuple, ReadMemory, RegisterClient, - RegisterEraseQueue, RegisterMTD, RegisterTimer, - ReleaseConfiguration, ReleaseExclusive, ReleaseIO, ReleaseIRQ, - ReleaseSocketMask, ReleaseWindow, ReplaceSocketServices, - RequestConfiguration, RequestExclusive, RequestIO, RequestIRQ, - RequestSocketMask, RequestWindow, ResetCard, ReturnSSEntry, - SetEventMask, SetRegion, ValidateCIS, VendorSpecific, - WriteMemory, BindDevice, BindMTD, ReportError, - SuspendCard, ResumeCard, EjectCard, InsertCard, ReplaceCIS, - GetFirstWindow, GetNextWindow, GetMemPage -}; -const char *pcmcia_error_func(int func); -const char *pcmcia_error_ret(int ret); - -#define cs_error(p_dev, func, ret) \ - { \ - dev_printk(KERN_NOTICE, &p_dev->dev, \ - "%s : %s\n", \ - pcmcia_error_func(func), \ - pcmcia_error_ret(ret)); \ - } - - /* * CIS access. * -- cgit v1.2.3-70-g09d2 From f84d49b218b7d4c6cba2e0b41f24bd4045403962 Mon Sep 17 00:00:00 2001 From: Naohiro Ooiwa Date: Mon, 9 Nov 2009 00:46:42 +0900 Subject: signal: Print warning message when dropping signals When the system has too many timers or too many aggregate queued signals, the EAGAIN error is returned to application from kernel, including timer_create() [POSIX.1b]. It means that the app exceeded the limit of pending signals, but in general application writers do not expect this outcome and the current silent failure can cause rare app failures under very high load. This patch adds a new message when we reach the limit and if print_fatal_signals is enabled: task/1234: reached RLIMIT_SIGPENDING, dropping signal If you see this message and your system behaved unexpectedly, you can run following command to lift the limit: # ulimit -i unlimited With help from Hiroshi Shimamoto . Signed-off-by: Naohiro Ooiwa Cc: Andrew Morton Cc: Hiroshi Shimamoto Cc: Roland McGrath Cc: Peter Zijlstra Cc: oleg@redhat.com LKML-Reference: <4AF6E7E2.9080406@miraclelinux.com> [ Modified a few small details, gave surrounding code some love. ] Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 11 +++++++-- kernel/signal.c | 46 ++++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 15 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9107b387e91..3bbd92f805a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2032,8 +2032,15 @@ and is between 256 and 4096 characters. It is defined in the file print-fatal-signals= [KNL] debug: print fatal signals - print-fatal-signals=1: print segfault info to - the kernel console. + + If enabled, warn about various signal handling + related application anomalies: too many signals, + too many POSIX.1 timers, fatal signals causing a + coredump - etc. + + If you hit the warning due to signal overflow, + you might want to try "ulimit -i unlimited". + default: off. printk.time= Show timing data prefixed to each printk message line diff --git a/kernel/signal.c b/kernel/signal.c index 6705320784f..fe08008133d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,8 @@ static struct kmem_cache *sigqueue_cachep; +int print_fatal_signals __read_mostly; + static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; @@ -159,7 +162,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; - + s = pending->signal.sig; m = mask->sig; switch (_NSIG_WORDS) { @@ -184,17 +187,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask) sig = ffz(~x) + 1; break; } - + return sig; } +static inline void print_dropped_signal(int sig) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!print_fatal_signals) + return; + + if (!__ratelimit(&ratelimit_state)) + return; + + printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + current->comm, current->pid, sig); +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appopriate lock must be held to stop the target task from exiting */ -static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - int override_rlimit) +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; struct user_struct *user; @@ -207,10 +224,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, */ user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); + if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { q = kmem_cache_alloc(sigqueue_cachep, flags); + } else { + print_dropped_signal(sig); + } + if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); free_uid(user); @@ -869,7 +891,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, + q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); @@ -925,8 +947,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, return __send_signal(sig, info, t, group, from_ancestor_ns); } -int print_fatal_signals; - static void print_fatal_signal(struct pt_regs *regs, int signr) { printk("%s/%d: potentially unexpected fatal signal %d.\n", @@ -1293,19 +1313,19 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of Posix Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. */ - struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q; + struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); - if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) + if (q) q->flags |= SIGQUEUE_PREALLOC; - return(q); + + return q; } void sigqueue_free(struct sigqueue *q) -- cgit v1.2.3-70-g09d2 From f6630114d9198aa959ac95c131334c020038f253 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 17 Nov 2009 18:22:15 -0600 Subject: sched: Limit the number of scheduler debug messages Remove the verbose scheduler debug messages unless kernel parameter "sched_debug" set. /proc/sched_debug unchanged. Signed-off-by: Mike Travis Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091118002221.489305000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 2 ++ kernel/sched.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9107b387e91..f2a9507b27b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2182,6 +2182,8 @@ and is between 256 and 4096 characters. It is defined in the file sbni= [NET] Granch SBNI12 leased line adapter + sched_debug [KNL] Enables verbose scheduler debug messages. + sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver Format: [,[,]] diff --git a/kernel/sched.c b/kernel/sched.c index a57c6aee6d4..48ff66a6892 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7720,6 +7720,16 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ + sched_domain_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { @@ -7806,6 +7816,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) cpumask_var_t groupmask; int level = 0; + if (!sched_domain_debug_enabled) + return; + if (!sd) { printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); return; -- cgit v1.2.3-70-g09d2 From d1eb650ff4130972fa21462fa49cd35a2865403b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 24 Nov 2009 16:56:45 -0500 Subject: tracepoint: Move signal sending tracepoint to events/signal.h Move signal sending event to events/signal.h. This patch also renames sched_signal_send event to signal_generate. Changes in v4: - Fix a typo of task_struct pointer. Changes in v3: - Add docbook style comments Changes in v2: - Add siginfo argument - Add siginfo storing macro Signed-off-by: Masami Hiramatsu Reviewed-by: Jason Baron Acked-by: Roland McGrath Cc: systemtap Cc: DLE Cc: Oleg Nesterov LKML-Reference: <20091124215645.30449.60208.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/tracepoint.tmpl | 5 +++ include/trace/events/sched.h | 25 ------------- include/trace/events/signal.h | 66 +++++++++++++++++++++++++++++++++++ kernel/signal.c | 5 +-- 4 files changed, 74 insertions(+), 27 deletions(-) create mode 100644 include/trace/events/signal.h (limited to 'Documentation') diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl index b0756d0fd57..8bca1d5cec0 100644 --- a/Documentation/DocBook/tracepoint.tmpl +++ b/Documentation/DocBook/tracepoint.tmpl @@ -86,4 +86,9 @@ !Iinclude/trace/events/irq.h + + SIGNAL +!Iinclude/trace/events/signal.h + + diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9d316b22388..cfceb0b73e2 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -287,31 +287,6 @@ TRACE_EVENT(sched_process_fork, __entry->child_comm, __entry->child_pid) ); -/* - * Tracepoint for sending a signal: - */ -TRACE_EVENT(sched_signal_send, - - TP_PROTO(int sig, struct task_struct *p), - - TP_ARGS(sig, p), - - TP_STRUCT__entry( - __field( int, sig ) - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->sig = sig; - ), - - TP_printk("sig=%d comm=%s pid=%d", - __entry->sig, __entry->comm, __entry->pid) -); - /* * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE * adding sched_stat support to SCHED_FIFO/RR would be welcome. diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h new file mode 100644 index 00000000000..ef51756a801 --- /dev/null +++ b/include/trace/events/signal.h @@ -0,0 +1,66 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM signal + +#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SIGNAL_H + +#include +#include +#include + +#define TP_STORE_SIGINFO(__entry, info) \ + do { \ + if (info == SEND_SIG_NOINFO) { \ + __entry->errno = 0; \ + __entry->code = SI_USER; \ + } else if (info == SEND_SIG_PRIV) { \ + __entry->errno = 0; \ + __entry->code = SI_KERNEL; \ + } else { \ + __entry->errno = info->si_errno; \ + __entry->code = info->si_code; \ + } \ + } while (0) + +/** + * signal_generate - called when a signal is generated + * @sig: signal number + * @info: pointer to struct siginfo + * @task: pointer to struct task_struct + * + * Current process sends a 'sig' signal to 'task' process with + * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV, + * 'info' is not a pointer and you can't access its field. Instead, + * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV + * means that si_code is SI_KERNEL. + */ +TRACE_EVENT(signal_generate, + + TP_PROTO(int sig, struct siginfo *info, struct task_struct *task), + + TP_ARGS(sig, info, task), + + TP_STRUCT__entry( + __field( int, sig ) + __field( int, errno ) + __field( int, code ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + __entry->sig = sig; + TP_STORE_SIGINFO(__entry, info); + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->pid = task->pid; + ), + + TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d", + __entry->sig, __entry->errno, __entry->code, + __entry->comm, __entry->pid) +); + +#endif /* _TRACE_SIGNAL_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/signal.c b/kernel/signal.c index 6705320784f..a1e0cc6b32c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,8 @@ #include #include #include -#include +#define CREATE_TRACE_POINTS +#include #include #include @@ -834,7 +835,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigqueue *q; int override_rlimit; - trace_sched_signal_send(sig, t); + trace_signal_generate(sig, info, t); assert_spin_locked(&t->sighand->siglock); -- cgit v1.2.3-70-g09d2 From f13a48bd798a159291ca583b95453171b88b7448 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 1 Dec 2009 15:36:11 +0000 Subject: SLOW_WORK: Move slow_work's proc file to debugfs Move slow_work's debugging proc file to debugfs. Signed-off-by: David Howells Requested-and-acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- Documentation/slow-work.txt | 4 +- include/linux/slow-work.h | 8 +- init/Kconfig | 8 +- kernel/Makefile | 2 +- kernel/slow-work-debugfs.c | 227 ++++++++++++++++++++++++++++++++++++++++++++ kernel/slow-work-proc.c | 227 -------------------------------------------- kernel/slow-work.c | 18 ++-- kernel/slow-work.h | 6 +- 8 files changed, 253 insertions(+), 247 deletions(-) create mode 100644 kernel/slow-work-debugfs.c delete mode 100644 kernel/slow-work-proc.c (limited to 'Documentation') diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt index 52bc3143372..9dbf4470c7e 100644 --- a/Documentation/slow-work.txt +++ b/Documentation/slow-work.txt @@ -279,9 +279,9 @@ The slow-work thread pool has a number of configurables: VIEWING EXECUTING AND QUEUED ITEMS ================================== -If CONFIG_SLOW_WORK_PROC is enabled, a proc file is made available: +If CONFIG_SLOW_WORK_DEBUG is enabled, a debugfs file is made available: - /proc/slow_work_rq + /sys/kernel/debug/slow_work/runqueue through which the list of work items being executed and the queues of items to be executed may be viewed. The owner of a work item is given the chance to diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 5035a269173..13337bf6c3f 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -20,7 +20,7 @@ #include struct slow_work; -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG struct seq_file; #endif @@ -42,8 +42,8 @@ struct slow_work_ops { /* execute a work item */ void (*execute)(struct slow_work *work); -#ifdef CONFIG_SLOW_WORK_PROC - /* describe a work item for /proc */ +#ifdef CONFIG_SLOW_WORK_DEBUG + /* describe a work item for debugfs */ void (*desc)(struct slow_work *work, struct seq_file *m); #endif }; @@ -64,7 +64,7 @@ struct slow_work { #define SLOW_WORK_DELAYED 5 /* item is struct delayed_slow_work with active timer */ const struct slow_work_ops *ops; /* operations table for this item */ struct list_head link; /* link in queue */ -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG struct timespec mark; /* jiffies at which queued or exec begun */ #endif }; diff --git a/init/Kconfig b/init/Kconfig index ab5c64801fe..39923ccc287 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1098,12 +1098,12 @@ config SLOW_WORK See Documentation/slow-work.txt. -config SLOW_WORK_PROC - bool "Slow work debugging through /proc" +config SLOW_WORK_DEBUG + bool "Slow work debugging through debugfs" default n - depends on SLOW_WORK && PROC_FS + depends on SLOW_WORK && DEBUG_FS help - Display the contents of the slow work run queue through /proc, + Display the contents of the slow work run queue through debugfs, including items currently executing. See Documentation/slow-work.txt. diff --git a/kernel/Makefile b/kernel/Makefile index 776ffed1556..d7c13d249b2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_SLOW_WORK_PROC) += slow-work-proc.o +obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c new file mode 100644 index 00000000000..e45c4364529 --- /dev/null +++ b/kernel/slow-work-debugfs.c @@ -0,0 +1,227 @@ +/* Slow work debugging + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "slow-work.h" + +#define ITERATOR_SHIFT (BITS_PER_LONG - 4) +#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) +#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) + +void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) +{ + seq_puts(m, "Slow-work: New thread"); +} + +/* + * Render the time mark field on a work item into a 5-char time with units plus + * a space + */ +static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) +{ + struct timespec now, diff; + + now = CURRENT_TIME; + diff = timespec_sub(now, work->mark); + + if (diff.tv_sec < 0) + seq_puts(m, " -ve "); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) + seq_printf(m, "%3luns ", diff.tv_nsec); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) + seq_printf(m, "%3luus ", diff.tv_nsec / 1000); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) + seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); + else if (diff.tv_sec <= 1) + seq_puts(m, " 1s "); + else if (diff.tv_sec < 60) + seq_printf(m, "%4lus ", diff.tv_sec); + else if (diff.tv_sec < 60 * 60) + seq_printf(m, "%4lum ", diff.tv_sec / 60); + else if (diff.tv_sec < 60 * 60 * 24) + seq_printf(m, "%4luh ", diff.tv_sec / 3600); + else + seq_puts(m, "exces "); +} + +/* + * Describe a slow work item for debugfs + */ +static int slow_work_runqueue_show(struct seq_file *m, void *v) +{ + struct slow_work *work; + struct list_head *p = v; + unsigned long id; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); + return 0; + case 2: + seq_puts(m, "=== ===== ================ == ===== ==========\n"); + return 0; + + case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: + id = (unsigned long) v - 3; + + read_lock(&slow_work_execs_lock); + work = slow_work_execs[id]; + if (work) { + smp_read_barrier_depends(); + + seq_printf(m, "%3lu %5d %16p %2lx ", + id, slow_work_pids[id], work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + } + read_unlock(&slow_work_execs_lock); + return 0; + + default: + work = list_entry(p, struct slow_work, link); + seq_printf(m, "%3s - %16p %2lx ", + work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", + work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + return 0; + } +} + +/* + * map the iterator to a work item + */ +static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) +{ + struct list_head *p; + unsigned long count, id; + + switch (*_pos >> ITERATOR_SHIFT) { + case 0x0: + if (*_pos == 0) + *_pos = 1; + if (*_pos < 3) + return (void *)(unsigned long) *_pos; + if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) + for (id = *_pos - 3; + id < SLOW_WORK_THREAD_LIMIT; + id++, (*_pos)++) + if (slow_work_execs[id]) + return (void *)(unsigned long) *_pos; + *_pos = 0x1UL << ITERATOR_SHIFT; + + case 0x1: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &slow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + + case 0x2: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &vslow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) +{ + spin_lock_irq(&slow_work_queue_lock); + return slow_work_runqueue_index(m, _pos); +} + +/* + * move to the next line + */ +static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) +{ + struct list_head *p = v; + unsigned long selector = *_pos >> ITERATOR_SHIFT; + + (*_pos)++; + switch (selector) { + case 0x0: + return slow_work_runqueue_index(m, _pos); + + case 0x1: + if (*_pos >> ITERATOR_SHIFT == 0x1) { + p = p->next; + if (p != &slow_work_queue) + return p; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + p = &vslow_work_queue; + + case 0x2: + if (*_pos >> ITERATOR_SHIFT == 0x2) { + p = p->next; + if (p != &vslow_work_queue) + return p; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * clean up after reading + */ +static void slow_work_runqueue_stop(struct seq_file *m, void *v) +{ + spin_unlock_irq(&slow_work_queue_lock); +} + +static const struct seq_operations slow_work_runqueue_ops = { + .start = slow_work_runqueue_start, + .stop = slow_work_runqueue_stop, + .next = slow_work_runqueue_next, + .show = slow_work_runqueue_show, +}; + +/* + * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents + */ +static int slow_work_runqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slow_work_runqueue_ops); +} + +const struct file_operations slow_work_runqueue_fops = { + .owner = THIS_MODULE, + .open = slow_work_runqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/slow-work-proc.c b/kernel/slow-work-proc.c deleted file mode 100644 index 3988032571f..00000000000 --- a/kernel/slow-work-proc.c +++ /dev/null @@ -1,227 +0,0 @@ -/* Slow work debugging - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include "slow-work.h" - -#define ITERATOR_SHIFT (BITS_PER_LONG - 4) -#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) -#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) - -void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) -{ - seq_puts(m, "Slow-work: New thread"); -} - -/* - * Render the time mark field on a work item into a 5-char time with units plus - * a space - */ -static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) -{ - struct timespec now, diff; - - now = CURRENT_TIME; - diff = timespec_sub(now, work->mark); - - if (diff.tv_sec < 0) - seq_puts(m, " -ve "); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) - seq_printf(m, "%3luns ", diff.tv_nsec); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) - seq_printf(m, "%3luus ", diff.tv_nsec / 1000); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) - seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); - else if (diff.tv_sec <= 1) - seq_puts(m, " 1s "); - else if (diff.tv_sec < 60) - seq_printf(m, "%4lus ", diff.tv_sec); - else if (diff.tv_sec < 60 * 60) - seq_printf(m, "%4lum ", diff.tv_sec / 60); - else if (diff.tv_sec < 60 * 60 * 24) - seq_printf(m, "%4luh ", diff.tv_sec / 3600); - else - seq_puts(m, "exces "); -} - -/* - * Describe a slow work item for /proc - */ -static int slow_work_runqueue_show(struct seq_file *m, void *v) -{ - struct slow_work *work; - struct list_head *p = v; - unsigned long id; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); - return 0; - case 2: - seq_puts(m, "=== ===== ================ == ===== ==========\n"); - return 0; - - case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: - id = (unsigned long) v - 3; - - read_lock(&slow_work_execs_lock); - work = slow_work_execs[id]; - if (work) { - smp_read_barrier_depends(); - - seq_printf(m, "%3lu %5d %16p %2lx ", - id, slow_work_pids[id], work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - } - read_unlock(&slow_work_execs_lock); - return 0; - - default: - work = list_entry(p, struct slow_work, link); - seq_printf(m, "%3s - %16p %2lx ", - work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", - work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - return 0; - } -} - -/* - * map the iterator to a work item - */ -static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) -{ - struct list_head *p; - unsigned long count, id; - - switch (*_pos >> ITERATOR_SHIFT) { - case 0x0: - if (*_pos == 0) - *_pos = 1; - if (*_pos < 3) - return (void *)(unsigned long) *_pos; - if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) - for (id = *_pos - 3; - id < SLOW_WORK_THREAD_LIMIT; - id++, (*_pos)++) - if (slow_work_execs[id]) - return (void *)(unsigned long) *_pos; - *_pos = 0x1UL << ITERATOR_SHIFT; - - case 0x1: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &slow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - - case 0x2: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &vslow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) -{ - spin_lock_irq(&slow_work_queue_lock); - return slow_work_runqueue_index(m, _pos); -} - -/* - * move to the next line - */ -static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) -{ - struct list_head *p = v; - unsigned long selector = *_pos >> ITERATOR_SHIFT; - - (*_pos)++; - switch (selector) { - case 0x0: - return slow_work_runqueue_index(m, _pos); - - case 0x1: - if (*_pos >> ITERATOR_SHIFT == 0x1) { - p = p->next; - if (p != &slow_work_queue) - return p; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - p = &vslow_work_queue; - - case 0x2: - if (*_pos >> ITERATOR_SHIFT == 0x2) { - p = p->next; - if (p != &vslow_work_queue) - return p; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * clean up after reading - */ -static void slow_work_runqueue_stop(struct seq_file *m, void *v) -{ - spin_unlock_irq(&slow_work_queue_lock); -} - -static const struct seq_operations slow_work_runqueue_ops = { - .start = slow_work_runqueue_start, - .stop = slow_work_runqueue_stop, - .next = slow_work_runqueue_next, - .show = slow_work_runqueue_show, -}; - -/* - * open "/proc/slow_work_rq" to list queue contents - */ -static int slow_work_runqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slow_work_runqueue_ops); -} - -const struct file_operations slow_work_runqueue_fops = { - .owner = THIS_MODULE, - .open = slow_work_runqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index b5c17f15f9d..00889bd3c59 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include "slow-work.h" static void slow_work_cull_timeout(unsigned long); @@ -138,7 +138,7 @@ static void slow_work_clear_thread_processing(int id) {} /* * Data for tracking currently executing items for indication through /proc */ -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; DEFINE_RWLOCK(slow_work_execs_lock); @@ -823,7 +823,7 @@ static void slow_work_new_thread_execute(struct slow_work *work) static const struct slow_work_ops slow_work_new_thread_ops = { .owner = THIS_MODULE, .execute = slow_work_new_thread_execute, -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG .desc = slow_work_new_thread_desc, #endif }; @@ -1055,9 +1055,15 @@ static int __init init_slow_work(void) if (slow_work_max_max_threads < nr_cpus * 2) slow_work_max_max_threads = nr_cpus * 2; #endif -#ifdef CONFIG_SLOW_WORK_PROC - proc_create("slow_work_rq", S_IFREG | 0400, NULL, - &slow_work_runqueue_fops); +#ifdef CONFIG_SLOW_WORK_DEBUG + { + struct dentry *dbdir; + + dbdir = debugfs_create_dir("slow_work", NULL); + if (dbdir && !IS_ERR(dbdir)) + debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, + NULL, &slow_work_runqueue_fops); + } #endif return 0; } diff --git a/kernel/slow-work.h b/kernel/slow-work.h index 3c2f007f3ad..321f3c59d73 100644 --- a/kernel/slow-work.h +++ b/kernel/slow-work.h @@ -19,7 +19,7 @@ /* * slow-work.c */ -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG extern struct slow_work *slow_work_execs[]; extern pid_t slow_work_pids[]; extern rwlock_t slow_work_execs_lock; @@ -30,9 +30,9 @@ extern struct list_head vslow_work_queue; extern spinlock_t slow_work_queue_lock; /* - * slow-work-proc.c + * slow-work-debugfs.c */ -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG extern const struct file_operations slow_work_runqueue_fops; extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); -- cgit v1.2.3-70-g09d2