diff options
Diffstat (limited to 'kernel')
43 files changed, 4054 insertions, 1642 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index abf9cf3b95c..cab535c427b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,7 @@ #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -56,10 +56,6 @@ #include <asm/mmu_context.h> #include "cred-internals.h" -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..085f73ebcea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,6 @@ #include <linux/proc_fs.h> #include <linux/blkdev.h> #include <linux/fs_struct.h> -#include <trace/sched.h> #include <linux/magic.h> #include <asm/pgtable.h> @@ -71,6 +70,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <trace/events/sched.h> + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -83,8 +84,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -DEFINE_TRACE(sched_process_fork); - int nr_processes(void) { int cpu; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 26e08754744..5dd2572993c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,8 +17,8 @@ #include <linux/kernel_stat.h> #include <linux/rculist.h> #include <linux/hash.h> -#include <trace/irq.h> #include <linux/bootmem.h> +#include <trace/events/irq.h> #include "internals.h" @@ -348,9 +348,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -DEFINE_TRACE(irq_handler_entry); -DEFINE_TRACE(irq_handler_exit); - /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ebaf8519ab..41c88fe4050 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,7 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #define KTHREAD_NICE_LEVEL (-5) @@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; -DEFINE_TRACE(sched_kthread_stop); -DEFINE_TRACE(sched_kthread_stop_ret); - struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index accb40cdb12..8bbeef996c7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,12 +42,14 @@ #include <linux/hash.h> #include <linux/ftrace.h> #include <linux/stringify.h> -#include <trace/lockdep.h> #include <asm/sections.h> #include "lockdep_internals.h" +#define CREATE_TRACE_POINTS +#include <trace/events/lockdep.h> + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); -DEFINE_TRACE(lock_acquire); - /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); -DEFINE_TRACE(lock_release); - void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { @@ -3105,6 +3103,8 @@ found_it: hlock->holdtime_stamp = now; } + trace_lock_acquired(lock, ip, waittime); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -3120,8 +3120,6 @@ found_it: lock->ip = ip; } -DEFINE_TRACE(lock_contended); - void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; @@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -DEFINE_TRACE(lock_acquired); - void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat)) return; diff --git a/kernel/module.c b/kernel/module.c index e797812a4d9..2383e60fcf3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ */ #include <linux/module.h> #include <linux/moduleloader.h> +#include <linux/ftrace_event.h> #include <linux/init.h> #include <linux/kallsyms.h> #include <linux/fs.h> @@ -1489,9 +1490,6 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); - /* release any pointers to mcount in this module */ - ftrace_release(mod->module_core, mod->core_size); - /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1892,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2172,7 +2168,19 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif - +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(hdr, sechdrs, secstrings, + "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, + "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2237,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod, dynamic_debug_setup(debug, num_debug); } - /* sechdrs[0].sh_size is always zero */ - mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", - sizeof(*mseg), &num_mcount); - ftrace_init_module(mod, mseg, mseg + num_mcount); - err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2302,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); - ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) diff --git a/kernel/sched.c b/kernel/sched.c index 26efa475bdc..14a19b17674 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,13 +72,15 @@ #include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> -#include <trace/sched.h> #include <asm/tlb.h> #include <asm/irq_regs.h> #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - #ifdef CONFIG_SMP static void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1958,7 +1954,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) diff --git a/kernel/signal.c b/kernel/signal.c index d8034737db4..94ec0a4dde0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,7 @@ #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -41,8 +41,6 @@ static struct kmem_cache *sigqueue_cachep; -DEFINE_TRACE(sched_signal_send); - static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd34851..dc4d0cfdcb2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,7 +24,9 @@ #include <linux/ftrace.h> #include <linux/smp.h> #include <linux/tick.h> -#include <trace/irq.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/irq.h> #include <asm/irq.h> /* @@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 -DEFINE_TRACE(softirq_entry); -DEFINE_TRACE(softirq_exit); - asmlinkage void __do_softirq(void) { struct softirq_action *h; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 417d1985e29..50f62a296e1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -48,6 +48,9 @@ config FTRACE_NMI_ENTER depends on HAVE_FTRACE_NMI_ENTER default y +config EVENT_TRACING + bool + config TRACING bool select DEBUG_FS @@ -56,6 +59,7 @@ config TRACING select TRACEPOINTS select NOP_TRACER select BINARY_PRINTF + select EVENT_TRACING # # Minimum requirements an architecture has to meet for us to @@ -73,7 +77,12 @@ config TRACING_SUPPORT if TRACING_SUPPORT -menu "Tracers" +menuconfig FTRACE + bool "Tracers" + help + Enable the kernel tracing infrastructure. + +if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" @@ -104,6 +113,7 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -173,7 +183,7 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. -config EVENT_TRACER +config ENABLE_EVENT_TRACING bool "Trace various events in the kernel" select TRACING help @@ -181,6 +191,10 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. + Note, all tracers enable event tracing. This option is + only a convenience to enable event tracing when no other + tracers are selected. + config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS @@ -207,8 +221,36 @@ config BOOT_TRACER to enable this on bootup. config TRACE_BRANCH_PROFILING - bool "Trace likely/unlikely profiler" + bool select TRACING + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. It will add hooks + into the C conditionals to test which path a branch takes. + + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if statement in the + kernel. This profiler will also enable the likely/unlikely + profiler as well. + + Either of the above profilers add a bit of overhead to the system. + If unsure choose "No branch profiling". + +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES + bool "Trace likely/unlikely profiler" + select TRACE_BRANCH_PROFILING help This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: @@ -218,11 +260,9 @@ config TRACE_BRANCH_PROFILING Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. - Say N if unsure. - config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" - depends on TRACE_BRANCH_PROFILING + select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. @@ -230,11 +270,12 @@ config PROFILE_ALL_BRANCHES /debugfs/tracing/profile_branch + This option also enables the likely/unlikely profiler. + This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system is to be analyzed - - Say N if unsure. +endchoice config TRACING_BRANCHES bool @@ -375,6 +416,20 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on FUNCTION_TRACER + default n + help + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A file in the trace_stats + directory called functions, that show the list of functions that + have been hit and their counters. + + If in doubt, say N + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE @@ -416,7 +471,23 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. -endmenu +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and bench mark it. + It creates its own ring buffer such that it will not interfer with + any other users of the ring buffer (such as ftrace). It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N + +endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec..7c34cbfff96 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ endif obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o @@ -40,11 +41,10 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o -obj-$(CONFIG_EVENT_TRACER) += trace_events.o -obj-$(CONFIG_EVENT_TRACER) += events.o -obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_EVENT_TRACING) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o -obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0b..e099f8cc1d1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; - if (sector < bt->start_lba || sector > bt->end_lba) + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; @@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, DISCARD); pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) + if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); @@ -403,11 +403,29 @@ static struct rchan_callbacks blk_relay_callbacks = { .remove_buf_file = blk_remove_buf_file_callback, }; +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + struct hd_struct *part = NULL; + + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + /* * Setup everything required to start tracing */ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; @@ -480,10 +498,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) - bt->end_lba = -1ULL; + blk_trace_setup_lba(bt, bdev); + + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; bt->pid = buts->pid; bt->trace_state = Blktrace_setup; @@ -505,6 +526,7 @@ err: } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; @@ -514,7 +536,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; - ret = do_blk_trace_setup(q, name, dev, &buts); + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; @@ -582,7 +604,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; case BLKTRACESTART: start = 1; @@ -809,7 +831,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @bio: the source bio * @dev: target device * @from: source sector - * @to: target sector * * Description: * Device mapper or raid target sometimes need to split a bio because @@ -817,7 +838,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -825,12 +846,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), + sizeof(r), &r); } /** @@ -971,6 +993,16 @@ static inline const void *pdu_start(const struct trace_entry *ent) return te_blk_io_trace(ent) + 1; } +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; @@ -996,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent, struct blk_io_trace_remap *r) { const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; + __u64 sector_from = __r->sector_from; - r->device = be32_to_cpu(__r->device); r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); + r->device_to = be32_to_cpu(__r->device_to); + r->sector_from = be64_to_cpu(sector_from); } typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); @@ -1031,36 +1063,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) MAJOR(t->device), MINOR(t->device), act, rwbs); } +static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *pdu_buf; + int pdu_len; + int i, end, ret; + + pdu_buf = pdu_start(ent); + pdu_len = te_blk_io_trace(ent)->pdu_len; + + if (!pdu_len) + return 1; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; + + if (!trace_seq_putc(s, '(')) + return 0; + + for (i = 0; i < pdu_len; i++) { + + ret = trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + if (!ret) + return ret; + + /* + * stop when the rest is just zeroes and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) + return trace_seq_puts(s, " ..) "); + } + + return trace_seq_puts(s, ") "); +} + static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = trace_seq_printf(s, "%u ", t_bytes(ent)); + if (!ret) + return 0; + ret = blk_log_dump_pdu(s, ent); + if (!ret) + return 0; + return trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); + } } static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = blk_log_dump_pdu(s, ent); + if (ret) + return trace_seq_printf(s, "[%d]\n", t_error(ent)); + return 0; + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } } static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { - struct blk_io_trace_remap r = { .device = 0, }; + struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) @@ -1117,7 +1211,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } static int blk_tracer_init(struct trace_array *tr) @@ -1130,7 +1223,6 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; - trace_flags |= TRACE_ITER_CONTEXT_INFO; } static void blk_tracer_reset(struct trace_array *tr) @@ -1182,7 +1274,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); + ret = trace_seq_printf(s, "Unknown action %x\n", what); else { ret = log_action(iter, what2act[what].act[long_act]); if (ret) @@ -1195,9 +1287,6 @@ out: static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags) { - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - return print_one_line(iter, false); } @@ -1232,6 +1321,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } +static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} + static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, @@ -1241,6 +1342,7 @@ static struct tracer blk_tracer __read_mostly = { .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, }; static struct trace_event trace_blk_event = { @@ -1285,7 +1387,8 @@ static int blk_trace_remove_queue(struct request_queue *q) /* * Setup everything required to start tracing */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) { struct blk_trace *old_bt, *bt = NULL; int ret = -ENOMEM; @@ -1298,9 +1401,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (!bt->msg_data) goto free_bt; - bt->dev = dev; + bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; + + blk_trace_setup_lba(bt, bdev); old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { @@ -1517,7 +1621,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_enable) { if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; @@ -1525,7 +1629,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = 0; if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) @@ -1548,3 +1652,8 @@ out: return ret ? ret : count; } +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + diff --git a/kernel/trace/events.c b/kernel/trace/events.c deleted file mode 100644 index 246f2aa6dc4..00000000000 --- a/kernel/trace/events.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * This is the place to register all trace points as events. - */ - -#include <linux/stringify.h> - -#include <trace/trace_events.h> - -#include "trace_output.h" - -#include "trace_events_stage_1.h" -#include "trace_events_stage_2.h" -#include "trace_events_stage_3.h" - diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1ed080406c..5b606f45b6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,11 +29,12 @@ #include <linux/list.h> #include <linux/hash.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include <asm/ftrace.h> -#include "trace.h" +#include "trace_output.h" +#include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ do { \ @@ -68,7 +69,7 @@ static DEFINE_MUTEX(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, + .func = ftrace_stub, }; static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; @@ -240,6 +241,576 @@ static void ftrace_update_pid_func(void) #endif } +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; +#endif +}; + +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; +}; + +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +static int ftrace_profile_bits __read_mostly; +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ +static DEFINE_MUTEX(ftrace_profile_lock); + +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static void * +function_stat_next(void *v, int idx) +{ + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; + + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + if (!rec->counter) + goto again; + } + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if (a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} +#endif + +static int function_stat_headers(struct seq_file *m) +{ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " Function " + "Hit Time Avg\n" + " -------- " + "--- ---- ---\n"); +#else + seq_printf(m, " Function Hit\n" + " -------- ---\n"); +#endif + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_profile *rec = v; + char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static DEFINE_MUTEX(mutex); + static struct trace_seq s; + unsigned long long avg; +#endif + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " "); + avg = rec->time; + do_div(avg, rec->counter); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); + trace_print_seq(m, &s); + mutex_unlock(&mutex); +#endif + seq_putc(m, '\n'); + + return 0; +} + +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + + pg = stat->pages = stat->start; + + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; + } + + memset(stat->hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} + +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + int functions; + int pages; + int i; + + /* If we already allocated, do nothing */ + if (stat->pages) + return 0; + + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) + return -ENOMEM; + +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. + */ + functions = 20000; +#endif + + pg = stat->start = stat->pages; + + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 0; i < pages; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!pg->next) + goto out_free; + pg = pg->next; + } + + return 0; + + out_free: + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + free_page((unsigned long)stat->pages); + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; +} + +static int ftrace_profile_init_cpu(int cpu) +{ + struct ftrace_profile_stat *stat; + int size; + + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(stat); + return 0; + } + + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; + + stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!stat->hash) + return -ENOMEM; + + if (!ftrace_profile_bits) { + size--; + + for (; size; size >>= 1) + ftrace_profile_bits++; + } + + /* Preallocate the function profiling pages */ + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; + return -ENOMEM; + } + + return 0; +} + +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_online_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + +/* interrupts must be disabled */ +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &stat->hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); +} + +/* + * The memory is already allocated, this simply finds a new record to use. + */ +static struct ftrace_profile * +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion (from NMIs) */ + if (atomic_inc_return(&stat->disabled) != 1) + goto out; + + /* + * Try to find the function again since an NMI + * could have added it + */ + rec = ftrace_find_profiled_func(stat, ip); + if (rec) + goto out; + + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; + } + + rec = &stat->pages->records[stat->pages->index++]; + rec->ip = ip; + ftrace_add_profile(stat, rec); + + out: + atomic_dec(&stat->disabled); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_profile_stat *stat; + struct ftrace_profile *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); + if (!rec) { + rec = ftrace_profile_alloc(stat, ip); + if (!rec) + goto out; + } + + rec->counter++; + out: + local_irq_restore(flags); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + function_profile_call(trace->func, 0); + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + struct ftrace_profile_stat *stat; + unsigned long long calltime; + struct ftrace_profile *rec; + unsigned long flags; + + local_irq_save(flags); + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + + rec = ftrace_find_profiled_func(stat, trace->func); + if (rec) + rec->time += calltime; + + out: + local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&profile_graph_return, + &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(); +} +#else +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; /* big enough to hold a number */ + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + + ret = register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + unregister_ftrace_profiler(); + } + } + out: + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; /* big enough to hold a number */ + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct ftrace_profile_stat *stat; + struct dentry *entry; + char *name; + int ret; + int cpu; + + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + /* allocate enough for function name + cpu number */ + name = kmalloc(32, GFP_KERNEL); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. + */ + kfree(stat); + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + snprintf(name, 32, "function%d", cpu); + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + /* set when tracing only a pid */ struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; @@ -261,7 +832,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -346,30 +916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -void ftrace_release(void *start, unsigned long size) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = s + size; - - if (ftrace_disabled || !start) - return; - - mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { - /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. - */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); - mutex_unlock(&ftrace_lock); -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -1408,7 +1954,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_probe_call, + .func = function_trace_probe_call, }; static int ftrace_probe_registered; @@ -2128,38 +2674,23 @@ static const struct file_operations ftrace_graph_fops = { static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("available_filter_functions", 0444, - d_tracer, NULL, &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_filter_functions' entry\n"); + trace_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); - entry = debugfs_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - if (!entry) - pr_warning("Could not create debugfs 'failures' entry\n"); + trace_create_file("failures", 0444, + d_tracer, NULL, &ftrace_failures_fops); - entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_filter' entry\n"); + trace_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); - entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + trace_create_file("set_ftrace_notrace", 0644, d_tracer, NULL, &ftrace_notrace_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_notrace' entry\n"); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -2197,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod, return 0; } -void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +#ifdef CONFIG_MODULES +void ftrace_release(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = (unsigned long)end; + + if (ftrace_disabled || !start || start == end) + return; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); + ftrace_free_rec(rec); + } + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; ftrace_convert_nops(mod, start, end); } +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + case MODULE_STATE_GOING: + ftrace_release(mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + } + + return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, + .priority = 0, +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -2236,6 +2825,10 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + ret = register_module_notifier(&ftrace_module_nb); + if (!ret) + pr_warning("Failed to register trace ftrace module notifier\n"); + return; failed: ftrace_disabled = 1; @@ -2417,7 +3010,6 @@ static const struct file_operations ftrace_pid_fops = { static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2425,11 +3017,11 @@ static __init int ftrace_init_debugfs(void) ftrace_init_dyn_debugfs(d_tracer); - entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_pid' entry\n"); + trace_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); + + ftrace_profile_debugfs(d_tracer); + return 0; } fs_initcall(ftrace_init_debugfs); @@ -2538,7 +3130,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_graph_active; +static int ftrace_graph_active; static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -2690,7 +3282,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); /* we currently allow only one tracer registered at a time */ - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { ret = -EBUSY; goto out; } @@ -2698,10 +3290,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); - atomic_inc(&ftrace_graph_active); + ftrace_graph_active++; ret = start_graph_tracing(); if (ret) { - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; goto out; } @@ -2719,10 +3311,10 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_lock); - if (!unlikely(atomic_read(&ftrace_graph_active))) + if (unlikely(!ftrace_graph_active)) goto out; - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; @@ -2736,7 +3328,7 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 5011f4d91e3..86cdf671d7e 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -12,7 +12,7 @@ #include <linux/dcache.h> #include <linux/fs.h> -#include <trace/kmemtrace.h> +#include <linux/kmemtrace.h> #include "trace_output.h" #include "trace.h" @@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, gfp_t gfp_flags, int node) { + struct ftrace_event_call *call = &event_kmem_alloc; struct trace_array *tr = kmemtrace_array; struct kmemtrace_alloc_entry *entry; struct ring_buffer_event *event; @@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, unsigned long call_site, const void *ptr) { + struct ftrace_event_call *call = &event_kmem_free; struct trace_array *tr = kmemtrace_array; struct kmemtrace_free_entry *entry; struct ring_buffer_event *event; @@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 960cbf44c84..3ae5ccf2c0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -22,6 +22,28 @@ #include "trace.h" /* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + + return ret; +} + +/* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is * associated with the CPU it is currently executing on. A reader may read @@ -182,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA 28 +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, @@ -191,48 +216,28 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; + return event->type_len == RINGBUF_TYPE_PADDING + && event->time_delta == 0; } static inline int rb_discarded_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - static unsigned rb_event_data_length(struct ring_buffer_event *event) { unsigned length; - if (event->len) - length = event->len * RB_ALIGNMENT; + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -242,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event) static unsigned rb_event_length(struct ring_buffer_event *event) { - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; - return rb_event_data_length(event); + return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -271,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length = rb_event_length(event); - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -284,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { - BUG_ON(event->type != RINGBUF_TYPE_DATA); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ - if (event->len) + if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; @@ -316,9 +321,10 @@ struct buffer_data_page { }; struct buffer_page { + struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ - struct list_head list; /* list of free pages */ + local_t entries; /* entries on this page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -361,6 +367,28 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\n", + (unsigned int)sizeof(field.time_stamp)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE); + + return ret; +} + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -375,8 +403,11 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long nmi_dropped; + unsigned long commit_overrun; unsigned long overrun; - unsigned long entries; + unsigned long read; + local_t entries; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -947,31 +978,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->head_page); } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. - */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct ring_buffer_event *event; - unsigned long head; - - for (head = 0; head < rb_head_size(cpu_buffer); - head += rb_event_length(event)) { - - event = __rb_page_index(cpu_buffer->head_page, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) - continue; - cpu_buffer->overrun++; - cpu_buffer->entries--; - } -} - static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page **bpage) { @@ -1110,28 +1116,21 @@ static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { - event->type = type; + event->type_len = type; switch (type) { case RINGBUF_TYPE_PADDING: - break; - case RINGBUF_TYPE_TIME_EXTEND: - event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); - break; - case RINGBUF_TYPE_TIME_STAMP: - event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; - case RINGBUF_TYPE_DATA: + case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { - event->len = 0; + if (length > RB_MAX_SMALL_DATA) event->array[0] = length; - } else - event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); + else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); @@ -1155,131 +1154,156 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } + static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, unsigned long tail, + struct buffer_page *commit_page, + struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; - unsigned long tail, write; + struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; - unsigned long flags; bool lock_taken = false; + unsigned long flags; - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); - tail = write - length; + next_page = tail_page; - /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) { - struct buffer_page *next_page = tail_page; + local_irq_save(flags); + /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(in_nmi())) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; + goto out_reset; + } + } else + __raw_spin_lock(&cpu_buffer->lock); - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) - goto out_reset; - } else - __raw_spin_lock(&cpu_buffer->lock); + lock_taken = true; - lock_taken = true; + rb_inc_page(cpu_buffer, &next_page); - rb_inc_page(cpu_buffer, &next_page); + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; + /* we grabbed the lock before incrementing */ + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_reset; - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + cpu_buffer->commit_overrun++; + goto out_reset; + } - /* - * If for some reason, we had an interrupt storm that made - * it all the way around the buffer, bail, and warn - * about it. - */ - if (unlikely(next_page == commit_page)) { - WARN_ON_ONCE(1); + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) goto out_reset; - } - - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - rb_update_overflow(cpu_buffer); + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + cpu_buffer->overrun += + local_read(&head_page->entries); - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; - } + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; } + } - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); + local_set(&next_page->page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + cpu_buffer->tail_page->page->time_stamp = *ts; + } - /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; - } + /* + * The actual tail page has moved forward. + */ + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); + rb_event_set_padding(event); + } - /* - * The actual tail page has moved forward. - */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - rb_event_set_padding(event); - } + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + + out_reset: + /* reset write */ + local_sub(length, &tail_page->write); + if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + local_irq_restore(flags); + return NULL; +} - /* fail and let the caller try again */ - return ERR_PTR(-EAGAIN); - } +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *commit_page; + struct ring_buffer_event *event; + unsigned long tail, write; + + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; + + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) + return rb_move_tail(cpu_buffer, length, tail, + commit_page, tail_page, ts); /* We reserved something on the buffer */ @@ -1289,6 +1313,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + /* * If this is a commit and the tail is zero, then update * this page's time stamp. @@ -1297,16 +1325,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->commit_page->page->time_stamp = *ts; return event; - - out_reset: - /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); - - if (likely(lock_taken)) - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); - return NULL; } static int @@ -1458,6 +1476,36 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } +#define TRACE_RECURSIVE_DEPTH 16 + +static int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); + + printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + current->trace_recursion, + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); + + WARN_ON_ONCE(1); + return -1; +} + +static void trace_recursive_unlock(void) +{ + WARN_ON_ONCE(!current->trace_recursion); + + current->trace_recursion--; +} + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -1491,6 +1539,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (trace_recursive_lock()) + goto out_nocheck; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1505,7 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + event = rb_reserve_next_event(cpu_buffer, 0, length); if (!event) goto out; @@ -1520,6 +1571,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) return event; out: + trace_recursive_unlock(); + + out_nocheck: ftrace_preempt_enable(resched); return NULL; } @@ -1528,7 +1582,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); /* Only process further if we own the commit */ if (!rb_is_commit(cpu_buffer, event)) @@ -1558,6 +1612,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ @@ -1570,6 +1626,119 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + rb_event_discard(event); +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + int cpu; + + /* The event is discarded regardless */ + rb_event_discard(event); + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, !preempt_count()); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + goto out; + } + + /* + * The commit is still visible by the reader, so we + * must increment entries. + */ + local_inc(&cpu_buffer->entries); + out: + /* + * If a write came in and pushed the tail page + * we still need to update the commit pointer + * if we were the commit. + */ + if (rb_is_commit(cpu_buffer, event)) + rb_set_commit_to_write(cpu_buffer); + + trace_recursive_unlock(); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else + preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. @@ -1613,8 +1782,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, - RINGBUF_TYPE_DATA, event_length); + event = rb_reserve_next_event(cpu_buffer, 0, event_length); if (!event) goto out; @@ -1728,7 +1896,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->entries; + ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + - cpu_buffer->read; return ret; } @@ -1755,6 +1924,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->nmi_dropped; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->commit_overrun; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + +/** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * @@ -1770,7 +1980,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += cpu_buffer->entries; + entries += (local_read(&cpu_buffer->entries) - + cpu_buffer->overrun) - cpu_buffer->read; } return entries; @@ -1862,7 +2073,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -1893,7 +2104,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -1966,6 +2177,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ @@ -2008,8 +2220,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) - cpu_buffer->entries--; + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX + || rb_discarded_event(event)) + cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -2089,7 +2302,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_reader_event(cpu_buffer); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); @@ -2161,7 +2374,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_head_event(iter); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); @@ -2220,7 +2433,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2248,7 +2461,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2293,7 +2506,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2386,7 +2599,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2411,6 +2624,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2420,11 +2634,15 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; + cpu_buffer->nmi_dropped = 0; + cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; - cpu_buffer->entries = 0; + cpu_buffer->read = 0; + local_set(&cpu_buffer->entries, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -2443,6 +2661,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&cpu_buffer->record_disabled); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2452,6 +2672,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2578,28 +2800,6 @@ out: } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage, - unsigned int offset) -{ - struct ring_buffer_event *event; - unsigned long head; - - __raw_spin_lock(&cpu_buffer->lock); - for (head = offset; head < local_read(&bpage->commit); - head += rb_event_length(event)) { - - event = __rb_data_page_index(bpage, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) - continue; - cpu_buffer->entries--; - } - __raw_spin_unlock(&cpu_buffer->lock); -} - /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. @@ -2630,6 +2830,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) return bpage; } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page @@ -2642,6 +2843,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) { free_page((unsigned long)data); } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer @@ -2768,16 +2970,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* we copied everything to the beginning */ read = 0; } else { + /* update the entry counter */ + cpu_buffer->read += local_read(&reader->entries); + /* swap the pages */ rb_init_page(bpage); bpage = reader->page; reader->page = *data_page; local_set(&reader->write, 0); + local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; - - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); } ret = read; @@ -2787,6 +2990,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, out: return ret; } +EXPORT_SYMBOL_GPL(ring_buffer_read_page); static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, @@ -2845,14 +3049,11 @@ static const struct file_operations rb_simple_fops = { static __init int rb_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_on' entry\n"); + trace_create_file("tracing_on", 0644, d_tracer, + &ring_buffer_flags, &rb_simple_fops); return 0; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 00000000000..a26fc67b63b --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,413 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/completion.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/time.h> + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 10 +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST() \ + do { \ + if (!kill_test) { \ + kill_test = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + EVENT_DROPPED, +}; + +static enum event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer); + if (!bpage) + return EVENT_DROPPED; + + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + commit = local_read(&rpage->commit); + for (i = 0; i < commit && !kill_test; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + KILL_TEST(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* We don't expect any padding */ + KILL_TEST(); + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + if (!event->array[0]) { + KILL_TEST(); + break; + } + inc = event->array[0]; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + inc = ((event->type_len + 1) * 4); + } + if (kill_test) + break; + + if (inc <= 0) { + KILL_TEST(); + break; + } + } + } + ring_buffer_free_read_page(buffer, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + while (!reader_finish && !kill_test) { + int found; + + do { + int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if (read_events) + stat = read_event(cpu); + else + stat = read_page(cpu); + + if (kill_test) + break; + if (stat == EVENT_FOUND) + found = 1; + } + } while (found && !kill_test); + + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + reader_finish = 0; + complete(&read_done); +} + +/* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call cond_resched + * and also add any time that was lost by a rescedule. + */ +#ifdef CONFIG_PREEMPT +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ +} +#else +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ + struct timeval tv; + + cond_resched(); + do_gettimeofday(&tv); + if (tv.tv_usec < end_tv->tv_usec) { + tv.tv_usec += 1000000; + tv.tv_sec--; + } + start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; + start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; + if (start_tv->tv_usec > 1000000) { + start_tv->tv_usec -= 1000000; + start_tv->tv_sec++; + } +} +#endif + +static void ring_buffer_producer(void) +{ + struct timeval start_tv; + struct timeval end_tv; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + pr_info("Starting ring buffer hammer\n"); + do_gettimeofday(&start_tv); + do { + struct ring_buffer_event *event; + int *entry; + + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } + do_gettimeofday(&end_tv); + + if (consumer && !(++cnt % wakeup_interval)) + wake_up_process(consumer); + + sched_if_needed(&start_tv, &end_tv); + + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + pr_info("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + /* finish var visible before waking up the consumer */ + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = end_tv.tv_sec - start_tv.tv_sec; + time *= 1000000; + time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (kill_test) + pr_info("ERROR!\n"); + pr_info("Time: %lld (usecs)\n", time); + pr_info("Overruns: %lld\n", overruns); + if (disable_reader) + pr_info("Read: (reader disabled)\n"); + else + pr_info("Read: %ld (by %s)\n", read, + read_events ? "events" : "pages"); + pr_info("Entries: %lld\n", entries); + pr_info("Total: %lld\n", entries + overruns + read); + pr_info("Missed: %ld\n", missed); + pr_info("Hit: %ld\n", hit); + + do_div(time, 1000); + if (time) + hit /= (long)time; + else + pr_info("TIME IS ZERO??\n"); + + pr_info("Entries per millisec: %ld\n", hit); + + if (hit) { + avg = 1000000 / hit; + pr_info("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!kthread_should_stop() && !kill_test) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop() || kill_test) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + __set_current_state(TASK_RUNNING); + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + init_completion(&read_start); + + while (!kthread_should_stop() && !kill_test) { + ring_buffer_reset(buffer); + + if (consumer) { + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + + pr_info("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ * SLEEP_TIME); + __set_current_state(TASK_RUNNING); + } + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ + int ret; + + /* make a one meg buffer in overwite mode */ + buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); + if (!buffer) + return -ENOMEM; + + if (!disable_reader) { + consumer = kthread_create(ring_buffer_consumer_thread, + NULL, "rb_consumer"); + ret = PTR_ERR(consumer); + if (IS_ERR(consumer)) + goto out_fail; + } + + producer = kthread_run(ring_buffer_producer_thread, + NULL, "rb_producer"); + ret = PTR_ERR(producer); + + if (IS_ERR(producer)) + goto out_kill; + + return 0; + + out_kill: + if (consumer) + kthread_stop(consumer); + + out_fail: + ring_buffer_free(buffer); + return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ + kthread_stop(producer); + if (consumer) + kthread_stop(consumer); + ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); +MODULE_LICENSE("GPL"); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a884c09006c..dd40d232034 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -171,6 +171,13 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +int filter_current_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + return filter_check_discard(call, rec, global_trace.buffer, event); +} +EXPORT_SYMBOL_GPL(filter_current_check_discard); + cycle_t ftrace_now(int cpu) { u64 ts; @@ -255,7 +262,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -317,6 +325,7 @@ static const char *trace_options[] = { "latency-format", "global-clock", "sleep-time", + "graph-time", NULL }; @@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -static void -trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - - s->buffer[len] = 0; - seq_puts(m, s->buffer); - - trace_seq_init(s); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr) tracing_reset(tr, cpu); } +void tracing_reset_current(int cpu) +{ + tracing_reset(&global_trace, cpu); +} + +void tracing_reset_current_online_cpus(void) +{ + tracing_reset_online_cpus(&global_trace); +} + #define SAVED_CMDLINES 128 #define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -840,7 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc) { @@ -883,30 +891,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr, } struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc) { return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); + +void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +{ + ring_buffer_discard_commit(global_trace.buffer, event); +} +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_function; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -921,7 +939,9 @@ trace_function(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - ring_buffer_unlock_commit(tr->buffer, event); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -930,6 +950,7 @@ static int __trace_graph_entry(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; @@ -942,7 +963,8 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); return 1; } @@ -952,6 +974,7 @@ static void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; @@ -964,7 +987,8 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -982,6 +1006,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, int skip, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -999,7 +1024,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1024,6 +1050,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1045,7 +1072,8 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1089,6 +1117,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_context_switch; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1104,7 +1133,9 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); - trace_buffer_unlock_commit(tr, event, flags, pc); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -1113,6 +1144,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1129,7 +1161,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1230,11 +1263,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; + int disable; int resched; int cpu, len = 0, size, pc; @@ -1249,7 +1284,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; /* Lockdep uses trace_printk for lock tracing */ @@ -1269,13 +1305,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); local_irq_restore(flags); out: + atomic_dec_return(&data->disabled); ftrace_preempt_enable(resched); unpause_graph_tracing(); @@ -1288,12 +1326,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; int cpu, len = 0, size, pc; struct print_entry *entry; unsigned long irq_flags; + int disable; if (tracing_disabled || tracing_selftest_running) return 0; @@ -1303,7 +1343,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; pause_graph_tracing(); @@ -1323,13 +1364,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: + atomic_dec_return(&data->disabled); preempt_enable_notrace(); return len; @@ -2397,6 +2440,56 @@ static const struct file_operations tracing_readme_fops = { }; static ssize_t +tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf_comm; + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!buf_comm) { + kfree(file_buf); + return -ENOMEM; + } + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES; i++) { + int r; + + pid = map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + trace_find_cmdline(pid, buf_comm); + r = sprintf(buf, "%d %s\n", pid, buf_comm); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + kfree(buf_comm); + + return len; +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_read, +}; + +static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3425,7 +3518,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int size, i; + int entries, size, i; size_t ret; if (*ppos & (PAGE_SIZE - 1)) { @@ -3440,7 +3533,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + + for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3457,7 +3552,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 0); + len, info->cpu, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); @@ -3481,6 +3576,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } spd.nr_pages = i; @@ -3508,6 +3605,45 @@ static const struct file_operations tracing_buffers_fops = { .llseek = no_llseek, }; +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long cpu = (unsigned long)filp->private_data; + struct trace_array *tr = &global_trace; + struct trace_seq *s; + unsigned long cnt; + + s = kmalloc(sizeof(*s), GFP_ATOMIC); + if (!s) + return ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); + trace_seq_printf(s, "nmi dropped: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + kfree(s); + + return count; +} + +static const struct file_operations tracing_stats_fops = { + .open = tracing_open_generic, + .read = tracing_stats_read, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3597,7 +3733,7 @@ struct dentry *tracing_dentry_percpu(void) static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *entry, *d_cpu; + struct dentry *d_cpu; /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ char cpu_dir[7]; @@ -3612,21 +3748,18 @@ static void tracing_init_debugfs_percpu(long cpu) } /* per cpu trace_pipe */ - entry = debugfs_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe' entry\n"); + trace_create_file("trace_pipe", 0444, d_cpu, + (void *) cpu, &tracing_pipe_fops); /* per cpu trace */ - entry = debugfs_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + trace_create_file("trace", 0644, d_cpu, + (void *) cpu, &tracing_fops); - entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); + trace_create_file("trace_pipe_raw", 0444, d_cpu, + (void *) cpu, &tracing_buffers_fops); + + trace_create_file("stats", 0444, d_cpu, + (void *) cpu, &tracing_stats_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -3782,6 +3915,22 @@ static const struct file_operations trace_options_core_fops = { .write = trace_options_core_write, }; +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = debugfs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warning("Could not create debugfs '%s' entry\n", name); + + return ret; +} + + static struct dentry *trace_options_init_dentry(void) { struct dentry *d_tracer; @@ -3809,7 +3958,6 @@ create_trace_option_file(struct trace_option_dentry *topt, struct tracer_opt *opt) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) @@ -3818,11 +3966,9 @@ create_trace_option_file(struct trace_option_dentry *topt, topt->flags = flags; topt->opt = opt; - entry = debugfs_create_file(opt->name, 0644, t_options, topt, + topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); - topt->entry = entry; - } static struct trace_option_dentry * @@ -3877,123 +4023,84 @@ static struct dentry * create_trace_option_core_file(const char *option, long index) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) return NULL; - entry = debugfs_create_file(option, 0644, t_options, (void *)index, + return trace_create_file(option, 0644, t_options, (void *)index, &trace_options_core_fops); - - return entry; } static __init void create_trace_options_dir(void) { struct dentry *t_options; - struct dentry *entry; int i; t_options = trace_options_init_dentry(); if (!t_options) return; - for (i = 0; trace_options[i]; i++) { - entry = create_trace_option_core_file(trace_options[i], i); - if (!entry) - pr_warning("Could not create debugfs %s entry\n", - trace_options[i]); - } + for (i = 0; trace_options[i]; i++) + create_trace_option_core_file(trace_options[i], i); } static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; int cpu; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + trace_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); - entry = debugfs_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_options' entry\n"); + trace_create_file("trace_options", 0644, d_tracer, + NULL, &tracing_iter_fops); - create_trace_options_dir(); + trace_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); + + trace_create_file("trace", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); + + trace_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); - entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - - entry = debugfs_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); - - entry = debugfs_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - if (!entry) - pr_warning("Could not create debugfs 'available_tracers' entry\n"); - - entry = debugfs_create_file("current_tracer", 0444, d_tracer, - &global_trace, &set_tracer_fops); - if (!entry) - pr_warning("Could not create debugfs 'current_tracer' entry\n"); - - entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, - &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_max_latency' entry\n"); - - entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_thresh' entry\n"); - entry = debugfs_create_file("README", 0644, d_tracer, - NULL, &tracing_readme_fops); - if (!entry) - pr_warning("Could not create debugfs 'README' entry\n"); - - entry = debugfs_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("current_tracer", 0644, d_tracer, + &global_trace, &set_tracer_fops); + + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, &tracing_max_lat_fops); + + trace_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); + + trace_create_file("README", 0444, d_tracer, + NULL, &tracing_readme_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_pipe' entry\n"); - - entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'buffer_size_kb' entry\n"); - - entry = debugfs_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_marker' entry\n"); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + &global_trace, &tracing_entries_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); + + trace_create_file("saved_cmdlines", 0444, d_tracer, + NULL, &tracing_saved_cmdlines_fops); #ifdef CONFIG_DYNAMIC_FTRACE - entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, - &tracing_dyn_info_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'dyn_ftrace_total_info' entry\n"); + trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + create_trace_options_dir(); + for_each_tracing_cpu(cpu) tracing_init_debugfs_percpu(cpu); @@ -4064,7 +4171,8 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { - static DEFINE_SPINLOCK(ftrace_dump_lock); + static raw_spinlock_t ftrace_dump_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4073,7 +4181,8 @@ static void __ftrace_dump(bool disable_tracing) int cnt = 0, cpu; /* only one dump */ - spin_lock_irqsave(&ftrace_dump_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4145,7 +4254,8 @@ static void __ftrace_dump(bool disable_tracing) } out: - spin_unlock_irqrestore(&ftrace_dump_lock, flags); + __raw_spin_unlock(&ftrace_dump_lock); + local_irq_restore(flags); } /* By default: disable tracing after the dump */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e685ac2b2ba..ba25793ffe6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,9 +9,12 @@ #include <linux/mmiotrace.h> #include <linux/ftrace.h> #include <trace/boot.h> -#include <trace/kmemtrace.h> +#include <linux/kmemtrace.h> #include <trace/power.h> +#include <linux/trace_seq.h> +#include <linux/ftrace_event.h> + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -42,20 +45,6 @@ enum trace_type { }; /* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned char type; - unsigned char flags; - unsigned char preempt_count; - int pid; - int tgid; -}; - -/* * Function trace entry - function address and parent function addres: */ struct ftrace_entry { @@ -263,8 +252,6 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; -struct trace_iterator; - /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void); __ftrace_bad_type(); \ } while (0) -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - - /* * An option specific to a tracer. This is a boolean value. * The bit is the bit index that sets its value on the @@ -423,60 +401,30 @@ struct tracer { struct tracer_stat *stats; }; -struct trace_seq { - unsigned char buffer[PAGE_SIZE]; - unsigned int len; - unsigned int readpos; -}; - -static inline void -trace_seq_init(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - #define TRACE_PIPE_ALL_CPU -1 -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - int cpu; - u64 ts; - - unsigned long iter_flags; - loff_t pos; - long idx; - - cpumask_var_t started; -}; - int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset_current(int cpu); +void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops); + struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc); @@ -484,14 +432,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); - struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -514,7 +454,6 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, @@ -613,6 +552,8 @@ extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); +extern enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ @@ -644,7 +585,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ - #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function(struct trace_iterator *iter) @@ -692,6 +632,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, TRACE_ITER_SLEEP_TIME = 0x100000, + TRACE_ITER_GRAPH_TIME = 0x200000, }; /* @@ -790,103 +731,113 @@ struct ftrace_event_field { char *type; int offset; int size; + int is_signed; }; -struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); - int (*define_fields)(void); - struct list_head fields; +struct event_filter { + int n_preds; struct filter_pred **preds; - -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif + char *filter_string; }; struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - struct filter_pred **preds; + void *filter; }; -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - -#define MAX_FILTER_PRED 8 - struct filter_pred; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, + int val1, int val2); struct filter_pred { filter_pred_fn_t fn; u64 val; - char *str_val; + char str_val[MAX_FILTER_STR_VAL]; int str_len; char *field_name; int offset; int not; - int or; - int compound; - int clear; + int op; + int pop_n; }; -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); -extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, +extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); -extern int filter_parse(char **pbuf, struct filter_pred *pred); -extern int filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred); -extern void filter_free_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); -extern void filter_free_subsystem_preds(struct event_subsystem *system); -extern int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred); - -void event_trace_printk(unsigned long ip, const char *fmt, ...); -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) +extern int apply_event_filter(struct ftrace_event_call *call, + char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s); + +static inline int +filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} + +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + +extern struct mutex event_mutex; +extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ + extern struct ftrace_event_call event_##call; +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) +#include "trace_event_types.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7a30fc4c364..a29ef23ffb4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/ftrace.h> #include <linux/kallsyms.h> +#include <linux/time.h> #include "trace.h" #include "trace_output.h" @@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); call = &field->boot_call; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", (unsigned long)ts, nsec_rem, call->func, call->caller); @@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); init_ret = &field->boot_ret; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " "returned %d after %llu msecs\n", diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8333715e406..7a7a9fd249a 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,6 +30,7 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { + struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; @@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); @@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v) return 0; } -static void *annotated_branch_stat_start(void) +static void *annotated_branch_stat_start(struct tracer_stat *trace) { return __start_annotated_branch_profile; } @@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m) return 0; } -static void *all_branch_stat_start(void) +static void *all_branch_stat_start(struct tracer_stat *trace) { return __start_branch_profile; } diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 22cba997077..5b5895afecf 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -10,22 +10,30 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; + int ret = -EINVAL; - for_each_event(event) { - if (event->id == event_id) - return event->profile_enable(event); + mutex_lock(&event_mutex); + list_for_each_entry(event, &ftrace_events, list) { + if (event->id == event_id) { + ret = event->profile_enable(event); + break; + } } + mutex_unlock(&event_mutex); - return -EINVAL; + return ret; } void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { - if (event->id == event_id) - return event->profile_disable(event); + mutex_lock(&event_mutex); + list_for_each_entry(event, &ftrace_events, list) { + if (event->id == event_id) { + event->profile_disable(event); + break; + } } + mutex_unlock(&event_mutex); } - diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fd78bee71dd..5e32e375134 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); -TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, +TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, arg1, arg1) TRACE_FIELD(unsigned long, arg2, arg2) @@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) + TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, + TRACE_FUNC_SIZE+1, func) + TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, + TRACE_FUNC_SIZE+1, file) TRACE_FIELD(char, correct, correct) ), TP_RAW_FMT("%u:%s:%s (%u)") @@ -139,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_STRUCT( - TRACE_FIELD(ktime_t, state_data.stamp, stamp) - TRACE_FIELD(ktime_t, state_data.end, end) + TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) + TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 576f4fa2af0..87feb0117ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,19 +8,25 @@ * */ +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include <linux/kthread.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/delay.h> #include "trace_output.h" #define TRACE_SYSTEM "TRACE_SYSTEM" -static DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_mutex); + +LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size) + char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type, field->offset = offset; field->size = size; + field->is_signed = is_signed; list_add(&field->link, &call->fields); return 0; @@ -51,20 +58,37 @@ err: return -ENOMEM; } +EXPORT_SYMBOL_GPL(trace_define_field); -static void ftrace_clear_events(void) +#ifdef CONFIG_MODULES + +static void trace_destroy_fields(struct ftrace_event_call *call) { - struct ftrace_event_call *call = (void *)__start_ftrace_events; + struct ftrace_event_field *field, *next; + list_for_each_entry_safe(field, next, &call->fields, link) { + list_del(&field->link); + kfree(field->type); + kfree(field->name); + kfree(field); + } +} + +#endif /* CONFIG_MODULES */ + +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call; - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { call->enabled = 0; call->unregfunc(); } - call++; } + mutex_unlock(&event_mutex); } static void ftrace_event_enable_disable(struct ftrace_event_call *call, @@ -89,7 +113,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; char *event = NULL, *sub = NULL, *match; int ret = -EINVAL; @@ -118,7 +142,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - for_each_event(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; @@ -224,15 +248,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next = call; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; for (;;) { - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. @@ -240,45 +266,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (call->regfunc) break; - call++; - next = call; + list = list->next; } - m->private = ++next; + m->private = list->next; return call; } static void *t_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return t_next(m, NULL, pos); } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; retry: - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + if (!call->enabled) { - call++; + list = list->next; goto retry; } - next = call; - m->private = ++next; + m->private = list->next; return call; } static void *s_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return s_next(m, NULL, pos); } @@ -295,12 +327,12 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + mutex_unlock(&event_mutex); } static int ftrace_event_seq_open(struct inode *inode, struct file *file) { - int ret; const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && @@ -308,13 +340,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) ftrace_clear_events(); seq_ops = inode->i_private; - ret = seq_open(file, seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = __start_ftrace_events; - } - return ret; + return seq_open(file, seq_ops); } static ssize_t @@ -374,8 +400,138 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + struct ftrace_event_call *call; + char buf[2]; + int set = -1; + int all = 0; + int ret; + + if (system[0] == '*') + all = 1; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->name || !call->regfunc) + continue; + + if (!all && strcmp(call->system, system) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + if (call->enabled) { + switch (set) { + case -1: + set = 1; + break; + case 0: + set = 2; + break; + } + } else { + switch (set) { + case -1: + set = 0; + break; + case 1: + set = 2; + break; + } + } + /* + * If we have a mixture, no need to look further. + */ + if (set == 2) + break; + } + mutex_unlock(&event_mutex); + + buf[1] = '\n'; + switch (set) { + case 0: + buf[0] = '0'; + break; + case 1: + buf[0] = '1'; + break; + case 2: + buf[0] = 'X'; + break; + default: + buf[0] = '?'; + } + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + unsigned long val; + char *command; + char buf[64]; + ssize_t ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + switch (val) { + case 0: + case 1: + break; + + default: + return -EINVAL; + } + + command = kstrdup(system, GFP_KERNEL); + if (!command) + return -ENOMEM; + + ret = ftrace_set_clr_event(command, val); + if (ret) + goto out_free; + + ret = cnt; + + out_free: + kfree(command); + + *ppos += cnt; + + return ret; +} + +extern char *__bad_type_size(void); + #undef FIELD #define FIELD(type, name) \ + sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ sizeof(field.name) @@ -391,7 +547,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(unsigned char, type), + FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), @@ -481,7 +637,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, s); + print_event_filter(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -494,38 +650,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_free_preds(call); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_pred(call, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_event_filter(call, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; @@ -549,7 +693,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, s); + print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -562,45 +706,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct event_subsystem *system = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_subsystem_pred(system, pred); - if (err < 0) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); + err = apply_subsystem_event_filter(system, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; return cnt; } +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -658,6 +813,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_system_enable_fops = { + .open = tracing_open_generic, + .read = system_enable_read, + .write = system_enable_write, +}; + +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -684,6 +850,7 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -707,16 +874,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } - system->name = name; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) { + debugfs_remove(system->entry); + kfree(system); + return d_events; + } + list_add(&system->list, &event_subsystems); - system->preds = NULL; + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) { + pr_warning("Could not allocate filter for subsystem " + "'%s'\n", name); + return system->entry; + } + + entry = debugfs_create_file("filter", 0644, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) { + kfree(system->filter); + system->filter = NULL; + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + } + + entry = trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *entry; int ret; @@ -725,7 +922,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->system, "TRACE_SYSTEM") != 0) + if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); if (call->raw_init) { @@ -744,21 +941,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - if (call->regfunc) { - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); - } + if (call->regfunc) + entry = trace_create_file("enable", 0644, call->dir, call, + enable); - if (call->id) { - entry = debugfs_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); - if (!entry) - pr_warning("Could not create debugfs '%s/id' entry\n", - call->name); - } + if (call->id) + entry = trace_create_file("id", 0444, call->dir, call, + id); if (call->define_fields) { ret = call->define_fields(); @@ -767,32 +956,196 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); + entry = trace_create_file("filter", 0644, call->dir, call, + filter); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = debugfs_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/format' entry\n", call->name); + entry = trace_create_file("format", 0444, call->dir, call, + format); + + return 0; +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + +static struct ftrace_module_file_ops * +trace_create_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + /* + * This is a bit of a PITA. To allow for correct reference + * counting, modules must "own" their file_operations. + * To do this, we allocate the file operations that will be + * used in the event directory. + */ + + file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); + if (!file_ops) + return NULL; + + file_ops->mod = mod; + + file_ops->id = ftrace_event_id_fops; + file_ops->id.owner = mod; + + file_ops->enable = ftrace_enable_fops; + file_ops->enable.owner = mod; + + file_ops->filter = ftrace_event_filter_fops; + file_ops->filter.owner = mod; + + file_ops->format = ftrace_event_format_fops; + file_ops->format.owner = mod; + + list_add(&file_ops->list, &ftrace_module_file_list); + + return file_ops; +} + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops = NULL; + struct ftrace_event_call *call, *start, *end; + struct dentry *d_events; + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + if (start == end) + return; + + d_events = event_trace_events_dir(); + if (!d_events) + return; + + for_each_event(call, start, end) { + /* The linker may leave blanks */ + if (!call->name) + continue; + + /* + * This module has events, create file ops for this module + * if not already done. + */ + if (!file_ops) { + file_ops = trace_create_file_ops(mod); + if (!file_ops) + return; + } + call->mod = mod; + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + struct ftrace_event_call *call, *p; + bool found = false; + + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + found = true; + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + if (call->event) + unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + } + } + + /* Now free the file_operations */ + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + break; + } + if (&file_ops->list != &ftrace_module_file_list) { + list_del(&file_ops->list); + kfree(file_ops); + } + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events. + */ + if (found) + tracing_reset_current_online_cpus(); +} + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); + + return 0; +} +#else +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ return 0; } +#endif /* CONFIG_MODULES */ + +struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; + +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; static __init int event_trace_init(void) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; + int ret; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -816,13 +1169,272 @@ static __init int event_trace_init(void) if (!d_events) return 0; - for_each_event(call) { + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + trace_create_file("enable", 0644, d_events, + "*:*", &ftrace_system_enable_fops); + + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) continue; - event_create_dir(call, d_events); + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); } + ret = register_module_notifier(&trace_module_nb); + if (!ret) + pr_warning("Failed to register trace events module notifier\n"); + return 0; } fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) + schedule(); + + return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. + */ +static __init void event_trace_self_tests(void) +{ + struct ftrace_event_call *call; + struct event_subsystem *system; + char *sysname; + int ret; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(call, &ftrace_events, list) { + + /* Only test those that have a regfunc */ + if (!call->regfunc) + continue; + + pr_info("Testing event %s: ", call->name); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. + */ + if (call->enabled) { + pr_warning("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + call->enabled = 1; + call->regfunc(); + + event_test_stuff(); + + call->unregfunc(); + call->enabled = 0; + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(system, &event_subsystems, list) { + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + /* ftrace_set_clr_event can modify the name passed in. */ + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return; + } + ret = ftrace_set_clr_event(sysname, 1); + kfree(sysname); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return; + } + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) + pr_warning("error disabling system %s\n", + system->name); + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + sysname = kmalloc(4, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return; + } + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 1); + if (WARN_ON_ONCE(ret)) { + kfree(sysname); + pr_warning("error enabling all events\n"); + return; + } + + event_test_stuff(); + + /* reset sysname */ + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) { + pr_warning("error disabling all events\n"); + return; + } + + pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip) +{ + struct ring_buffer_event *event; + struct ftrace_entry *entry; + unsigned long flags; + long disabled; + int resched; + int cpu; + int pc; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + local_save_flags(flags); + + event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + trace_nowake_buffer_unlock_commit(event, flags, pc); + + out: + atomic_dec(&per_cpu(test_event_disable, cpu)); + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ + register_ftrace_function(&trace_ops); + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + + event_trace_self_tests(); + + event_trace_self_test_with_function(); + + return 0; +} + +late_initcall(event_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e03cbf1e38f..85ad6a8939a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,119 +22,297 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/mutex.h> #include "trace.h" #include "trace_output.h" -static int filter_pred_64(struct filter_pred *pred, void *event) +static DEFINE_MUTEX(filter_mutex); + +enum filter_op_ids +{ + OP_OR, + OP_AND, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_NONE, + OP_OPEN_PAREN, +}; + +struct filter_op { + int id; + char *string; + int precedence; +}; + +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, +}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; + +struct filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { + char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +static int filter_pred_and(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) +{ + return val1 && val2; +} + +static int filter_pred_or(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) +{ + return val1 || val2; +} + +/* Filter predicate for fixed sized arrays of characters */ +static int filter_pred_string(struct filter_pred *pred, void *event, + int val1, int val2) { - u64 *addr = (u64 *)(event + pred->offset); - u64 val = (u64)pred->val; - int match; + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); - match = (val == *addr) ^ pred->not; + match = (!cmp) ^ pred->not; return match; } -static int filter_pred_32(struct filter_pred *pred, void *event) +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. + */ +static int filter_pred_strloc(struct filter_pred *pred, void *event, + int val1, int val2) { - u32 *addr = (u32 *)(event + pred->offset); - u32 val = (u32)pred->val; - int match; + int str_loc = *(int *)(event + pred->offset); + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); - match = (val == *addr) ^ pred->not; + match = (!cmp) ^ pred->not; return match; } -static int filter_pred_16(struct filter_pred *pred, void *event) +static int filter_pred_none(struct filter_pred *pred, void *event, + int val1, int val2) +{ + return 0; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct ftrace_event_call *call, void *rec) { - u16 *addr = (u16 *)(event + pred->offset); - u16 val = (u16)pred->val; - int match; + struct event_filter *filter = call->filter; + int match, top = 0, val1 = 0, val2 = 0; + int stack[MAX_FILTER_PRED]; + struct filter_pred *pred; + int i; + + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; + if (!pred->pop_n) { + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; + continue; + } + if (pred->pop_n > top) { + WARN_ON_ONCE(1); + return 0; + } + val1 = stack[--top]; + val2 = stack[--top]; + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; + } - match = (val == *addr) ^ pred->not; + return stack[--top]; +} +EXPORT_SYMBOL_GPL(filter_match_preds); - return match; +static void parse_error(struct filter_parse_state *ps, int err, int pos) +{ + ps->lasterr = err; + ps->lasterr_pos = pos; } -static int filter_pred_8(struct filter_pred *pred, void *event) +static void remove_filter_string(struct event_filter *filter) { - u8 *addr = (u8 *)(event + pred->offset); - u8 val = (u8)pred->val; - int match; + kfree(filter->filter_string); + filter->filter_string = NULL; +} - match = (val == *addr) ^ pred->not; +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; - return match; + return 0; } -static int filter_pred_string(struct filter_pred *pred, void *event) +static int append_filter_string(struct event_filter *filter, + char *string) { - char *addr = (char *)(event + pred->offset); - int cmp, match; + int newlen; + char *new_filter_string; - cmp = strncmp(addr, pred->str_val, pred->str_len); + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; - match = (!cmp) ^ pred->not; + strcpy(new_filter_string, filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; - return match; + return 0; } -/* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) { - int i, matched, and_failed = 0; - struct filter_pred *pred; + int pos = ps->lasterr_pos; + char *buf, *pbuf; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) { - pred = call->preds[i]; - if (and_failed && !pred->or) - continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; - } else - break; - } + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return; - if (and_failed) - return 0; + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; - return 1; + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); } -void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - char *field_name; - struct filter_pred *pred; - int i; + struct event_filter *filter = call->filter; - if (!preds) { + mutex_lock(&filter_mutex); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else trace_seq_printf(s, "none\n"); - return; - } + mutex_unlock(&filter_mutex); +} - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (preds[i]) { - pred = preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_val) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } else - break; - } +void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s) +{ + struct event_filter *filter = system->filter; + + mutex_lock(&filter_mutex); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); + mutex_unlock(&filter_mutex); } static struct ftrace_event_field * @@ -150,284 +328,826 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } -void filter_free_pred(struct filter_pred *pred) +static void filter_free_pred(struct filter_pred *pred) { if (!pred) return; kfree(pred->field_name); - kfree(pred->str_val); kfree(pred); } -void filter_free_preds(struct ftrace_event_call *call) +static void filter_clear_pred(struct filter_pred *pred) { - int i; + kfree(pred->field_name); + pred->field_name = NULL; + pred->str_len = 0; +} - if (call->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) - filter_free_pred(call->preds[i]); - kfree(call->preds); - call->preds = NULL; +static int filter_set_pred(struct filter_pred *dest, + struct filter_pred *src, + filter_pred_fn_t fn) +{ + *dest = *src; + if (src->field_name) { + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; } + dest->fn = fn; + + return 0; } -void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_disable_preds(struct ftrace_event_call *call) { - struct ftrace_event_call *call = __start_ftrace_events; + struct event_filter *filter = call->filter; int i; - if (system->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) - filter_free_pred(system->preds[i]); - kfree(system->preds); - system->preds = NULL; - } + call->filter_active = 0; + filter->n_preds = 0; - events_for_each(call) { - if (!call->name || !call->regfunc) - continue; + for (i = 0; i < MAX_FILTER_PRED; i++) + filter->preds[i]->fn = filter_pred_none; +} + +void destroy_preds(struct ftrace_event_call *call) +{ + struct event_filter *filter = call->filter; + int i; - if (!strcmp(call->system, system->name)) - filter_free_preds(call); + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); } + kfree(filter->preds); + kfree(filter); + call->filter = NULL; } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) +int init_preds(struct ftrace_event_call *call) { + struct event_filter *filter; + struct filter_pred *pred; int i; - if (call->preds && !pred->compound) - filter_free_preds(call); + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!call->filter) + return -ENOMEM; - if (!call->preds) { - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - if (!call->preds) - return -ENOMEM; - } + call->filter_active = 0; + filter->n_preds = 0; + + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!filter->preds) + goto oom; for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!call->preds[i]) { - call->preds[i] = pred; - return 0; + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + goto oom; + pred->fn = filter_pred_none; + filter->preds[i] = pred; + } + + return 0; + +oom: + destroy_preds(call); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(init_preds); + +static void filter_free_subsystem_preds(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + struct ftrace_event_call *call; + int i; + + if (filter->n_preds) { + for (i = 0; i < filter->n_preds; i++) + filter_free_pred(filter->preds[i]); + kfree(filter->preds); + filter->preds = NULL; + filter->n_preds = 0; + } + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->define_fields) + continue; + + if (!strcmp(call->system, system->name)) { + filter_disable_preds(call); + remove_filter_string(call->filter); } } + mutex_unlock(&event_mutex); +} + +static int filter_add_pred_fn(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred, + filter_pred_fn_t fn) +{ + struct event_filter *filter = call->filter; + int idx, err; + + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + + idx = filter->n_preds; + filter_clear_pred(filter->preds[idx]); + err = filter_set_pred(filter->preds[idx], pred, fn); + if (err) + return err; - return -ENOSPC; + filter->n_preds++; + call->filter_active = 1; + + return 0; } +enum { + FILTER_STATIC_STRING = 1, + FILTER_DYN_STRING +}; + static int is_string_field(const char *type) { if (strchr(type, '[') && strstr(type, "char")) - return 1; + return FILTER_STATIC_STRING; + + if (!strcmp(type, "__str_loc")) + return FILTER_DYN_STRING; return 0; } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +static int is_legal_op(struct ftrace_event_field *field, int op) +{ + if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + return 0; + + return 1; +} + +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; + break; + case 4: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; + break; + case 2: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; + break; + case 1: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; + break; + } + + return fn; +} + +static int filter_add_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; + filter_pred_fn_t fn; + unsigned long long val; + int string_type; + + pred->fn = filter_pred_none; + + if (pred->op == OP_AND) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_and); + } else if (pred->op == OP_OR) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_or); + } field = find_event_field(call, pred->field_name); - if (!field) + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return -EINVAL; + } pred->offset = field->offset; - if (is_string_field(field->type)) { - if (!pred->str_val) - return -EINVAL; - pred->fn = filter_pred_string; + if (!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; + } + + string_type = is_string_field(field->type); + if (string_type) { + if (string_type == FILTER_STATIC_STRING) + fn = filter_pred_string; + else + fn = filter_pred_strloc; pred->str_len = field->size; - return __filter_add_pred(call, pred); + if (pred->op == OP_NE) + pred->not = 1; + return filter_add_pred_fn(ps, call, pred, fn); } else { - if (pred->str_val) + if (strict_strtoull(pred->str_val, 0, &val)) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; + } + pred->val = val; } - switch (field->size) { - case 8: - pred->fn = filter_pred_64; - break; - case 4: - pred->fn = filter_pred_32; - break; - case 2: - pred->fn = filter_pred_16; - break; - case 1: - pred->fn = filter_pred_8; - break; - default: + fn = select_comparison_fn(pred->op, field->size, field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - return __filter_add_pred(call, pred); + if (pred->op == OP_NE) + pred->not = 1; + + return filter_add_pred_fn(ps, call, pred, fn); } -static struct filter_pred *copy_pred(struct filter_pred *pred) +static int filter_add_subsystem_pred(struct filter_parse_state *ps, + struct event_subsystem *system, + struct filter_pred *pred, + char *filter_string) { - struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); - if (!new_pred) - return NULL; + struct event_filter *filter = system->filter; + struct ftrace_event_call *call; + int err = 0; + + if (!filter->preds) { + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); - memcpy(new_pred, pred, sizeof(*pred)); + if (!filter->preds) + return -ENOMEM; + } - if (pred->field_name) { - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!new_pred->field_name) { - kfree(new_pred); - return NULL; - } + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; } - if (pred->str_val) { - new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - if (!new_pred->str_val) { - filter_free_pred(new_pred); - return NULL; + filter->preds[filter->n_preds] = pred; + filter->n_preds++; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name)) + continue; + + err = filter_add_pred(ps, call, pred); + if (err) { + mutex_unlock(&event_mutex); + filter_free_subsystem_preds(system); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + goto out; } + replace_filter_string(call->filter, filter_string); } + mutex_unlock(&event_mutex); +out: + return err; +} + +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) +{ + memset(ps, '\0', sizeof(*ps)); - return new_pred; + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); } -int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred) +static char infix_next(struct filter_parse_state *ps) { - struct ftrace_event_call *call = __start_ftrace_events; - struct filter_pred *event_pred; - int i; + ps->infix.cnt--; - if (system->preds && !pred->compound) - filter_free_subsystem_preds(system); + return ps->infix.string[ps->infix.tail++]; +} - if (!system->preds) { - system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - if (!system->preds) - return -ENOMEM; +static char infix_peek(struct filter_parse_state *ps) +{ + if (ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + return 1; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!system->preds[i]) { - system->preds[i] = pred; - break; + return 0; +} + +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; } } - if (i == MAX_FILTER_PRED) - return -ENOSPC; + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; +} + +static inline void clear_operand_string(struct filter_parse_state *ps) +{ + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} + +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while (!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + kfree(elt->operand); + list_del(&elt->list); + } +} - events_for_each(call) { - int err; +static int filter_parse(struct filter_parse_state *ps) +{ + int in_string = 0; + int op, top_op; + char ch; - if (!call->name || !call->regfunc) + while ((ch = infix_next(ps))) { + if (ch == '"') { + in_string ^= 1; continue; + } - if (strcmp(call->system, system->name)) + if (in_string) + goto parse_operand; + + if (isspace(ch)) continue; - if (!find_event_field(call, pred->field_name)) + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); continue; + } - event_pred = copy_pred(pred); - if (!event_pred) - goto oom; + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } - err = filter_add_pred(call, event_pred); - if (err) - filter_free_pred(event_pred); - if (err == -ENOMEM) - goto oom; + if (ch == ')') { + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + continue; + } +parse_operand: + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; + } + } + + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); } return 0; +} -oom: - system->preds[i] = NULL; - return -ENOMEM; +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred->field_name) { + kfree(pred); + return NULL; + } + + strcpy(pred->str_val, operand2); + pred->str_len = strlen(operand2); + + pred->op = op; + + return pred; } -int filter_parse(char **pbuf, struct filter_pred *pred) +static struct filter_pred *create_logical_pred(int op) { - char *tmp, *tok, *val_str = NULL; - int tok_n = 0; + struct filter_pred *pred; - /* field ==/!= number, or/and field ==/!= number, number */ - while ((tok = strsep(pbuf, " \n"))) { - if (tok_n == 0) { - if (!strcmp(tok, "0")) { - pred->clear = 1; - return 0; - } else if (!strcmp(tok, "&&")) { - pred->or = 0; - pred->compound = 1; - } else if (!strcmp(tok, "||")) { - pred->or = 1; - pred->compound = 1; - } else - pred->field_name = tok; - tok_n = 1; + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->op = op; + + return pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; continue; } - if (tok_n == 1) { - if (!pred->field_name) - pred->field_name = tok; - else if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; + n_normal_preds++; + } + + if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); + return -EINVAL; + } + + return 0; +} + +static int replace_preds(struct event_subsystem *system, + struct ftrace_event_call *call, + struct filter_parse_state *ps, + char *filter_string) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct postfix_elt *elt; + int err; + + err = check_preds(ps); + if (err) + return err; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; else { - pred->field_name = NULL; + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); return -EINVAL; } - tok_n = 2; continue; } - if (tok_n == 2) { - if (pred->compound) { - if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - } else { - val_str = tok; - break; /* done */ - } - tok_n = 3; + + if (elt->op == OP_AND || elt->op == OP_OR) { + pred = create_logical_pred(elt->op); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, + pred, filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; continue; } - if (tok_n == 3) { - val_str = tok; - break; /* done */ + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + return -EINVAL; } + + pred = create_pred(elt->op, operand1, operand2); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, pred, + filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; } - if (!val_str) { - pred->field_name = NULL; - return -EINVAL; + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable_preds(call); + remove_filter_string(call->filter); + mutex_unlock(&filter_mutex); + return 0; } - pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!pred->field_name) + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) return -ENOMEM; - pred->val = simple_strtoull(val_str, &tmp, 0); - if (tmp == val_str) { - pred->str_val = kstrdup(val_str, GFP_KERNEL); - if (!pred->str_val) - return -ENOMEM; - } else if (*tmp != '\0') - return -EINVAL; + filter_disable_preds(call); + replace_filter_string(call->filter, filter_string); - return 0; + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, call->filter); + goto out; + } + + err = replace_preds(NULL, call, ps, filter_string); + if (err) + append_filter_err(ps, call->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; } +int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(system); + remove_filter_string(system->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_free_subsystem_preds(system); + replace_filter_string(system->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + err = replace_preds(system, NULL, ps, filter_string); + if (err) + append_filter_err(ps, system->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h deleted file mode 100644 index 38985f9b379..00000000000 --- a/kernel/trace/trace_events_stage_1.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Stage 1 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * struct ftrace_raw_<call> { - * struct trace_entry ent; - * <type> <item>; - * <type2> <item2>[<len>]; - * [...] - * }; - * - * The <type> <item> is created by the __field(type, item) macro or - * the __array(type2, item2, len) macro. - * We simply do "type item;", and that will create the fields - * in the structure. - */ - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - -#undef __array -#define __array(type, item, len) type item[len]; - -#undef __field -#define __field(type, item) type item; - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name - -#include <trace/trace_event_types.h> diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h deleted file mode 100644 index d363c6672c6..00000000000 --- a/kernel/trace/trace_events_stage_2.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Stage 2 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * enum print_line_t - * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags) - * { - * struct trace_seq *s = &iter->seq; - * struct ftrace_raw_<call> *field; <-- defined in stage 1 - * struct trace_entry *entry; - * int ret; - * - * entry = iter->ent; - * - * if (entry->type != event_<call>.id) { - * WARN_ON_ONCE(1); - * return TRACE_TYPE_UNHANDLED; - * } - * - * field = (typeof(field))entry; - * - * ret = trace_seq_printf(s, <TP_printk> "\n"); - * if (!ret) - * return TRACE_TYPE_PARTIAL_LINE; - * - * return TRACE_TYPE_HANDLED; - * } - * - * This is the method used to print the raw event to the trace - * output format. Note, this is not needed if the data is read - * in binary. - */ - -#undef __entry -#define __entry field - -#undef TP_printk -#define TP_printk(fmt, args...) fmt "\n", args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -enum print_line_t \ -ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ -{ \ - struct trace_seq *s = &iter->seq; \ - struct ftrace_raw_##call *field; \ - struct trace_entry *entry; \ - int ret; \ - \ - entry = iter->ent; \ - \ - if (entry->type != event_##call.id) { \ - WARN_ON_ONCE(1); \ - return TRACE_TYPE_UNHANDLED; \ - } \ - \ - field = (typeof(field))entry; \ - \ - ret = trace_seq_printf(s, #call ": " print); \ - if (!ret) \ - return TRACE_TYPE_PARTIAL_LINE; \ - \ - return TRACE_TYPE_HANDLED; \ -} - -#include <trace/trace_event_types.h> - -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include <trace/trace_event_types.h> - -#undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef __array -#define __array(type, item, len) \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -int \ -ftrace_define_fields_##call(void) \ -{ \ - struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ - int ret; \ - \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ - \ - tstruct; \ - \ - return ret; \ -} - -#include <trace/trace_event_types.h> diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h deleted file mode 100644 index 9d2fa78cecc..00000000000 --- a/kernel/trace/trace_events_stage_3.h +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Stage 3 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * static void ftrace_event_<call>(proto) - * { - * event_trace_printk(_RET_IP_, "<call>: " <fmt>); - * } - * - * static int ftrace_reg_event_<call>(void) - * { - * int ret; - * - * ret = register_trace_<call>(ftrace_event_<call>); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to <call>"); - * return ret; - * } - * - * static void ftrace_unreg_event_<call>(void) - * { - * unregister_trace_<call>(ftrace_event_<call>); - * } - * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_<call> = { - * .name = "<call>", - * .regfunc = ftrace_reg_event_<call>, - * .unregfunc = ftrace_unreg_event_<call>, - * } - * - * - * For those macros defined with TRACE_EVENT: - * - * static struct ftrace_event_call event_<call>; - * - * static void ftrace_raw_event_<call>(proto) - * { - * struct ring_buffer_event *event; - * struct ftrace_raw_<call> *entry; <-- defined in stage 1 - * unsigned long irq_flags; - * int pc; - * - * local_save_flags(irq_flags); - * pc = preempt_count(); - * - * event = trace_current_buffer_lock_reserve(event_<call>.id, - * sizeof(struct ftrace_raw_<call>), - * irq_flags, pc); - * if (!event) - * return; - * entry = ring_buffer_event_data(event); - * - * <assign>; <-- Here we assign the entries by the __field and - * __array macros. - * - * trace_current_buffer_unlock_commit(event, irq_flags, pc); - * } - * - * static int ftrace_raw_reg_event_<call>(void) - * { - * int ret; - * - * ret = register_trace_<call>(ftrace_raw_event_<call>); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to <call>"); - * return ret; - * } - * - * static void ftrace_unreg_event_<call>(void) - * { - * unregister_trace_<call>(ftrace_raw_event_<call>); - * } - * - * static struct trace_event ftrace_event_type_<call> = { - * .trace = ftrace_raw_output_<call>, <-- stage 2 - * }; - * - * static int ftrace_raw_init_event_<call>(void) - * { - * int id; - * - * id = register_ftrace_event(&ftrace_event_type_<call>); - * if (!id) - * return -ENODEV; - * event_<call>.id = id; - * return 0; - * } - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_<call> = { - * .name = "<call>", - * .system = "<system>", - * .raw_init = ftrace_raw_init_event_<call>, - * .regfunc = ftrace_reg_event_<call>, - * .unregfunc = ftrace_unreg_event_<call>, - * .show_format = ftrace_format_<call>, - * } - * - */ - -#undef TP_FMT -#define TP_FMT(fmt, args...) fmt "\n", ##args - -#ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ -{ \ - if (atomic_add_negative(-1, &call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} - -#define _TRACE_PROFILE_INIT(call) \ - .profile_count = ATOMIC_INIT(-1), \ - .profile_enable = ftrace_profile_enable_##call, \ - .profile_disable = ftrace_profile_disable_##call, - -#else -#define _TRACE_PROFILE(call, proto, args) -#define _TRACE_PROFILE_INIT(call) -#endif - -#define _TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, #call ": " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call event_##call; \ - \ -static int ftrace_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(NULL); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - return 0; \ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_init_event_##call, \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - -#undef __entry -#define __entry entry - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ - \ -static struct ftrace_event_call event_##call; \ - \ -static void ftrace_raw_event_##call(proto) \ -{ \ - struct ftrace_event_call *call = &event_##call; \ - struct ring_buffer_event *event; \ - struct ftrace_raw_##call *entry; \ - unsigned long irq_flags; \ - int pc; \ - \ - local_save_flags(irq_flags); \ - pc = preempt_count(); \ - \ - event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ - irq_flags, pc); \ - if (!event) \ - return; \ - entry = ring_buffer_event_data(event); \ - \ - assign; \ - \ - if (call->preds && !filter_match_preds(call, entry)) \ - ring_buffer_event_discard(event); \ - \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ -} \ - \ -static int ftrace_raw_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_raw_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_raw_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_raw_event_##call); \ -} \ - \ -static struct trace_event ftrace_event_type_##call = { \ - .trace = ftrace_raw_output_##call, \ -}; \ - \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(&ftrace_event_type_##call); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ - \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ - .regfunc = ftrace_raw_reg_event_##call, \ - .unregfunc = ftrace_raw_unreg_event_##call, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - -#include <trace/trace_event_types.h> - -#undef _TRACE_PROFILE -#undef _TRACE_PROFILE_INIT - diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 07a22c33ebf..d06cf898dc8 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -19,8 +19,12 @@ #undef TRACE_STRUCT #define TRACE_STRUCT(args...) args +extern void __bad_type_size(void); + #undef TRACE_FIELD #define TRACE_FIELD(type, item, assign) \ + if (sizeof(type) != sizeof(field.item)) \ + __bad_type_size(); \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ @@ -30,7 +34,7 @@ #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ @@ -46,6 +50,9 @@ if (!ret) \ return 0; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) #undef TP_RAW_FMT #define TP_RAW_FMT(args...) args @@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct args field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + #include "trace_event_types.h" #undef TRACE_ZERO_CHAR @@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) + #undef TP_CMD #define TP_CMD(cmd...) cmd @@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_ENTRY entry #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ cmd; #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int ftrace_define_fields_##call(void); \ +static int ftrace_raw_init_event_##call(void); \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = proto, \ + .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ + .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ +}; \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ + return 0; \ +} \ + +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ \ -static struct ftrace_event_call __used \ +struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ .show_format = ftrace_format_##call, \ +}; + +#include "trace_event_types.h" + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), 0); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_event_call *event_call = &event_##call; \ + struct args field; \ + int ret; \ + \ + __common_field(unsigned char, type, 0); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ + \ + tstruct; \ + \ + return ret; \ } + +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) + #include "trace_event_types.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e7b3a..10f6ad7d85f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -void +static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; @@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; - barrier(); - current->curr_ret_stack--; - } /* @@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void) ftrace_pop_return_trace(&trace, &ret); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); + barrier(); + current->curr_ret_stack--; if (unlikely(!ret)) { ftrace_graph_stop(); @@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_HANDLED; } -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) + '\0' */ @@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + int ret; + + ret = trace_print_graph_duration(duration, s); + if (ret != TRACE_TYPE_HANDLED) + return ret; ret = trace_seq_printf(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + return TRACE_TYPE_HANDLED; } /* Case of a leaf function on its call entry */ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347..8683d50a753 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -168,6 +168,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) void trace_hw_branch(u64 from, u64 to) { + struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; @@ -194,7 +195,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 8e37fcddd8b..d53b45ed080 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,8 @@ #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> +#include <linux/time.h> + #include <asm/atomic.h> #include "trace.h" @@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret = 1; @@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 64b54a59c55..8bd9a2c1a46 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -19,6 +19,16 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_init(s); +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -84,6 +94,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +EXPORT_SYMBOL_GPL(trace_seq_printf); int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { @@ -472,6 +483,36 @@ struct trace_event *ftrace_find_event(int type) return NULL; } +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? */ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -494,20 +535,40 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); - if (!event) { - ret = next_event_type++; + if (WARN_ON(!event)) goto out; - } - if (!event->type) - event->type = next_event_type++; - else if (event->type > __TRACE_LAST_TYPE) { + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list = NULL; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { printk(KERN_WARNING "Need to add type to trace.h\n"); WARN_ON(1); - } - - if (ftrace_find_event(event->type)) goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } if (event->trace == NULL) event->trace = trace_nop_print; @@ -528,6 +589,7 @@ int register_ftrace_event(struct trace_event *event) return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_event); /** * unregister_ftrace_event - remove a no longer used event @@ -537,10 +599,12 @@ int unregister_ftrace_event(struct trace_event *event) { mutex_lock(&trace_event_mutex); hlist_del(&event->node); + list_del(&event->list); mutex_unlock(&trace_event_mutex); return 0; } +EXPORT_SYMBOL_GPL(unregister_ftrace_event); /* * Standard events diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index e0bde39c2dd..6e220a8e570 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,41 +1,17 @@ #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H +#include <linux/trace_seq.h> #include "trace.h" -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags); - -struct trace_event { - struct hlist_node node; - int type; - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, - size_t len); -extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, @@ -45,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); extern struct trace_event *ftrace_find_event(int type); -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 118439709fb..8a30d9874cd 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type, static void probe_power_end(struct power_trace *it) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it) static void probe_power_mark(struct power_trace *it, unsigned int type, unsigned int level) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index eb81556107f..9bece9687b6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; - entry = debugfs_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, d_tracer, NULL, &ftrace_formats_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'printk_formats' entry\n"); return 0; } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9117cea6f1a..a98106dd979 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -10,7 +10,7 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include "trace.h" @@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref || sched_stopped) + if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - if (!tracer_enabled) + if (!tracer_enabled || sched_stopped) return; pc = preempt_count(); @@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) unsigned long flags; int cpu, pc; - if (!likely(tracer_enabled)) + if (unlikely(!sched_ref)) return; - pc = preempt_count(); tracing_record_cmdline(current); - if (sched_stopped) + if (!tracer_enabled || sched_stopped) return; + pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5bc00e8f153..eacb2722517 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include "trace.h" @@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, pc = preempt_count(); - /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; - /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); @@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; + /* The task we are waiting for is waking up */ + data = wakeup_trace->data[wakeup_cpu]; + trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c750f65f966..1796f00524e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + trace_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); - entry = debugfs_create_file("stack_trace", 0444, d_tracer, - NULL, &stack_trace_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_trace' entry\n"); + trace_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index acdebd771a9..fdde3a4a94c 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -85,7 +85,7 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; - stat = ts->stat_start(); + stat = ts->stat_start(ts); if (!stat) goto exit; diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 202274cf7f3..f3546a2cd82 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -12,7 +12,7 @@ struct tracer_stat { /* The name of your stat file */ const char *name; /* Iteration over statistic entries */ - void *(*stat_start)(void); + void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ int (*stat_cmp)(void *p1, void *p2); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 91fd19c2149..e04b76cc238 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = { void init_tracer_sysprof_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("sysprof_sample_period", 0644, + trace_create_file("sysprof_sample_period", 0644, d_tracer, NULL, &sysprof_sample_fops); - if (entry) - return; - pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); } diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 797201e4a13..984b9175c13 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -152,7 +152,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) return ret; } -static void *workqueue_stat_start(void) +static void *workqueue_stat_start(struct tracer_stat *trace) { int cpu; void *ret = NULL; |