From 36994e58a48fb8f9651c7dc845a6de298aba5bfc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Dec 2008 13:42:23 -0800 Subject: tracing/kmemtrace: normalize the raw tracer event to the unified tracing API Impact: new tracer plugin This patch adapts kmemtrace raw events tracing to the unified tracing API. To enable and use this tracer, just do the following: echo kmemtrace > /debugfs/tracing/current_tracer cat /debugfs/tracing/trace You will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | type_id 1 call_site 18446744071565527833 ptr 18446612134395152256 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164672 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164912 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345165152 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 0 call_site 18446744071566144042 ptr 18446612134346191680 bytes_req 1304 bytes_alloc 1312 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 That was to stay backward compatible with the format output produced in inux/tracepoint.h. This is the default ouput, but note that I tried something else. If you change an option: echo kmem_minimalistic > /debugfs/trace_options and then cat /debugfs/trace, you will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | - C 0xffff88007c088780 file_free_rcu + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc780 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc870 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc960 -1 d_alloc + K 1304 1312 000000d0 0xffff8800791d7340 -1 reiserfs_alloc_inode - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 992 1000 000000d0 0xffff880079045b58 -1 alloc_inode + K 768 1024 000080d0 0xffff88007c096400 -1 alloc_pipe_info + K 240 240 000000d0 0xffff8800790dca50 -1 d_alloc + K 272 320 000080d0 0xffff88007c088780 -1 get_empty_filp + K 272 320 000080d0 0xffff88007c088000 -1 get_empty_filp Yeah I shall confess kmem_minimalistic should be: kmem_alternative. Whatever, I find it more readable but this a personal opinion of course. We can drop it if you want. On the ALLOC/FREE column, + means an allocation and - a free. On the type column, you have K = kmalloc, C = cache, P = page I would like the flags to be GFP_* strings but that would not be easy to not break the column with strings.... About the node...it seems to always be -1. I don't know why but that shouldn't be difficult to find. I moved linux/tracepoint.h to trace/tracepoint.h as well. I think that would be more easy to find the tracer headers if they are all in their common directory. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e2a4ff6fc3a..27fb74b06b3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -264,6 +264,28 @@ config HW_BRANCH_TRACER This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. +config KMEMTRACE + bool "Trace SLAB allocations" + select TRACING + depends on RELAY + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. + + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER -- cgit v1.2.3-70-g09d2 From 3fd4bc015ef879a7d2b955ce97fb125e3a51ba7e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Dec 2008 12:07:27 +0100 Subject: tracing/kmemtrace: export kmemtrace_mark_alloc_node() / kmemtrace_mark_free() Impact: build fix Also fix up Kconfig dependencies and include files. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 3 ++- kernel/trace/kmemtrace.c | 2 ++ mm/slab.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 27fb74b06b3..cc9f91e7daf 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -267,7 +267,8 @@ config HW_BRANCH_TRACER config KMEMTRACE bool "Trace SLAB allocations" select TRACING - depends on RELAY + select MARKERS + select RELAY help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index d69cbe3c2a4..2bfdcd32622 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -296,6 +296,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, trace_wake_up(); } +EXPORT_SYMBOL(kmemtrace_mark_alloc_node); void kmemtrace_mark_free(enum kmemtrace_type_id type_id, unsigned long call_site, @@ -325,6 +326,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, trace_wake_up(); } +EXPORT_SYMBOL(kmemtrace_mark_free); static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", diff --git a/mm/slab.c b/mm/slab.c index bcf08ea8838..7f72bb386a0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 723cbe0775514853c22dc45005af59c360916af1 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 5 Jan 2009 22:09:58 +0200 Subject: kmemtrace: Remove the relay version of kmemtrace Impact: cleanup kmemtrace now uses ftrace. This patch removes the relay version. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 2 - mm/Makefile | 1 - mm/kmemtrace.c | 333 --------------------------------------------------- 3 files changed, 336 deletions(-) delete mode 100644 mm/kmemtrace.c (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cc9f91e7daf..1c0b7504cab 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -267,8 +267,6 @@ config HW_BRANCH_TRACER config KMEMTRACE bool "Trace SLAB allocations" select TRACING - select MARKERS - select RELAY help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected diff --git a/mm/Makefile b/mm/Makefile index c92e8af1320..51c27709cc7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -35,4 +35,3 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c deleted file mode 100644 index 0573b5080cc..00000000000 --- a/mm/kmemtrace.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#include -#include -#include -#include -#include -#include -#include - -#define KMEMTRACE_SUBBUF_SIZE 524288 -#define KMEMTRACE_DEF_N_SUBBUFS 20 - -static struct rchan *kmemtrace_chan; -static u32 kmemtrace_buf_overruns; - -static unsigned int kmemtrace_n_subbufs; - -/* disabled by default */ -static unsigned int kmemtrace_enabled; - -/* - * The sequence number is used for reordering kmemtrace packets - * in userspace, since they are logged as per-CPU data. - * - * atomic_t should always be a 32-bit signed integer. Wraparound is not - * likely to occur, but userspace can deal with it by expecting a certain - * sequence number in the next packet that will be read. - */ -static atomic_t kmemtrace_seq_num; - -#define KMEMTRACE_ABI_VERSION 1 - -static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; - -enum kmemtrace_event_id { - KMEMTRACE_EVENT_ALLOC = 0, - KMEMTRACE_EVENT_FREE, -}; - -struct kmemtrace_event { - u8 event_id; - u8 type_id; - u16 event_size; - s32 seq_num; - u64 call_site; - u64 ptr; -} __attribute__ ((__packed__)); - -struct kmemtrace_stats_alloc { - u64 bytes_req; - u64 bytes_alloc; - u32 gfp_flags; - s32 numa_node; -} __attribute__ ((__packed__)); - -static void kmemtrace_probe_alloc(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - unsigned long flags; - struct kmemtrace_event *ev; - struct kmemtrace_stats_alloc *stats; - void *buf; - - local_irq_save(flags); - - buf = relay_reserve(kmemtrace_chan, - sizeof(struct kmemtrace_event) + - sizeof(struct kmemtrace_stats_alloc)); - if (!buf) - goto failed; - - /* - * Don't convert this to use structure initializers, - * C99 does not guarantee the rvalues evaluation order. - */ - - ev = buf; - ev->event_id = KMEMTRACE_EVENT_ALLOC; - ev->type_id = va_arg(*args, int); - ev->event_size = sizeof(struct kmemtrace_event) + - sizeof(struct kmemtrace_stats_alloc); - ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); - ev->call_site = va_arg(*args, unsigned long); - ev->ptr = va_arg(*args, unsigned long); - - stats = buf + sizeof(struct kmemtrace_event); - stats->bytes_req = va_arg(*args, unsigned long); - stats->bytes_alloc = va_arg(*args, unsigned long); - stats->gfp_flags = va_arg(*args, unsigned long); - stats->numa_node = va_arg(*args, int); - -failed: - local_irq_restore(flags); -} - -static void kmemtrace_probe_free(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - unsigned long flags; - struct kmemtrace_event *ev; - - local_irq_save(flags); - - ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); - if (!ev) - goto failed; - - /* - * Don't convert this to use structure initializers, - * C99 does not guarantee the rvalues evaluation order. - */ - ev->event_id = KMEMTRACE_EVENT_FREE; - ev->type_id = va_arg(*args, int); - ev->event_size = sizeof(struct kmemtrace_event); - ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); - ev->call_site = va_arg(*args, unsigned long); - ev->ptr = va_arg(*args, unsigned long); - -failed: - local_irq_restore(flags); -} - -static struct dentry * -kmemtrace_create_buf_file(const char *filename, struct dentry *parent, - int mode, struct rchan_buf *buf, int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static int kmemtrace_remove_buf_file(struct dentry *dentry) -{ - debugfs_remove(dentry); - - return 0; -} - -static int kmemtrace_subbuf_start(struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - size_t prev_padding) -{ - if (relay_buf_full(buf)) { - /* - * We know it's not SMP-safe, but neither - * debugfs_create_u32() is. - */ - kmemtrace_buf_overruns++; - return 0; - } - - return 1; -} - -static struct rchan_callbacks relay_callbacks = { - .create_buf_file = kmemtrace_create_buf_file, - .remove_buf_file = kmemtrace_remove_buf_file, - .subbuf_start = kmemtrace_subbuf_start, -}; - -static struct dentry *kmemtrace_dir; -static struct dentry *kmemtrace_overruns_dentry; -static struct dentry *kmemtrace_abi_version_dentry; - -static struct dentry *kmemtrace_enabled_dentry; - -static int kmemtrace_start_probes(void) -{ - int err; - - err = marker_probe_register("kmemtrace_alloc", "type_id %d " - "call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu " - "gfp_flags %lu node %d", - kmemtrace_probe_alloc, NULL); - if (err) - return err; - err = marker_probe_register("kmemtrace_free", "type_id %d " - "call_site %lu ptr %lu", - kmemtrace_probe_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - marker_probe_unregister("kmemtrace_alloc", - kmemtrace_probe_alloc, NULL); - marker_probe_unregister("kmemtrace_free", - kmemtrace_probe_free, NULL); -} - -static int kmemtrace_enabled_get(void *data, u64 *val) -{ - *val = *((int *) data); - - return 0; -} - -static int kmemtrace_enabled_set(void *data, u64 val) -{ - u64 old_val = kmemtrace_enabled; - - *((int *) data) = !!val; - - if (old_val == val) - return 0; - if (val) - kmemtrace_start_probes(); - else - kmemtrace_stop_probes(); - - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, - kmemtrace_enabled_get, - kmemtrace_enabled_set, "%llu\n"); - -static void kmemtrace_cleanup(void) -{ - if (kmemtrace_enabled_dentry) - debugfs_remove(kmemtrace_enabled_dentry); - - kmemtrace_stop_probes(); - - if (kmemtrace_abi_version_dentry) - debugfs_remove(kmemtrace_abi_version_dentry); - if (kmemtrace_overruns_dentry) - debugfs_remove(kmemtrace_overruns_dentry); - - relay_close(kmemtrace_chan); - kmemtrace_chan = NULL; - - if (kmemtrace_dir) - debugfs_remove(kmemtrace_dir); -} - -static int __init kmemtrace_setup_late(void) -{ - if (!kmemtrace_chan) - goto failed; - - kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); - if (!kmemtrace_dir) - goto cleanup; - - kmemtrace_abi_version_dentry = - debugfs_create_u32("abi_version", S_IRUSR, - kmemtrace_dir, &kmemtrace_abi_version); - kmemtrace_overruns_dentry = - debugfs_create_u32("total_overruns", S_IRUSR, - kmemtrace_dir, &kmemtrace_buf_overruns); - if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) - goto cleanup; - - kmemtrace_enabled_dentry = - debugfs_create_file("enabled", S_IRUSR | S_IWUSR, - kmemtrace_dir, &kmemtrace_enabled, - &kmemtrace_enabled_fops); - if (!kmemtrace_enabled_dentry) - goto cleanup; - - if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) - goto cleanup; - - printk(KERN_INFO "kmemtrace: fully up.\n"); - - return 0; - -cleanup: - kmemtrace_cleanup(); -failed: - return 1; -} -late_initcall(kmemtrace_setup_late); - -static int __init kmemtrace_set_boot_enabled(char *str) -{ - if (!str) - return -EINVAL; - - if (!strcmp(str, "yes")) - kmemtrace_enabled = 1; - else if (!strcmp(str, "no")) - kmemtrace_enabled = 0; - else - return -EINVAL; - - return 0; -} -early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); - -static int __init kmemtrace_set_subbufs(char *str) -{ - get_option(&str, &kmemtrace_n_subbufs); - return 0; -} -early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); - -void kmemtrace_init(void) -{ - if (!kmemtrace_n_subbufs) - kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; - - kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, - kmemtrace_n_subbufs, &relay_callbacks, - NULL); - if (!kmemtrace_chan) { - printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); - return; - } - - if (!kmemtrace_enabled) { - printk(KERN_INFO "kmemtrace: disabled. Pass " - "kemtrace.enable=yes as kernel parameter for " - "boot-time tracing.\n"); - return; - } - if (kmemtrace_start_probes()) { - printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); - kmemtrace_cleanup(); - return; - } - - printk(KERN_INFO "kmemtrace: enabled.\n"); -} - -- cgit v1.2.3-70-g09d2 From fe6f90e57fd31af8daca534ea01db2e5666c15da Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 3 Jan 2009 21:23:51 +0200 Subject: trace: mmiotrace to the tracer menu in Kconfig Impact: cosmetic change in Kconfig menu layout This patch was originally suggested by Peter Zijlstra, but seems it was forgotten. CONFIG_MMIOTRACE and CONFIG_MMIOTRACE_TEST were selectable directly under the Kernel hacking / debugging menu in the kernel configuration system. They were present only for x86 and x86_64. Other tracers that use the ftrace tracing framework are in their own sub-menu. This patch moves the mmiotrace configuration options there. Since the Kconfig file, where the tracer menu is, is not architecture specific, HAVE_MMIOTRACE_SUPPORT is introduced and provided only by x86/x86_64. CONFIG_MMIOTRACE now depends on it. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 24 ++---------------------- kernel/trace/Kconfig | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'kernel/trace/Kconfig') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 10d6cc3fd05..e1983fa025d 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -174,28 +174,8 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE - bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && PCI - select TRACING - help - Mmiotrace traces Memory Mapped I/O access and is meant for - debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. Tracing is disabled by - default and can be enabled at run-time. - - See Documentation/tracers/mmiotrace.txt. - If you are not helping to develop drivers, say N. - -config MMIOTRACE_TEST - tristate "Test module for mmiotrace" - depends on MMIOTRACE && m - help - This is a dumb module for testing mmiotrace. It is very dangerous - as it will write garbage to IO memory starting at a given address. - However, it should be safe to use on e.g. unused portion of VRAM. - - Say N, unless you absolutely know what you are doing. +config HAVE_MMIOTRACE_SUPPORT + def_bool y # # IO delay types: diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1c0b7504cab..944239296f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -323,4 +323,27 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + select TRACING + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/tracers/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + endmenu -- cgit v1.2.3-70-g09d2 From e1d8aa9f1dd655a3534b22fcfbecb70cdb125766 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 12 Jan 2009 23:15:46 +0100 Subject: tracing: add a new workqueue tracer Impact: new tracer The workqueue tracer provides some statistical informations about each cpu workqueue thread such as the number of the works inserted and executed since their creation. It can help to evaluate the amount of work each of them have to perform. For example it can help a developer to decide whether he should choose a per cpu workqueue instead of a singlethreaded one. It only traces statistical informations for now but it will probably later provide event tracing too. Such a tracer could help too, and be improved, to help rt priority sorted workqueue development. To have a snapshot of the workqueues state at any time, just do cat /debugfs/tracing/trace_stat/workqueues Ie: 1 125 125 reiserfs/1 1 0 0 scsi_tgtd/1 1 0 0 aio/1 1 0 0 ata/1 1 114 114 kblockd/1 1 0 0 kintegrityd/1 1 2147 2147 events/1 0 0 0 kpsmoused 0 105 105 reiserfs/0 0 0 0 scsi_tgtd/0 0 0 0 aio/0 0 0 0 ata_aux 0 0 0 ata/0 0 0 0 cqueue 0 0 0 kacpi_notify 0 0 0 kacpid 0 149 149 kblockd/0 0 0 0 kintegrityd/0 0 1000 1000 khelper 0 2270 2270 events/0 Changes in V2: _ Drop the static array based on NR_CPU and dynamically allocate the stat array with num_possible_cpus() and other cpu mask facilities.... _ Trace workqueue insertion at a bit lower level (insert_work instead of queue_work) to handle even the workqueue barriers. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/trace/workqueue.h | 25 ++++ kernel/trace/Kconfig | 11 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_workqueue.c | 287 +++++++++++++++++++++++++++++++++++++++++ kernel/workqueue.c | 16 ++- 5 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 include/trace/workqueue.h create mode 100644 kernel/trace/trace_workqueue.c (limited to 'kernel/trace/Kconfig') diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h new file mode 100644 index 00000000000..867829df457 --- /dev/null +++ b/include/trace/workqueue.h @@ -0,0 +1,25 @@ +#ifndef __TRACE_WORKQUEUE_H +#define __TRACE_WORKQUEUE_H + +#include +#include +#include + +DECLARE_TRACE(workqueue_insertion, + TPPROTO(struct task_struct *wq_thread, struct work_struct *work), + TPARGS(wq_thread, work)); + +DECLARE_TRACE(workqueue_execution, + TPPROTO(struct task_struct *wq_thread, struct work_struct *work), + TPARGS(wq_thread, work)); + +/* Trace the creation of one workqueue thread on a cpu */ +DECLARE_TRACE(workqueue_creation, + TPPROTO(struct task_struct *wq_thread, int cpu), + TPARGS(wq_thread, cpu)); + +DECLARE_TRACE(workqueue_destruction, + TPPROTO(struct task_struct *wq_thread), + TPARGS(wq_thread)); + +#endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 944239296f1..dde1d46f77e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -284,6 +284,17 @@ config KMEMTRACE If unsure, say N. +config WORKQUEUE_TRACER + bool "Trace workqueues" + select TRACING + help + The workqueue tracer provides some statistical informations + about each cpu workqueue thread such as the number of the + works inserted and executed since their creation. It can help + to evaluate the amount of work each of them have to perform. + For example it can help a developer to decide whether he should + choose a per cpu workqueue instead of a singlethreaded one. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 05c9182061d..f76d48f3527 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -36,5 +36,6 @@ obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o +obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c new file mode 100644 index 00000000000..f8118d39ca9 --- /dev/null +++ b/kernel/trace/trace_workqueue.c @@ -0,0 +1,287 @@ +/* + * Workqueue statistical tracer. + * + * Copyright (C) 2008 Frederic Weisbecker + * + */ + + +#include +#include +#include "trace_stat.h" +#include "trace.h" + + +/* A cpu workqueue thread */ +struct cpu_workqueue_stats { + struct list_head list; +/* Useful to know if we print the cpu headers */ + bool first_entry; + int cpu; + pid_t pid; +/* Can be inserted from interrupt or user context, need to be atomic */ + atomic_t inserted; +/* + * Don't need to be atomic, works are serialized in a single workqueue thread + * on a single CPU. + */ + unsigned int executed; +}; + +/* List of workqueue threads on one cpu */ +struct workqueue_global_stats { + struct list_head list; + spinlock_t lock; +}; + +/* Don't need a global lock because allocated before the workqueues, and + * never freed. + */ +static struct workqueue_global_stats *all_workqueue_stat; + +/* Insertion of a work */ +static void +probe_workqueue_insertion(struct task_struct *wq_thread, + struct work_struct *work) +{ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + atomic_inc(&node->inserted); + goto found; + } + } + pr_debug("trace_workqueue: entry not found\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Execution of a work */ +static void +probe_workqueue_execution(struct task_struct *wq_thread, + struct work_struct *work) +{ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + node->executed++; + goto found; + } + } + pr_debug("trace_workqueue: entry not found\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Creation of a cpu workqueue thread */ +static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) +{ + struct cpu_workqueue_stats *cws; + unsigned long flags; + + WARN_ON(cpu < 0 || cpu >= num_possible_cpus()); + + /* Workqueues are sometimes created in atomic context */ + cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); + if (!cws) { + pr_warning("trace_workqueue: not enough memory\n"); + return; + } + tracing_record_cmdline(wq_thread); + + INIT_LIST_HEAD(&cws->list); + cws->cpu = cpu; + + cws->pid = wq_thread->pid; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (list_empty(&all_workqueue_stat[cpu].list)) + cws->first_entry = true; + list_add_tail(&cws->list, &all_workqueue_stat[cpu].list); + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Destruction of a cpu workqueue thread */ +static void probe_workqueue_destruction(struct task_struct *wq_thread) +{ + /* Workqueue only execute on one cpu */ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + list_del(&node->list); + kfree(node); + goto found; + } + } + + pr_debug("trace_workqueue: don't find workqueue to destroy\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + +} + +static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) +{ + unsigned long flags; + struct cpu_workqueue_stats *ret = NULL; + + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + + if (!list_empty(&all_workqueue_stat[cpu].list)) + ret = list_entry(all_workqueue_stat[cpu].list.next, + struct cpu_workqueue_stats, list); + + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return ret; +} + +static void *workqueue_stat_start(void) +{ + int cpu; + void *ret = NULL; + + for_each_possible_cpu(cpu) { + ret = workqueue_stat_start_cpu(cpu); + if (ret) + return ret; + } + return NULL; +} + +static void *workqueue_stat_next(void *prev, int idx) +{ + struct cpu_workqueue_stats *prev_cws = prev; + int cpu = prev_cws->cpu; + unsigned long flags; + void *ret = NULL; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) { + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + for (++cpu ; cpu < num_possible_cpus(); cpu++) { + ret = workqueue_stat_start_cpu(cpu); + if (ret) + return ret; + } + return NULL; + } + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, + list); +} + +static int workqueue_stat_show(struct seq_file *s, void *p) +{ + struct cpu_workqueue_stats *cws = p; + unsigned long flags; + int cpu = cws->cpu; + + seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, + atomic_read(&cws->inserted), + cws->executed, + trace_find_cmdline(cws->pid)); + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (&cws->list == all_workqueue_stat[cpu].list.next) + seq_printf(s, "\n"); + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return 0; +} + +static int workqueue_stat_headers(struct seq_file *s) +{ + seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); + seq_printf(s, "# | | | |\n\n"); + return 0; +} + +struct tracer_stat workqueue_stats __read_mostly = { + .name = "workqueues", + .stat_start = workqueue_stat_start, + .stat_next = workqueue_stat_next, + .stat_show = workqueue_stat_show, + .stat_headers = workqueue_stat_headers +}; + + +int __init stat_workqueue_init(void) +{ + if (register_stat_tracer(&workqueue_stats)) { + pr_warning("Unable to register workqueue stat tracer\n"); + return 1; + } + + return 0; +} +fs_initcall(stat_workqueue_init); + +/* + * Workqueues are created very early, just after pre-smp initcalls. + * So we must register our tracepoints at this stage. + */ +int __init trace_workqueue_early_init(void) +{ + int ret, cpu; + + ret = register_trace_workqueue_insertion(probe_workqueue_insertion); + if (ret) + goto out; + + ret = register_trace_workqueue_execution(probe_workqueue_execution); + if (ret) + goto no_insertion; + + ret = register_trace_workqueue_creation(probe_workqueue_creation); + if (ret) + goto no_execution; + + ret = register_trace_workqueue_destruction(probe_workqueue_destruction); + if (ret) + goto no_creation; + + all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats) + * num_possible_cpus(), GFP_KERNEL); + + if (!all_workqueue_stat) { + pr_warning("trace_workqueue: not enough memory\n"); + goto no_creation; + } + + for_each_possible_cpu(cpu) { + spin_lock_init(&all_workqueue_stat[cpu].lock); + INIT_LIST_HEAD(&all_workqueue_stat[cpu].list); + } + + return 0; + +no_creation: + unregister_trace_workqueue_creation(probe_workqueue_creation); +no_execution: + unregister_trace_workqueue_execution(probe_workqueue_execution); +no_insertion: + unregister_trace_workqueue_insertion(probe_workqueue_insertion); +out: + pr_warning("trace_workqueue: unable to trace workqueues\n"); + + return 1; +} +early_initcall(trace_workqueue_early_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f445833ae3..1fc2bc20603 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,6 +33,7 @@ #include #include #include +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -125,9 +126,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } +DEFINE_TRACE(workqueue_insertion); + static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head) { + trace_workqueue_insertion(cwq->thread, work); + set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the @@ -259,6 +264,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +DEFINE_TRACE(workqueue_execution); + static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -284,7 +291,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) */ struct lockdep_map lockdep_map = work->lockdep_map; #endif - + trace_workqueue_execution(cwq->thread, work); cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irq(&cwq->lock); @@ -765,6 +772,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) return cwq; } +DEFINE_TRACE(workqueue_creation); + static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -787,6 +796,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); cwq->thread = p; + trace_workqueue_creation(cwq->thread, cpu); + return 0; } @@ -868,6 +879,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); +DEFINE_TRACE(workqueue_destruction); + static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* @@ -891,6 +904,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) * checks list_empty(), and a "normal" queue_work() can't use * a dead CPU. */ + trace_workqueue_destruction(cwq->thread); kthread_stop(cwq->thread); cwq->thread = NULL; } -- cgit v1.2.3-70-g09d2 From 79fb0768fbd371f3b94d909f51f587b3a24ab272 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Feb 2009 21:38:33 -0500 Subject: trace: let boot trace be chosen by command line Now that we have a working ftrace= function, make the boot tracer get activated by it. This way we can turn it on or off without recompiling the kernel, as well as keeping the selftests on. The selftests are disabled whenever a default tracer starts running. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 7 +++---- kernel/trace/trace.c | 5 +---- kernel/trace/trace_boot.c | 11 +++++++---- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index dde1d46f77e..28f2644484d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,9 +164,8 @@ config BOOT_TRACER representation of the delays during initcalls - but the raw /debug/tracing/trace text output is readable too. - ( Note that tracing self tests can't be enabled if this tracer is - selected, because the self-tests are an initcall as well and that - would invalidate the boot trace. ) + You must pass in ftrace=initcall to the kernel command line + to enable this on bootup. config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" @@ -326,7 +325,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER + depends on TRACING && DEBUG_KERNEL select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c720c79bc6..40edef4255c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3167,12 +3167,9 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); register_tracer(&nop_trace); + current_trace = &nop_trace; #ifdef CONFIG_BOOT_TRACER register_tracer(&boot_tracer); - current_trace = &boot_tracer; - current_trace->init(&global_trace); -#else - current_trace = &nop_trace; #endif /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0e94b3d091f..1f07895977a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -28,13 +28,13 @@ void start_boot_trace(void) void enable_boot_trace(void) { - if (pre_initcalls_finished) + if (boot_trace && pre_initcalls_finished) tracing_start_sched_switch_record(); } void disable_boot_trace(void) { - if (pre_initcalls_finished) + if (boot_trace && pre_initcalls_finished) tracing_stop_sched_switch_record(); } @@ -43,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; + if (!tr) + return 0; + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); @@ -132,7 +135,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!pre_initcalls_finished) + if (!tr || !pre_initcalls_finished) return; /* Get its name now since this function could @@ -164,7 +167,7 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!pre_initcalls_finished) + if (!tr || !pre_initcalls_finished) return; sprint_symbol(bt->func, (unsigned long)fn); -- cgit v1.2.3-70-g09d2 From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 18:43:07 -0500 Subject: ring-buffer: add NMI protection for spinlocks Impact: prevent deadlock in NMI The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made to be reentrant with the spinlocks disabling interrupts. The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant and protects against this issue. But if a user of the ring buffer becomes reentrant (which is what the ring buffers do allow), if the NMI also writes to the ring buffer then we risk the chance of a deadlock. This patch moves the ftrace_nmi_enter called by nmi_enter() to the ring buffer code. It replaces the current ftrace_nmi_enter that is used by arch specific code to arch_ftrace_nmi_enter and updates the Kconfig to handle it. When an NMI is called, it will set a per cpu variable in the ring buffer code and will clear it when the NMI exits. If a write to the ring buffer crosses page boundaries inside an NMI, a trylock is used on the spin lock instead. If the spinlock fails to be acquired, then the entry is discarded. This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there. Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 8 ++++---- include/linux/ftrace_irq.h | 10 +++++++++- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 68 insertions(+), 7 deletions(-) (limited to 'kernel/trace/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f7fe8fd4d..a6be725cb04 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4d33224c055..4c683587055 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ @@ -124,7 +124,7 @@ void ftrace_nmi_enter(void) } } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { /* Finish all executions before clearing in_nmi */ smp_wmb(); @@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void) */ static atomic_t in_nmi; -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 366a054d0b0..29de6779a96 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,15 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) +#ifdef CONFIG_FTRACE_NMI_ENTER +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + +#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28f2644484d..25131a5d5e4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_FTRACE_NMI_ENTER + bool + config HAVE_FUNCTION_TRACER bool @@ -37,6 +40,11 @@ config TRACER_MAX_TRACE config RING_BUFFER bool +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + config TRACING bool select DEBUG_FS diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374cee..a60a6a852f4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -18,6 +19,35 @@ #include "trace.h" +/* + * Since the write to the buffer is still not fully lockless, + * we must be careful with NMIs. The locks in the writers + * are taken when a write crosses to a new page. The locks + * protect against races with the readers (this will soon + * be fixed with a lockless solution). + * + * Because we can not protect against NMIs, and we want to + * keep traces reentrant, we need to manage what happens + * when we are in an NMI. + */ +static DEFINE_PER_CPU(int, rb_in_nmi); + +void ftrace_nmi_enter(void) +{ + __get_cpu_var(rb_in_nmi)++; + /* call arch specific handler too */ + arch_ftrace_nmi_enter(); +} + +void ftrace_nmi_exit(void) +{ + arch_ftrace_nmi_exit(); + __get_cpu_var(rb_in_nmi)--; + /* NMIs are not recursive */ + WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); +} + + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + bool lock_taken = false; commit_page = cpu_buffer->commit_page; /* we just need to protect against interrupts */ @@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page = tail_page; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) + goto out_unlock; + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; rb_inc_page(cpu_buffer, &next_page); @@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); - __raw_spin_unlock(&cpu_buffer->lock); + if (likely(lock_taken)) + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; } -- cgit v1.2.3-70-g09d2 From 2db270a80b8f2238e536876cfb3987af02684df8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 20:46:45 +0100 Subject: tracing/blktrace: move the tracing file to kernel/trace Impact: cleanup Move blktrace.c to kernel/trace, also move its config entry. Signed-off-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Acked-by: Jens Axboe Signed-off-by: Ingo Molnar --- block/Kconfig | 24 - block/Makefile | 1 - block/blktrace.c | 1538 ----------------------------------------------- kernel/trace/Kconfig | 23 + kernel/trace/Makefile | 1 + kernel/trace/blktrace.c | 1538 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1562 insertions(+), 1563 deletions(-) delete mode 100644 block/blktrace.c create mode 100644 kernel/trace/blktrace.c (limited to 'kernel/trace/Kconfig') diff --git a/block/Kconfig b/block/Kconfig index 7cdaa1d7225..e7d12782bcf 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -44,30 +44,6 @@ config LBD If unsure, say N. -config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" - depends on SYSFS - select RELAY - select DEBUG_FS - select TRACEPOINTS - select TRACING - select STACKTRACE - help - Say Y here if you want to be able to trace the block layer actions - on a given queue. Tracing allows you to see any traffic happening - on a block device queue. For more information (and the userspace - support tools needed), fetch the blktrace tools from: - - git://git.kernel.dk/blktrace.git - - Tracing also is possible using the ftrace interface, e.g.: - - echo 1 > /sys/block/sda/sda1/trace/enable - echo blk > /sys/kernel/debug/tracing/current_tracer - cat /sys/kernel/debug/tracing/trace_pipe - - If unsure, say N. - config BLK_DEV_BSG bool "Block layer SG support v4 (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/block/Makefile b/block/Makefile index bfe73049f93..e9fa4dd690f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -13,6 +13,5 @@ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blktrace.c b/block/blktrace.c deleted file mode 100644 index ca6d32061e4..00000000000 --- a/block/blktrace.c +++ /dev/null @@ -1,1538 +0,0 @@ -/* - * Copyright (C) 2006 Jens Axboe - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include <../kernel/trace/trace_output.h> - -static unsigned int blktrace_seq __read_mostly = 1; - -static struct trace_array *blk_tr; -static int __read_mostly blk_tracer_enabled; - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_BLK_OPT_CLASSIC 0x1 - -static struct tracer_opt blk_tracer_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, - { } -}; - -static struct tracer_flags blk_tracer_flags = { - .val = 0, - .opts = blk_tracer_opts, -}; - -/* Global reference count of probes */ -static DEFINE_MUTEX(blk_probe_mutex); -static atomic_t blk_probes_ref = ATOMIC_INIT(0); - -static int blk_register_tracepoints(void); -static void blk_unregister_tracepoints(void); - -/* - * Send out a notify message. - */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) -{ - struct blk_io_trace *t; - - if (!bt->rchan) - return; - - t = relay_reserve(bt->rchan, sizeof(*t) + len); - if (t) { - const int cpu = smp_processor_id(); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); - t->device = bt->dev; - t->action = action; - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); - } -} - -/* - * Send out a notify for this process, if we haven't done so since a trace - * started - */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) -{ - tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); -} - -static void trace_note_time(struct blk_trace *bt) -{ - struct timespec now; - unsigned long flags; - u32 words[2]; - - getnstimeofday(&now); - words[0] = now.tv_sec; - words[1] = now.tv_nsec; - - local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); - local_irq_restore(flags); -} - -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) -{ - int n; - va_list args; - unsigned long flags; - char *buf; - - if (blk_tr) { - va_start(args, fmt); - ftrace_vprintk(fmt, args); - va_end(args); - return; - } - - if (!bt->msg_data) - return; - - local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); - va_start(args, fmt); - n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); - va_end(args); - - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(__trace_note_message); - -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, - pid_t pid) -{ - if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) - return 1; - if (sector < bt->start_lba || sector > bt->end_lba) - return 1; - if (bt->pid && pid != bt->pid) - return 1; - - return 0; -} - -/* - * Data direction bit lookup - */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), - BLK_TC_ACT(BLK_TC_WRITE) }; - -/* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) - -/* - * The worker for the various blk_add_trace*() types. Fills out a - * blk_io_trace structure and places it in a per-cpu subbuffer. - */ -static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) -{ - struct task_struct *tsk = current; - struct ring_buffer_event *event = NULL; - struct blk_io_trace *t; - unsigned long flags = 0; - unsigned long *sequence; - pid_t pid; - int cpu, pc = 0; - - if (unlikely(bt->trace_state != Blktrace_running || - !blk_tracer_enabled)) - return; - - what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNC); - what |= MASK_TC_BIT(rw, AHEAD); - what |= MASK_TC_BIT(rw, META); - what |= MASK_TC_BIT(rw, DISCARD); - - pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) - return; - cpu = raw_smp_processor_id(); - - if (blk_tr) { - tracing_record_cmdline(current); - - pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, - sizeof(*t) + pdu_len, - 0, pc); - if (!event) - return; - t = ring_buffer_event_data(event); - goto record_it; - } - - /* - * A word about the locking here - we disable interrupts to reserve - * some space in the relay per-cpu buffer, to prevent an irq - * from coming in and stepping on our toes. - */ - local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len; - - if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - - if (blk_tr) { - trace_buffer_unlock_commit(blk_tr, event, 0, pc); - return; - } - } - - local_irq_restore(flags); -} - -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - -static void blk_trace_cleanup(struct blk_trace *bt) -{ - debugfs_remove(bt->msg_file); - debugfs_remove(bt->dropped_file); - relay_close(bt->rchan); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - kfree(bt); - mutex_lock(&blk_probe_mutex); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); -} - -int blk_trace_remove(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (!bt) - return -EINVAL; - - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) - blk_trace_cleanup(bt); - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_remove); - -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct blk_trace *bt = filp->private_data; - char buf[16]; - - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); - - return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); -} - -static const struct file_operations blk_dropped_fops = { - .owner = THIS_MODULE, - .open = blk_dropped_open, - .read = blk_dropped_read, -}; - -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *msg; - struct blk_trace *bt; - - if (count > BLK_TN_MAX_MSG) - return -EINVAL; - - msg = kmalloc(count, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } - - bt = filp->private_data; - __trace_note_message(bt, "%s", msg); - kfree(msg); - - return count; -} - -static const struct file_operations blk_msg_fops = { - .owner = THIS_MODULE, - .open = blk_msg_open, - .write = blk_msg_write, -}; - -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - -static int blk_remove_buf_file_callback(struct dentry *dentry) -{ - struct dentry *parent = dentry->d_parent; - debugfs_remove(dentry); - - /* - * this will fail for all but the last file, but that is ok. what we - * care about is the top level buts->name directory going away, when - * the last trace file is gone. Then we don't have to rmdir() that - * manually on trace stop, so it nicely solves the issue with - * force killing of running traces. - */ - - debugfs_remove(parent); - return 0; -} - -static struct dentry *blk_create_buf_file_callback(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, - .create_buf_file = blk_create_buf_file_callback, - .remove_buf_file = blk_remove_buf_file_callback, -}; - -/* - * Setup everything required to start tracing - */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) -{ - struct blk_trace *old_bt, *bt = NULL; - struct dentry *dir = NULL; - int ret, i; - - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - for (i = 0; i < strlen(buts->name); i++) - if (buts->name[i] == '/') - buts->name[i] = '_'; - - ret = -ENOMEM; - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - goto err; - - bt->sequence = alloc_percpu(unsigned long); - if (!bt->sequence) - goto err; - - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); - if (!bt->msg_data) - goto err; - - ret = -ENOENT; - - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) - return -ENOMEM; - } - - dir = debugfs_create_dir(buts->name, blk_tree_root); - - if (!dir) - goto err; - - bt->dir = dir; - bt->dev = dev; - atomic_set(&bt->dropped, 0); - - ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, - &blk_dropped_fops); - if (!bt->dropped_file) - goto err; - - bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - if (!bt->msg_file) - goto err; - - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); - if (!bt->rchan) - goto err; - - bt->act_mask = buts->act_mask; - if (!bt->act_mask) - bt->act_mask = (u16) -1; - - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) - bt->end_lba = -1ULL; - - bt->pid = buts->pid; - bt->trace_state = Blktrace_setup; - - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) { - ret = blk_register_tracepoints(); - if (ret) - goto probe_err; - } - mutex_unlock(&blk_probe_mutex); - - ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); - goto err; - } - - return 0; -probe_err: - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); -err: - if (bt) { - if (bt->msg_file) - debugfs_remove(bt->msg_file); - if (bt->dropped_file) - debugfs_remove(bt->dropped_file); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - if (bt->rchan) - relay_close(bt->rchan); - kfree(bt); - } - return ret; -} - -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - char __user *arg) -{ - struct blk_user_trace_setup buts; - int ret; - - ret = copy_from_user(&buts, arg, sizeof(buts)); - if (ret) - return -EFAULT; - - ret = do_blk_trace_setup(q, name, dev, &buts); - if (ret) - return ret; - - if (copy_to_user(arg, &buts, sizeof(buts))) - return -EFAULT; - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_setup); - -int blk_trace_startstop(struct request_queue *q, int start) -{ - int ret; - struct blk_trace *bt = q->blk_trace; - - if (bt == NULL) - return -EINVAL; - - /* - * For starting a trace, we can transition from a setup or stopped - * trace. For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(blk_trace_startstop); - -/** - * blk_trace_ioctl: - handle the ioctls associated with tracing - * @bdev: the block device - * @cmd: the ioctl cmd - * @arg: the argument data, if any - * - **/ -int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) -{ - struct request_queue *q; - int ret, start = 0; - char b[BDEVNAME_SIZE]; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - mutex_lock(&bdev->bd_mutex); - - switch (cmd) { - case BLKTRACESETUP: - bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); - break; - case BLKTRACESTART: - start = 1; - case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); - break; - case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); - break; - default: - ret = -ENOTTY; - break; - } - - mutex_unlock(&bdev->bd_mutex); - return ret; -} - -/** - * blk_trace_shutdown: - stop and cleanup trace structures - * @q: the request queue associated with the device - * - **/ -void blk_trace_shutdown(struct request_queue *q) -{ - if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); - } -} - -/* - * blktrace probes - */ - -/** - * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for - * @rq: the source request - * @what: the action - * - * Description: - * Records an action against a request. Will log the bio offset + size. - * - **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; - - if (likely(!bt)) - return; - - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); - - if (blk_pc_request(rq)) { - what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - sizeof(rq->cmd), rq->cmd); - } else { - what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - rw, what, rq->errors, 0, NULL); - } -} - -static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ABORT); -} - -static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_INSERT); -} - -static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); -} - -static void blk_add_trace_rq_requeue(struct request_queue *q, - struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); -} - -static void blk_add_trace_rq_complete(struct request_queue *q, - struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); -} - -/** - * blk_add_trace_bio - Add a trace for a bio oriented action - * @q: queue the io is for - * @bio: the source bio - * @what: the action - * - * Description: - * Records an action against a bio. Will log the bio offset + size. - * - **/ -static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - !bio_flagged(bio, BIO_UPTODATE), 0, NULL); -} - -static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); -} - -static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); -} - -static void blk_add_trace_bio_backmerge(struct request_queue *q, - struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); -} - -static void blk_add_trace_bio_frontmerge(struct request_queue *q, - struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); -} - -static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); -} - -static void blk_add_trace_getrq(struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); - } -} - - -static void blk_add_trace_sleeprq(struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, - 0, 0, NULL); - } -} - -static void blk_add_trace_plug(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); -} - -static void blk_add_trace_unplug_io(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_unplug_timer(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_split(struct request_queue *q, struct bio *bio, - unsigned int pdu) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); - } -} - -/** - * blk_add_trace_remap - Add a trace for a remap operation - * @q: queue the io is for - * @bio: the source bio - * @dev: target device - * @from: source sector - * @to: target sector - * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * - **/ -static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) -{ - struct blk_trace *bt = q->blk_trace; - struct blk_io_trace_remap r; - - if (likely(!bt)) - return; - - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); - - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); -} - -/** - * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for - * @rq: io request - * @data: driver-specific data - * @len: length of driver-specific data - * - * Description: - * Some drivers might want to write driver-specific data per request. - * - **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); - else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); -} -EXPORT_SYMBOL_GPL(blk_add_driver_data); - -static int blk_register_tracepoints(void) -{ - int ret; - - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); - WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); - WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); - WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); - WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); - WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); - WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); - WARN_ON(ret); - ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); - WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq); - WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); - WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug); - WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); - WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); - WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split); - WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap); - WARN_ON(ret); - return 0; -} - -static void blk_unregister_tracepoints(void) -{ - unregister_trace_block_remap(blk_add_trace_remap); - unregister_trace_block_split(blk_add_trace_split); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); - unregister_trace_block_plug(blk_add_trace_plug); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq); - unregister_trace_block_getrq(blk_add_trace_getrq); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete); - unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort); - - tracepoint_synchronize_unregister(); -} - -/* - * struct blk_io_tracer formatting routines - */ - -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) -{ - int i = 0; - - if (t->action & BLK_TC_DISCARD) - rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) - rwbs[i++] = 'W'; - else if (t->bytes) - rwbs[i++] = 'R'; - else - rwbs[i++] = 'N'; - - if (t->action & BLK_TC_AHEAD) - rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) - rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) - rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) - rwbs[i++] = 'M'; - - rwbs[i] = '\0'; -} - -static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) -{ - return (const struct blk_io_trace *)ent; -} - -static inline const void *pdu_start(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent) + 1; -} - -static inline u32 t_sec(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->bytes >> 9; -} - -static inline unsigned long long t_sector(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->sector; -} - -static inline __u16 t_error(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->sector; -} - -static __u64 get_pdu_int(const struct trace_entry *ent) -{ - const __u64 *val = pdu_start(ent); - return be64_to_cpu(*val); -} - -static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) -{ - const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; - - r->device = be32_to_cpu(__r->device); - r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); -} - -static int blk_log_action_iter(struct trace_iterator *iter, const char *act) -{ - char rwbs[6]; - unsigned long long ts = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(ts, USEC_PER_SEC); - unsigned secs = (unsigned long)ts; - const struct trace_entry *ent = iter->ent; - const struct blk_io_trace *t = (const struct blk_io_trace *)ent; - - fill_rwbs(rwbs, t); - - return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", - MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, usec_rem, ent->pid, act, rwbs); -} - -static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, - const char *act) -{ - char rwbs[6]; - fill_rwbs(rwbs, t); - return trace_seq_printf(s, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); -} - -static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) -{ - const char *cmd = trace_find_cmdline(ent->pid); - - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); -} - -static int blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) -{ - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); -} - -static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) -{ - struct blk_io_trace_remap r = { .device = 0, }; - - get_pdu_remap(ent, &r); - return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); -} - -static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); -} - -static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), - get_pdu_int(ent)); -} - -static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), trace_find_cmdline(ent->pid)); -} - -/* - * struct tracer operations - */ - -static void blk_tracer_print_header(struct seq_file *m) -{ - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) - return; - seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" - "# | | | | | |\n"); -} - -static void blk_tracer_start(struct trace_array *tr) -{ - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) - if (blk_register_tracepoints()) - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; -} - -static int blk_tracer_init(struct trace_array *tr) -{ - blk_tr = tr; - blk_tracer_start(tr); - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled++; - mutex_unlock(&blk_probe_mutex); - return 0; -} - -static void blk_tracer_stop(struct trace_array *tr) -{ - trace_flags |= TRACE_ITER_CONTEXT_INFO; - mutex_lock(&blk_probe_mutex); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); -} - -static void blk_tracer_reset(struct trace_array *tr) -{ - if (!atomic_read(&blk_probes_ref)) - return; - - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled--; - WARN_ON(blk_tracer_enabled < 0); - mutex_unlock(&blk_probe_mutex); - - blk_tracer_stop(tr); -} - -static struct { - const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); -} what2act[] __read_mostly = { - [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, - [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, - [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, - [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, - [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, - [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, - [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, - [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, - [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, - [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, - [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, - [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, - [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, - [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, -}; - -static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) -{ - struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); - int ret; - - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(s, iter->ent); - } - - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); - struct blk_io_trace old = { - .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, - .time = ns2usecs(iter->ts), - }; - - if (!trace_seq_putmem(s, &old, offset)) - return 0; - return trace_seq_putmem(s, &t->sector, - sizeof(old) - offset + t->pdu_len); -} - -static enum print_line_t -blk_trace_event_print_binary(struct trace_iterator *iter, int flags) -{ - return blk_trace_synthesize_old_trace(iter) ? - TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) -{ - const struct blk_io_trace *t; - u16 what; - int ret; - - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) - return TRACE_TYPE_UNHANDLED; - - t = (const struct blk_io_trace *)iter->ent; - what = t->action & ((1 << BLK_TC_SHIFT) - 1); - - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) - ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_iter(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(&iter->seq, iter->ent); - } - - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static struct tracer blk_tracer __read_mostly = { - .name = "blk", - .init = blk_tracer_init, - .reset = blk_tracer_reset, - .start = blk_tracer_start, - .stop = blk_tracer_stop, - .print_header = blk_tracer_print_header, - .print_line = blk_tracer_print_line, - .flags = &blk_tracer_flags, -}; - -static struct trace_event trace_blk_event = { - .type = TRACE_BLK, - .trace = blk_trace_event_print, - .latency_trace = blk_trace_event_print, - .binary = blk_trace_event_print_binary, -}; - -static int __init init_blk_tracer(void) -{ - if (!register_ftrace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); - return 1; - } - - if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); - unregister_ftrace_event(&trace_blk_event); - return 1; - } - - return 0; -} - -device_initcall(init_blk_tracer); - -static int blk_trace_remove_queue(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (bt == NULL) - return -EINVAL; - - kfree(bt); - return 0; -} - -/* - * Setup everything required to start tracing - */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) -{ - struct blk_trace *old_bt, *bt = NULL; - int ret; - - ret = -ENOMEM; - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - goto err; - - bt->dev = dev; - bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; - bt->trace_state = Blktrace_running; - - old_bt = xchg(&q->blk_trace, bt); - if (old_bt != NULL) { - (void)xchg(&q->blk_trace, old_bt); - kfree(bt); - ret = -EBUSY; - } - return 0; -err: - return ret; -} - -/* - * sysfs interface to enable and configure tracing - */ - -static ssize_t sysfs_blk_trace_enable_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev != NULL) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q != NULL) { - mutex_lock(&bdev->bd_mutex); - ret = sprintf(buf, "%u\n", !!q->blk_trace); - mutex_unlock(&bdev->bd_mutex); - } - - bdput(bdev); - } - - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - int value; - ssize_t ret = -ENXIO; - - if (count == 0 || sscanf(buf, "%d", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - else - ret = blk_trace_remove_queue(q); - mutex_unlock(&bdev->bd_mutex); - - if (ret == 0) - ret = count; -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} - -static ssize_t sysfs_blk_trace_attr_show(struct device *dev, - struct device_attribute *attr, - char *buf); -static ssize_t sysfs_blk_trace_attr_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count); -#define BLK_TRACE_DEVICE_ATTR(_name) \ - DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ - sysfs_blk_trace_attr_show, \ - sysfs_blk_trace_attr_store) - -static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, - sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); -static BLK_TRACE_DEVICE_ATTR(act_mask); -static BLK_TRACE_DEVICE_ATTR(pid); -static BLK_TRACE_DEVICE_ATTR(start_lba); -static BLK_TRACE_DEVICE_ATTR(end_lba); - -static struct attribute *blk_trace_attrs[] = { - &dev_attr_enable.attr, - &dev_attr_act_mask.attr, - &dev_attr_pid.attr, - &dev_attr_start_lba.attr, - &dev_attr_end_lba.attr, - NULL -}; - -struct attribute_group blk_trace_attr_group = { - .name = "trace", - .attrs = blk_trace_attrs, -}; - -static int blk_str2act_mask(const char *str) -{ - int mask = 0; - char *copy = kstrdup(str, GFP_KERNEL), *s; - - if (copy == NULL) - return -ENOMEM; - - s = strstrip(copy); - - while (1) { - char *sep = strchr(s, ','); - - if (sep != NULL) - *sep = '\0'; - - if (strcasecmp(s, "barrier") == 0) - mask |= BLK_TC_BARRIER; - else if (strcasecmp(s, "complete") == 0) - mask |= BLK_TC_COMPLETE; - else if (strcasecmp(s, "fs") == 0) - mask |= BLK_TC_FS; - else if (strcasecmp(s, "issue") == 0) - mask |= BLK_TC_ISSUE; - else if (strcasecmp(s, "pc") == 0) - mask |= BLK_TC_PC; - else if (strcasecmp(s, "queue") == 0) - mask |= BLK_TC_QUEUE; - else if (strcasecmp(s, "read") == 0) - mask |= BLK_TC_READ; - else if (strcasecmp(s, "requeue") == 0) - mask |= BLK_TC_REQUEUE; - else if (strcasecmp(s, "sync") == 0) - mask |= BLK_TC_SYNC; - else if (strcasecmp(s, "write") == 0) - mask |= BLK_TC_WRITE; - - if (sep == NULL) - break; - - s = sep + 1; - } - kfree(copy); - - return mask; -} - -static ssize_t sysfs_blk_trace_attr_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q; - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - mutex_lock(&bdev->bd_mutex); - if (q->blk_trace == NULL) - ret = sprintf(buf, "disabled\n"); - else if (attr == &dev_attr_act_mask) - ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); - else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); - else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); - else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); - mutex_unlock(&bdev->bd_mutex); -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_attr_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - u64 value; - ssize_t ret = -ENXIO; - - if (count == 0) - goto out; - - if (attr == &dev_attr_act_mask) { - if (sscanf(buf, "%llx", &value) != 1) { - /* Assume it is a list of trace category names */ - value = blk_str2act_mask(buf); - if (value < 0) - goto out; - } - } else if (sscanf(buf, "%llu", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - ret = 0; - if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - - if (ret == 0) { - if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; - else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; - else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; - else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; - ret = count; - } - mutex_unlock(&bdev->bd_mutex); -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 25131a5d5e4..4fee43c0194 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -302,6 +302,29 @@ config WORKQUEUE_TRACER For example it can help a developer to decide whether he should choose a per cpu workqueue instead of a singlethreaded one. +config BLK_DEV_IO_TRACE + bool "Support for tracing block io actions" + depends on SYSFS + select RELAY + select DEBUG_FS + select TRACEPOINTS + select TRACING + select STACKTRACE + help + Say Y here if you want to be able to trace the block layer actions + on a given queue. Tracing allows you to see any traffic happening + on a block device queue. For more information (and the userspace + support tools needed), fetch the blktrace tools from: + + git://git.kernel.dk/blktrace.git + + Tracing also is possible using the ftrace interface, e.g.: + + echo 1 > /sys/block/sda/sda1/trace/enable + echo blk > /sys/kernel/debug/tracing/current_tracer + cat /sys/kernel/debug/tracing/trace_pipe + + If unsure, say N. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f76d48f3527..627090bc262 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -37,5 +37,6 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c new file mode 100644 index 00000000000..3b91da06482 --- /dev/null +++ b/kernel/trace/blktrace.c @@ -0,0 +1,1538 @@ +/* + * Copyright (C) 2006 Jens Axboe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace_output.h" + +static unsigned int blktrace_seq __read_mostly = 1; + +static struct trace_array *blk_tr; +static int __read_mostly blk_tracer_enabled; + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + +/* Global reference count of probes */ +static DEFINE_MUTEX(blk_probe_mutex); +static atomic_t blk_probes_ref = ATOMIC_INIT(0); + +static int blk_register_tracepoints(void); +static void blk_unregister_tracepoints(void); + +/* + * Send out a notify message. + */ +static void trace_note(struct blk_trace *bt, pid_t pid, int action, + const void *data, size_t len) +{ + struct blk_io_trace *t; + + if (!bt->rchan) + return; + + t = relay_reserve(bt->rchan, sizeof(*t) + len); + if (t) { + const int cpu = smp_processor_id(); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->time = ktime_to_ns(ktime_get()); + t->device = bt->dev; + t->action = action; + t->pid = pid; + t->cpu = cpu; + t->pdu_len = len; + memcpy((void *) t + sizeof(*t), data, len); + } +} + +/* + * Send out a notify for this process, if we haven't done so since a trace + * started + */ +static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +{ + tsk->btrace_seq = blktrace_seq; + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); +} + +static void trace_note_time(struct blk_trace *bt) +{ + struct timespec now; + unsigned long flags; + u32 words[2]; + + getnstimeofday(&now); + words[0] = now.tv_sec; + words[1] = now.tv_nsec; + + local_irq_save(flags); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + local_irq_restore(flags); +} + +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +{ + int n; + va_list args; + unsigned long flags; + char *buf; + + if (blk_tr) { + va_start(args, fmt); + ftrace_vprintk(fmt, args); + va_end(args); + return; + } + + if (!bt->msg_data) + return; + + local_irq_save(flags); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + va_start(args, fmt); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, + pid_t pid) +{ + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) + return 1; + if (sector < bt->start_lba || sector > bt->end_lba) + return 1; + if (bt->pid && pid != bt->pid) + return 1; + + return 0; +} + +/* + * Data direction bit lookup + */ +static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; + +/* The ilog2() calls fall out because they're constant */ +#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) + +/* + * The worker for the various blk_add_trace*() types. Fills out a + * blk_io_trace structure and places it in a per-cpu subbuffer. + */ +static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + int rw, u32 what, int error, int pdu_len, void *pdu_data) +{ + struct task_struct *tsk = current; + struct ring_buffer_event *event = NULL; + struct blk_io_trace *t; + unsigned long flags = 0; + unsigned long *sequence; + pid_t pid; + int cpu, pc = 0; + + if (unlikely(bt->trace_state != Blktrace_running || + !blk_tracer_enabled)) + return; + + what |= ddir_act[rw & WRITE]; + what |= MASK_TC_BIT(rw, BARRIER); + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, AHEAD); + what |= MASK_TC_BIT(rw, META); + what |= MASK_TC_BIT(rw, DISCARD); + + pid = tsk->pid; + if (unlikely(act_log_check(bt, what, sector, pid))) + return; + cpu = raw_smp_processor_id(); + + if (blk_tr) { + tracing_record_cmdline(current); + + pc = preempt_count(); + event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + sizeof(*t) + pdu_len, + 0, pc); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } + + /* + * A word about the locking here - we disable interrupts to reserve + * some space in the relay per-cpu buffer, to prevent an irq + * from coming in and stepping on our toes. + */ + local_irq_save(flags); + + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(bt, tsk); + + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + if (t) { + sequence = per_cpu_ptr(bt->sequence, cpu); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = ++(*sequence); + t->time = ktime_to_ns(ktime_get()); +record_it: + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->device = bt->dev; + t->error = error; + t->pdu_len = pdu_len; + + if (pdu_len) + memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + + if (blk_tr) { + trace_buffer_unlock_commit(blk_tr, event, 0, pc); + return; + } + } + + local_irq_restore(flags); +} + +static struct dentry *blk_tree_root; +static DEFINE_MUTEX(blk_tree_mutex); + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + debugfs_remove(bt->msg_file); + debugfs_remove(bt->dropped_file); + relay_close(bt->rchan); + free_percpu(bt->sequence); + free_percpu(bt->msg_data); + kfree(bt); + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +int blk_trace_remove(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (!bt) + return -EINVAL; + + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) + blk_trace_cleanup(bt); + + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_remove); + +static int blk_dropped_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct blk_trace *bt = filp->private_data; + char buf[16]; + + snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static const struct file_operations blk_dropped_fops = { + .owner = THIS_MODULE, + .open = blk_dropped_open, + .read = blk_dropped_read, +}; + +static int blk_msg_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count > BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); + return -EFAULT; + } + + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = blk_msg_open, + .write = blk_msg_write, +}; + +/* + * Keep track of how many times we encountered a full subbuffer, to aid + * the user space app in telling how many lost events there were. + */ +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct blk_trace *bt; + + if (!relay_buf_full(buf)) + return 1; + + bt = buf->chan->private_data; + atomic_inc(&bt->dropped); + return 0; +} + +static int blk_remove_buf_file_callback(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + debugfs_remove(dentry); + + /* + * this will fail for all but the last file, but that is ok. what we + * care about is the top level buts->name directory going away, when + * the last trace file is gone. Then we don't have to rmdir() that + * manually on trace stop, so it nicely solves the issue with + * force killing of running traces. + */ + + debugfs_remove(parent); + return 0; +} + +static struct dentry *blk_create_buf_file_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static struct rchan_callbacks blk_relay_callbacks = { + .subbuf_start = blk_subbuf_start_callback, + .create_buf_file = blk_create_buf_file_callback, + .remove_buf_file = blk_remove_buf_file_callback, +}; + +/* + * Setup everything required to start tracing + */ +int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct blk_user_trace_setup *buts) +{ + struct blk_trace *old_bt, *bt = NULL; + struct dentry *dir = NULL; + int ret, i; + + if (!buts->buf_size || !buts->buf_nr) + return -EINVAL; + + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); + buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + for (i = 0; i < strlen(buts->name); i++) + if (buts->name[i] == '/') + buts->name[i] = '_'; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->sequence = alloc_percpu(unsigned long); + if (!bt->sequence) + goto err; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + if (!bt->msg_data) + goto err; + + ret = -ENOENT; + + if (!blk_tree_root) { + blk_tree_root = debugfs_create_dir("block", NULL); + if (!blk_tree_root) + return -ENOMEM; + } + + dir = debugfs_create_dir(buts->name, blk_tree_root); + + if (!dir) + goto err; + + bt->dir = dir; + bt->dev = dev; + atomic_set(&bt->dropped, 0); + + ret = -EIO; + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, + &blk_dropped_fops); + if (!bt->dropped_file) + goto err; + + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + + bt->rchan = relay_open("trace", dir, buts->buf_size, + buts->buf_nr, &blk_relay_callbacks, bt); + if (!bt->rchan) + goto err; + + bt->act_mask = buts->act_mask; + if (!bt->act_mask) + bt->act_mask = (u16) -1; + + bt->start_lba = buts->start_lba; + bt->end_lba = buts->end_lba; + if (!bt->end_lba) + bt->end_lba = -1ULL; + + bt->pid = buts->pid; + bt->trace_state = Blktrace_setup; + + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) { + ret = blk_register_tracepoints(); + if (ret) + goto probe_err; + } + mutex_unlock(&blk_probe_mutex); + + ret = -EBUSY; + old_bt = xchg(&q->blk_trace, bt); + if (old_bt) { + (void) xchg(&q->blk_trace, old_bt); + goto err; + } + + return 0; +probe_err: + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); +err: + if (bt) { + if (bt->msg_file) + debugfs_remove(bt->msg_file); + if (bt->dropped_file) + debugfs_remove(bt->dropped_file); + free_percpu(bt->sequence); + free_percpu(bt->msg_data); + if (bt->rchan) + relay_close(bt->rchan); + kfree(bt); + } + return ret; +} + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + int ret; + + ret = copy_from_user(&buts, arg, sizeof(buts)); + if (ret) + return -EFAULT; + + ret = do_blk_trace_setup(q, name, dev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts, sizeof(buts))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_setup); + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + struct blk_trace *bt = q->blk_trace; + + if (bt == NULL) + return -EINVAL; + + /* + * For starting a trace, we can transition from a setup or stopped + * trace. For stopping a trace, the state must be running + */ + ret = -EINVAL; + if (start) { + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) { + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + + trace_note_time(bt); + ret = 0; + } + } else { + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + relay_flush(bt->rchan); + ret = 0; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(blk_trace_startstop); + +/** + * blk_trace_ioctl: - handle the ioctls associated with tracing + * @bdev: the block device + * @cmd: the ioctl cmd + * @arg: the argument data, if any + * + **/ +int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) +{ + struct request_queue *q; + int ret, start = 0; + char b[BDEVNAME_SIZE]; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + mutex_lock(&bdev->bd_mutex); + + switch (cmd) { + case BLKTRACESETUP: + bdevname(bdev, b); + ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + break; + case BLKTRACESTART: + start = 1; + case BLKTRACESTOP: + ret = blk_trace_startstop(q, start); + break; + case BLKTRACETEARDOWN: + ret = blk_trace_remove(q); + break; + default: + ret = -ENOTTY; + break; + } + + mutex_unlock(&bdev->bd_mutex); + return ret; +} + +/** + * blk_trace_shutdown: - stop and cleanup trace structures + * @q: the request queue associated with the device + * + **/ +void blk_trace_shutdown(struct request_queue *q) +{ + if (q->blk_trace) { + blk_trace_startstop(q, 0); + blk_trace_remove(q); + } +} + +/* + * blktrace probes + */ + +/** + * blk_add_trace_rq - Add a trace for a request oriented action + * @q: queue the io is for + * @rq: the source request + * @what: the action + * + * Description: + * Records an action against a request. Will log the bio offset + size. + * + **/ +static void blk_add_trace_rq(struct request_queue *q, struct request *rq, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + int rw = rq->cmd_flags & 0x03; + + if (likely(!bt)) + return; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) { + what |= BLK_TC_ACT(BLK_TC_PC); + __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, + sizeof(rq->cmd), rq->cmd); + } else { + what |= BLK_TC_ACT(BLK_TC_FS); + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + rw, what, rq->errors, 0, NULL); + } +} + +static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ABORT); +} + +static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_INSERT); +} + +static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); +} + +static void blk_add_trace_rq_requeue(struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); +} + +static void blk_add_trace_rq_complete(struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * + * Description: + * Records an action against a bio. Will log the bio offset + size. + * + **/ +static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, + !bio_flagged(bio, BIO_UPTODATE), 0, NULL); +} + +static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); +} + +static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); +} + +static void blk_add_trace_bio_backmerge(struct request_queue *q, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); +} + +static void blk_add_trace_bio_frontmerge(struct request_queue *q, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); +} + +static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); +} + +static void blk_add_trace_getrq(struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + } +} + + +static void blk_add_trace_sleeprq(struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + 0, 0, NULL); + } +} + +static void blk_add_trace_plug(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +} + +static void blk_add_trace_unplug_io(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_unplug_timer(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_split(struct request_queue *q, struct bio *bio, + unsigned int pdu) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + sizeof(rpdu), &rpdu); + } +} + +/** + * blk_add_trace_remap - Add a trace for a remap operation + * @q: queue the io is for + * @bio: the source bio + * @dev: target device + * @from: source sector + * @to: target sector + * + * Description: + * Device mapper or raid target sometimes need to split a bio because + * it spans a stripe (or similar). Add a trace for that action. + * + **/ +static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from, sector_t to) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device = cpu_to_be32(dev); + r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector = cpu_to_be64(to); + + __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +} + +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @q: queue the io is for + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. + * + **/ +void blk_add_driver_data(struct request_queue *q, + struct request *rq, + void *data, size_t len) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (blk_pc_request(rq)) + __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, + rq->errors, len, data); + else + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + 0, BLK_TA_DRV_DATA, rq->errors, len, data); +} +EXPORT_SYMBOL_GPL(blk_add_driver_data); + +static int blk_register_tracepoints(void) +{ + int ret; + + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + WARN_ON(ret); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + WARN_ON(ret); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + WARN_ON(ret); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + WARN_ON(ret); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + WARN_ON(ret); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + WARN_ON(ret); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + WARN_ON(ret); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + WARN_ON(ret); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + WARN_ON(ret); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + WARN_ON(ret); + ret = register_trace_block_getrq(blk_add_trace_getrq); + WARN_ON(ret); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + WARN_ON(ret); + ret = register_trace_block_plug(blk_add_trace_plug); + WARN_ON(ret); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + WARN_ON(ret); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); + WARN_ON(ret); + ret = register_trace_block_split(blk_add_trace_split); + WARN_ON(ret); + ret = register_trace_block_remap(blk_add_trace_remap); + WARN_ON(ret); + return 0; +} + +static void blk_unregister_tracepoints(void) +{ + unregister_trace_block_remap(blk_add_trace_remap); + unregister_trace_block_split(blk_add_trace_split); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); + unregister_trace_block_plug(blk_add_trace_plug); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq); + unregister_trace_block_getrq(blk_add_trace_getrq); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + + tracepoint_synchronize_unregister(); +} + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + + if (t->action & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (t->action & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) + rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent) + 1; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static __u64 get_pdu_int(const struct trace_entry *ent) +{ + const __u64 *val = pdu_start(ent); + return be64_to_cpu(*val); +} + +static void get_pdu_remap(const struct trace_entry *ent, + struct blk_io_trace_remap *r) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent); + __u64 sector = __r->sector; + + r->device = be32_to_cpu(__r->device); + r->device_from = be32_to_cpu(__r->device_from); + r->sector = be64_to_cpu(sector); +} + +static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +{ + char rwbs[6]; + unsigned long long ts = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned secs = (unsigned long)ts; + const struct trace_entry *ent = iter->ent; + const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + + fill_rwbs(rwbs, t); + + return trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, usec_rem, ent->pid, act, rwbs); +} + +static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, + const char *act) +{ + char rwbs[6]; + fill_rwbs(rwbs, t); + return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *cmd = trace_find_cmdline(ent->pid); + + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); +} + +static int blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) +{ + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); +} + +static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +{ + struct blk_io_trace_remap r = { .device = 0, }; + + get_pdu_remap(ent, &r); + return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), + t_sec(ent), MAJOR(r.device), MINOR(r.device), + (unsigned long long)r.sector); +} + +static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); +} + +static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), + get_pdu_int(ent)); +} + +static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), trace_find_cmdline(ent->pid)); +} + +/* + * struct tracer operations + */ + +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} + +static void blk_tracer_start(struct trace_array *tr) +{ + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + if (blk_register_tracepoints()) + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; +} + +static int blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled++; + mutex_unlock(&blk_probe_mutex); + return 0; +} + +static void blk_tracer_stop(struct trace_array *tr) +{ + trace_flags |= TRACE_ITER_CONTEXT_INFO; + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void blk_tracer_reset(struct trace_array *tr) +{ + if (!atomic_read(&blk_probes_ref)) + return; + + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled--; + WARN_ON(blk_tracer_enabled < 0); + mutex_unlock(&blk_probe_mutex); + + blk_tracer_stop(tr); +} + +static struct { + const char *act[2]; + int (*print)(struct trace_seq *s, const struct trace_entry *ent); +} what2act[] __read_mostly = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, +}; + +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) +{ + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + int ret; + + if (!trace_print_context(iter)) + return TRACE_TYPE_PARTIAL_LINE; + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(s, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(s, iter->ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace old = { + .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, + .time = ns2usecs(iter->ts), + }; + + if (!trace_seq_putmem(s, &old, offset)) + return 0; + return trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); +} + +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +{ + return blk_trace_synthesize_old_trace(iter) ? + TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + const struct blk_io_trace *t; + u16 what; + int ret; + + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + t = (const struct blk_io_trace *)iter->ent; + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_iter(iter, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(&iter->seq, iter->ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, +}; + +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .trace = blk_trace_event_print, + .latency_trace = blk_trace_event_print, + .binary = blk_trace_event_print_binary, +}; + +static int __init init_blk_tracer(void) +{ + if (!register_ftrace_event(&trace_blk_event)) { + pr_warning("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warning("Warning: could not register the block tracer\n"); + unregister_ftrace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); + +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (bt == NULL) + return -EINVAL; + + kfree(bt); + return 0; +} + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +{ + struct blk_trace *old_bt, *bt = NULL; + int ret; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->dev = dev; + bt->act_mask = (u16)-1; + bt->end_lba = -1ULL; + bt->trace_state = Blktrace_running; + + old_bt = xchg(&q->blk_trace, bt); + if (old_bt != NULL) { + (void)xchg(&q->blk_trace, old_bt); + kfree(bt); + ret = -EBUSY; + } + return 0; +err: + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_enable_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev != NULL) { + struct request_queue *q = bdev_get_queue(bdev); + + if (q != NULL) { + mutex_lock(&bdev->bd_mutex); + ret = sprintf(buf, "%u\n", !!q->blk_trace); + mutex_unlock(&bdev->bd_mutex); + } + + bdput(bdev); + } + + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + int value; + ssize_t ret = -ENXIO; + + if (count == 0 || sscanf(buf, "%d", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + mutex_unlock(&bdev->bd_mutex); + + if (ret == 0) + ret = count; +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) + +static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, + sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); + +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; + +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; + +static int blk_str2act_mask(const char *str) +{ + int mask = 0; + char *copy = kstrdup(str, GFP_KERNEL), *s; + + if (copy == NULL) + return -ENOMEM; + + s = strstrip(copy); + + while (1) { + char *sep = strchr(s, ','); + + if (sep != NULL) + *sep = '\0'; + + if (strcasecmp(s, "barrier") == 0) + mask |= BLK_TC_BARRIER; + else if (strcasecmp(s, "complete") == 0) + mask |= BLK_TC_COMPLETE; + else if (strcasecmp(s, "fs") == 0) + mask |= BLK_TC_FS; + else if (strcasecmp(s, "issue") == 0) + mask |= BLK_TC_ISSUE; + else if (strcasecmp(s, "pc") == 0) + mask |= BLK_TC_PC; + else if (strcasecmp(s, "queue") == 0) + mask |= BLK_TC_QUEUE; + else if (strcasecmp(s, "read") == 0) + mask |= BLK_TC_READ; + else if (strcasecmp(s, "requeue") == 0) + mask |= BLK_TC_REQUEUE; + else if (strcasecmp(s, "sync") == 0) + mask |= BLK_TC_SYNC; + else if (strcasecmp(s, "write") == 0) + mask |= BLK_TC_WRITE; + + if (sep == NULL) + break; + + s = sep + 1; + } + kfree(copy); + + return mask; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q; + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + mutex_lock(&bdev->bd_mutex); + if (q->blk_trace == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", q->blk_trace->pid); + else if (attr == &dev_attr_start_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + u64 value; + ssize_t ret = -ENXIO; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (sscanf(buf, "%llx", &value) != 1) { + /* Assume it is a list of trace category names */ + value = blk_str2act_mask(buf); + if (value < 0) + goto out; + } + } else if (sscanf(buf, "%llu", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + ret = 0; + if (q->blk_trace == NULL) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + q->blk_trace->act_mask = value; + else if (attr == &dev_attr_pid) + q->blk_trace->pid = value; + else if (attr == &dev_attr_start_lba) + q->blk_trace->start_lba = value; + else if (attr == &dev_attr_end_lba) + q->blk_trace->end_lba = value; + ret = count; + } + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} -- cgit v1.2.3-70-g09d2 From 1dfba05d0f1a9b4245bb242a7c17fe448811a520 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Feb 2009 12:06:54 +0100 Subject: tracing/blktrace: move the tracing file to kernel/trace, fix Impact: build fix The BLK_DEV_IO_TRACE entry used to be in block/Kconfig - which file itself was dependent on CONFIG_BLOCK. But now the entry is in kernel/trace/Kconfig - which is present even on !CONFIG_BLOCK. So add a 'depends on BLOCK' to BLK_DEV_IO_TRACE. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4fee43c0194..3a331289457 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -305,6 +305,7 @@ config WORKQUEUE_TRACER config BLK_DEV_IO_TRACE bool "Support for tracing block io actions" depends on SYSFS + depends on BLOCK select RELAY select DEBUG_FS select TRACEPOINTS -- cgit v1.2.3-70-g09d2 From b22f4858126a6aa852ad745b94f6b25dbdea708e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Feb 2009 15:49:11 +0100 Subject: tracing/sysprof: add missing tracing_{start,stop}_record_cmdline() Add the missing pair tracing_{start,stop}_record_cmdline() to record well the cmdline associated with pid. Changes in v2: - fix a build error, the sched_switch tracer is needed to record the cmdline. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + kernel/trace/trace_sysprof.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3a331289457..6ff928acd45 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -134,6 +134,7 @@ config SYSPROF_TRACER bool "Sysprof Tracer" depends on X86 select TRACING + select CONTEXT_SWITCH_TRACER help This tracer provides the trace needed by the 'Sysprof' userspace tool. diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 84ca9d81e74..9902c15997a 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -238,6 +238,8 @@ static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; + tracing_start_cmdline_record(); + mutex_lock(&sample_timer_lock); start_stack_timers(); tracer_enabled = 1; @@ -247,6 +249,7 @@ static int stack_trace_init(struct trace_array *tr) static void stack_trace_reset(struct trace_array *tr) { + tracing_stop_cmdline_record(); stop_stack_trace(tr); } -- cgit v1.2.3-70-g09d2 From b77e38aa240c3bd9c55c98b9f7c81541e042eae5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 10:21:36 -0500 Subject: tracing: add event trace infrastructure This patch creates the event tracing infrastructure of ftrace. It will create the files: /debug/tracing/available_events /debug/tracing/set_event The available_events will list the trace points that have been registered with the event tracer. set_events will allow the user to enable or disable an event hook. example: # echo sched_wakeup > /debug/tracing/set_event Will enable the sched_wakeup event (if it is registered). # echo "!sched_wakeup" >> /debug/tracing/set_event Will disable the sched_wakeup event (and only that event). # echo > /debug/tracing/set_event Will disable all events (notice the '>') # cat /debug/tracing/available_events > /debug/tracing/set_event Will enable all registered event hooks. Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 11 +- kernel/trace/Kconfig | 9 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_events.c | 280 ++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.h | 52 +++++++ 5 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/trace_events.c create mode 100644 kernel/trace/trace_events.h (limited to 'kernel/trace/Kconfig') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c61fab1dd2f..0add6b28c36 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -61,6 +61,14 @@ #define BRANCH_PROFILE() #endif +#ifdef CONFIG_EVENT_TRACER +#define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \ + *(_ftrace_events) \ + VMLINUX_SYMBOL(__stop_ftrace_events) = .; +#else +#define FTRACE_EVENTS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -81,7 +89,8 @@ *(__tracepoints) \ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ LIKELY_PROFILE() \ - BRANCH_PROFILE() + BRANCH_PROFILE() \ + FTRACE_EVENTS() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 07877f4b523..999c6a2485d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -159,6 +159,15 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. +config EVENT_TRACER + bool "Trace various events in the kernel" + depends on DEBUG_KERNEL + select TRACING + help + This tracer hooks to various trace points in the kernel + allowing the user to pick and choose which trace point they + want to trace. + config BOOT_TRACER bool "Trace boot initcalls" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 627090bc262..c7363568b1c 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -38,5 +38,6 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_EVENT_TRACER) += trace_events.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c new file mode 100644 index 00000000000..05bc80ec8d2 --- /dev/null +++ b/kernel/trace/trace_events.c @@ -0,0 +1,280 @@ +/* + * event tracer + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt + * + */ + +#include +#include +#include +#include + +#include "trace_events.h" + +void event_trace_printk(unsigned long ip, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + tracing_record_cmdline(current); + trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); +} + +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call = (void *)__start_ftrace_events; + + + while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + call++; + } +} + +static int ftrace_set_clr_event(char *buf, int set) +{ + struct ftrace_event_call *call = (void *)__start_ftrace_events; + + + while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + + if (strcmp(buf, call->name) != 0) { + call++; + continue; + } + + if (set) { + /* Already set? */ + if (call->enabled) + return 0; + call->enabled = 1; + call->regfunc(); + } else { + /* Already cleared? */ + if (!call->enabled) + return 0; + call->enabled = 0; + call->unregfunc(); + } + return 0; + } + return -EINVAL; +} + +/* 128 should be much more than enough */ +#define EVENT_BUF_SIZE 127 + +static ssize_t +ftrace_event_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + size_t read = 0; + int i, set = 1; + ssize_t ret; + char *buf; + char ch; + + if (!cnt || cnt < 0) + return 0; + + ret = get_user(ch, ubuf++); + if (ret) + return ret; + read++; + cnt--; + + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + return ret; + read++; + cnt--; + } + + /* Only white space found? */ + if (isspace(ch)) { + file->f_pos += read; + ret = read; + return ret; + } + + buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (cnt > EVENT_BUF_SIZE) + cnt = EVENT_BUF_SIZE; + + i = 0; + while (cnt && !isspace(ch)) { + if (!i && ch == '!') + set = 0; + else + buf[i++] = ch; + + ret = get_user(ch, ubuf++); + if (ret) + goto out_free; + read++; + cnt--; + } + buf[i] = 0; + + file->f_pos += read; + + ret = ftrace_set_clr_event(buf, set); + if (ret) + goto out_free; + + ret = read; + + out_free: + kfree(buf); + + return ret; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_call *next = call; + + (*pos)++; + + if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + return NULL; + + m->private = ++next; + + return call; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + return t_next(m, NULL, pos); +} + +static void * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_call *next; + + (*pos)++; + + retry: + if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + return NULL; + + if (!call->enabled) { + call++; + goto retry; + } + + next = call; + m->private = ++next; + + return call; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + return s_next(m, NULL, pos); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = v; + + seq_printf(m, "%s\n", call->name); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int +ftrace_event_seq_open(struct inode *inode, struct file *file) +{ + int ret; + const struct seq_operations *seq_ops; + + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_clear_events(); + + seq_ops = inode->i_private; + ret = seq_open(file, seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = __start_ftrace_events; + } + return ret; +} + +static const struct seq_operations show_event_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct seq_operations show_set_event_seq_ops = { + .start = s_start, + .next = s_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_set_event_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .write = ftrace_event_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int event_trace_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("available_events", 0444, d_tracer, + (void *)&show_event_seq_ops, + &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_events' entry\n"); + + entry = debugfs_create_file("set_event", 0644, d_tracer, + (void *)&show_set_event_seq_ops, + &ftrace_set_event_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_event' entry\n"); + + return 0; +} +fs_initcall(event_trace_init); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h new file mode 100644 index 00000000000..39342f86db2 --- /dev/null +++ b/kernel/trace/trace_events.h @@ -0,0 +1,52 @@ +#ifndef _LINUX_KERNEL_TRACE_EVENTS_H +#define _LINUX_KERNEL_TRACE_EVENTS_H + +#include +#include "trace.h" + +struct ftrace_event_call { + char *name; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); +}; + + +#undef TPFMT +#define TPFMT(fmt, args...) fmt "\n", ##args + +#undef DEFINE_TRACE_FMT +#define DEFINE_TRACE_FMT(call, proto, args, fmt) \ +static void ftrace_event_##call(proto) \ +{ \ + event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ +} \ + \ +static int ftrace_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_event_##call); \ + if (!ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call); \ + return ret; \ +} \ + \ +static void ftrace_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_event_##call); \ +} \ + \ +static struct ftrace_event_call __used \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .regfunc = ftrace_reg_event_##call, \ + .unregfunc = ftrace_unreg_event_##call, \ +} + +void event_trace_printk(unsigned long ip, const char *fmt, ...); +extern unsigned long __start_ftrace_events[]; +extern unsigned long __stop_ftrace_events[]; + +#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */ -- cgit v1.2.3-70-g09d2 From 40ada30f9621fbd831ac2437b9a2a399aad34b00 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 21:19:55 +0100 Subject: tracing: clean up menu Clean up menu structure, introduce TRACING_SUPPORT switch that signals whether an architecture supports various instrumentation mechanisms. Signed-off-by: Ingo Molnar --- arch/Kconfig | 1 + kernel/trace/Kconfig | 29 ++++++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel/trace/Kconfig') diff --git a/arch/Kconfig b/arch/Kconfig index 550dab22daa..a092dc77c24 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -6,6 +6,7 @@ config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE + depends on TRACING_SUPPORT select TRACING select RING_BUFFER help diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 999c6a2485d..5d733da5345 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -53,12 +53,22 @@ config TRACING select TRACEPOINTS select NOP_TRACER +# +# Minimum requirements an architecture has to meet for us to +# be able to offer generic tracing facilities: +# +config TRACING_SUPPORT + bool + depends on TRACE_IRQFLAGS_SUPPORT + depends on STACKTRACE_SUPPORT + +if TRACING_SUPPORT + menu "Tracers" config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - depends on DEBUG_KERNEL select FRAME_POINTER select KALLSYMS select TRACING @@ -91,7 +101,6 @@ config IRQSOFF_TRACER default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME - depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -114,7 +123,6 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE help @@ -142,7 +150,6 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -152,7 +159,6 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" - depends on DEBUG_KERNEL select TRACING select MARKERS help @@ -161,7 +167,6 @@ config CONTEXT_SWITCH_TRACER config EVENT_TRACER bool "Trace various events in the kernel" - depends on DEBUG_KERNEL select TRACING help This tracer hooks to various trace points in the kernel @@ -170,7 +175,6 @@ config EVENT_TRACER config BOOT_TRACER bool "Trace boot initcalls" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER help @@ -188,7 +192,6 @@ config BOOT_TRACER config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" - depends on DEBUG_KERNEL select TRACING help This tracer profiles all the the likely and unlikely macros @@ -241,7 +244,6 @@ config BRANCH_TRACER config POWER_TRACER bool "Trace power consumption behavior" - depends on DEBUG_KERNEL depends on X86 select TRACING help @@ -253,7 +255,6 @@ config POWER_TRACER config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER - depends on DEBUG_KERNEL select FUNCTION_TRACER select STACKTRACE select KALLSYMS @@ -343,7 +344,6 @@ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE - depends on DEBUG_KERNEL default y help This option will modify all the calls to ftrace dynamically @@ -369,7 +369,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING && DEBUG_KERNEL + depends on TRACING select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup @@ -379,7 +379,7 @@ config FTRACE_STARTUP_TEST config MMIOTRACE bool "Memory mapped IO tracing" - depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + depends on HAVE_MMIOTRACE_SUPPORT && PCI select TRACING help Mmiotrace traces Memory Mapped I/O access and is meant for @@ -401,3 +401,6 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. endmenu + +endif # TRACING_SUPPORT + -- cgit v1.2.3-70-g09d2 From 422d3c7a577b15e1384c9d4e72a9540896b685fa Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 6 Mar 2009 10:40:53 +0900 Subject: tracing: current tip/master can't enable ftrace After commit 40ada30f9621fbd831ac2437b9a2a399aad34b00, "make menuconfig" doesn't display "Tracer" item. Following modification restores it. --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d733da5345..058d949a321 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -61,6 +61,7 @@ config TRACING_SUPPORT bool depends on TRACE_IRQFLAGS_SUPPORT depends on STACKTRACE_SUPPORT + default y if TRACING_SUPPORT -- cgit v1.2.3-70-g09d2 From 1427cdf0592368bdec57276edaf714040ee8744f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:47 +0100 Subject: tracing: infrastructure for supporting binary record Impact: save on memory for tracing Current tracers are typically using a struct(like struct ftrace_entry, struct ctx_switch_entry, struct special_entr etc...)to record a binary event. These structs can only record a their own kind of events. A new kind of tracer need a new struct and a lot of code too handle it. So we need a generic binary record for events. This infrastructure is for this purpose. [fweisbec@gmail.com: rebase against latest -tip, make it safe while sched tracing as reported by Steven Rostedt] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 3 ++ kernel/trace/Kconfig | 6 +++ kernel/trace/Makefile | 1 + kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++ kernel/trace/trace.h | 12 ++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 75 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 240 insertions(+) create mode 100644 kernel/trace/trace_bprintk.c (limited to 'kernel/trace/Kconfig') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 498769425eb..1c9cdca0258 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,6 +223,9 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +#endif /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 058d949a321..ad8d3617d0a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -97,6 +97,12 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. +config TRACE_BPRINTK + bool "Binary printk for tracing" + default y + depends on TRACING + select BINARY_PRINTF + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f44736c7574..46557ef4c37 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o +obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6144acf2b7..ff53509e19f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3792,6 +3792,62 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +/** + * trace_vbprintk - write binary msg to tracing buffer + * + * Caller must insure @fmt are valid when msg is in tracing buffer. + */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct bprintk_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (tracing_disabled || !trace_bprintk_enable) + return 0; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8beff03fda6..0f5077f8f95 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -124,6 +125,16 @@ struct print_entry { char buf[]; }; +struct bprintk_entry { + struct trace_entry ent; + unsigned long ip; + const char *fmt; + u32 buf[]; +}; +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_bprintk_enable; +#endif + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -285,6 +296,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c new file mode 100644 index 00000000000..1f8e532c3fb --- /dev/null +++ b/kernel/trace/trace_bprintk.c @@ -0,0 +1,87 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +/* binary printk basic */ +static DEFINE_MUTEX(btrace_mutex); +static int btrace_metadata_count; + +static inline void lock_btrace(void) +{ + mutex_lock(&btrace_mutex); +} + +static inline void unlock_btrace(void) +{ + mutex_unlock(&btrace_mutex); +} + +static void get_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count++; + unlock_btrace(); +} + +static void put_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count--; + unlock_btrace(); +} + +/* events tracer */ +int trace_bprintk_enable; + +static void start_bprintk_trace(struct trace_array *tr) +{ + get_btrace_metadata(); + tracing_reset_online_cpus(tr); + trace_bprintk_enable = 1; +} + +static void stop_bprintk_trace(struct trace_array *tr) +{ + trace_bprintk_enable = 0; + tracing_reset_online_cpus(tr); + put_btrace_metadata(); +} + +static int init_bprintk_trace(struct trace_array *tr) +{ + start_bprintk_trace(tr); + return 0; +} + +static struct tracer bprintk_trace __read_mostly = +{ + .name = "events", + .init = init_bprintk_trace, + .reset = stop_bprintk_trace, + .start = start_bprintk_trace, + .stop = stop_bprintk_trace, +}; + +static __init int init_bprintk(void) +{ + return register_tracer(&bprintk_trace); +} + +device_initcall(init_bprintk); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 306fef84c50..4ab71201862 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,6 +53,26 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +static int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + /** * trace_seq_puts - trace sequence printing of simple string * @s: trace sequence descriptor @@ -855,6 +875,60 @@ static struct trace_event trace_print_event = { .raw = trace_print_raw, }; +/* TRACE_BPRINTK */ +static enum print_line_t +trace_bprintk_print(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t +trace_bprintk_raw(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_bprintk_event = { + .type = TRACE_BPRINTK, + .trace = trace_bprintk_print, + .raw = trace_bprintk_raw, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -863,6 +937,7 @@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, + &trace_bprintk_event, NULL }; -- cgit v1.2.3-70-g09d2 From 769b0441f438c4bb4872cb8560eb6fe51bcc09ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Mar 2009 17:21:49 +0100 Subject: tracing/core: drop the old trace_printk() implementation in favour of trace_bprintk() Impact: faster and lighter tracing Now that we have trace_bprintk() which is faster and consume lesser memory than trace_printk() and has the same purpose, we can now drop the old implementation in favour of the binary one from trace_bprintk(), which means we move all the implementation of trace_bprintk() to trace_printk(), so the Api doesn't change except that we must now use trace_seq_bprintk() to print the TRACE_PRINT entries. Some changes result of this: - Previously, trace_bprintk depended of a single tracer and couldn't work without. This tracer has been dropped and the whole implementation of trace_printk() (like the module formats management) is now integrated in the tracing core (comes with CONFIG_TRACING), though we keep the file trace_printk (previously trace_bprintk.c) where we can find the module management. Thus we don't overflow trace.c - changes some parts to use trace_seq_bprintk() to print TRACE_PRINT entries. - change a bit trace_printk/trace_vprintk macros to support non-builtin formats constants, and fix 'const' qualifiers warnings. But this is all transparent for developers. - etc... V2: - Rebase against last changes - Fix mispell on the changelog V3: - Rebase against last changes (moving trace_printk() to kernel.h) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 25 ----- include/linux/kernel.h | 34 +++++- include/linux/module.h | 2 +- kernel/trace/Kconfig | 7 +- kernel/trace/Makefile | 2 +- kernel/trace/trace.c | 212 ++++++++++------------------------- kernel/trace/trace.h | 14 +-- kernel/trace/trace_bprintk.c | 154 ------------------------- kernel/trace/trace_functions_graph.c | 6 +- kernel/trace/trace_mmiotrace.c | 9 +- kernel/trace/trace_output.c | 70 ++---------- kernel/trace/trace_output.h | 2 + kernel/trace/trace_printk.c | 138 +++++++++++++++++++++++ 13 files changed, 262 insertions(+), 413 deletions(-) delete mode 100644 kernel/trace/trace_bprintk.c create mode 100644 kernel/trace/trace_printk.c (limited to 'kernel/trace/Kconfig') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1cc8ca453a9..e1583f2639b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,31 +223,6 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); -#ifdef CONFIG_TRACE_BPRINTK -extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); - -static inline void ____trace_bprintk_check_format(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); -static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} -#define __trace_bprintk_check_format(fmt, args...) \ -do { \ - if (0) \ - ____trace_bprintk_check_format(fmt, ##args); \ -} while (0) - -#define trace_bprintk(fmt, args...) \ -do { \ - static char *__attribute__((section("__trace_bprintk_fmt"))) \ - trace_bprintk_fmt = fmt; \ - __trace_bprintk_check_format(fmt, ##args); \ - __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ -} while (0) -#else -#define trace_bprintk trace_printk -#endif - /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aef15c4645..4e726b9a71e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -423,6 +423,16 @@ extern void ftrace_off_permanent(void); extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +static inline void __attribute__ ((format (printf, 1, 2))) +____trace_printk_check_format(const char *fmt, ...) +{ +} +#define __trace_printk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_printk_check_format(fmt, ##args); \ +} while (0) + /** * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing @@ -439,13 +449,31 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * Please refrain from leaving trace_printks scattered around in * your code. */ -# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) + +#define trace_printk(fmt, args...) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __trace_printk_check_format(fmt, ##args); \ + __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ +} while (0) + extern int __trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) + +#define ftrace_vprintk(fmt, vargs) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ +} while (0) + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); + extern void ftrace_dump(void); #else static inline void @@ -467,7 +495,7 @@ ftrace_vprintk(const char *fmt, va_list ap) return 0; } static inline void ftrace_dump(void) { } -#endif +#endif /* CONFIG_TRACING */ /* * Display an IP address in readable format. diff --git a/include/linux/module.h b/include/linux/module.h index 8cbec972d8e..22d9878e868 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,7 +329,7 @@ struct module unsigned int num_tracepoints; #endif -#ifdef CONFIG_TRACE_BPRINTK +#ifdef CONFIG_TRACING const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ad8d3617d0a..8e4a2a61cd7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -52,6 +52,7 @@ config TRACING select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS select NOP_TRACER + select BINARY_PRINTF # # Minimum requirements an architecture has to meet for us to @@ -97,12 +98,6 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. -config TRACE_BPRINTK - bool "Binary printk for tracing" - default y - depends on TRACING - select BINARY_PRINTF - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46557ef4c37..c7a2943796e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,7 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o -obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o +obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46b3cd7a575..cc94f864248 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1169,6 +1169,67 @@ void trace_graph_return(struct ftrace_graph_ret *trace) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/** + * trace_vprintk - write binary msg to tracing buffer + * + */ +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct print_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vprintk); + enum trace_file_type { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, @@ -1564,7 +1625,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_bprintf(s, field->fmt, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -3714,155 +3775,6 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) -{ - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; - struct print_entry *entry; - unsigned long irq_flags; - - if (tracing_disabled || tracing_selftest_running) - return 0; - - pc = preempt_count(); - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - __raw_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; - - size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->depth = depth; - - memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event); - - out_unlock: - __raw_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); - out: - preempt_enable_notrace(); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vprintk); - -int __trace_printk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_printk); - -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) -{ - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); -} -EXPORT_SYMBOL_GPL(__ftrace_vprintk); - -/** - * trace_vbprintk - write binary msg to tracing buffer - * - * Caller must insure @fmt are valid when msg is in tracing buffer. - */ -int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) -{ - static DEFINE_SPINLOCK(trace_buf_lock); - static u32 trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - struct bprintk_entry *entry; - unsigned long flags; - int resched; - int cpu, len = 0, size, pc; - - if (tracing_disabled || !trace_bprintk_enable) - return 0; - - pc = preempt_count(); - resched = ftrace_preempt_disable(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - spin_lock_irqsave(&trace_buf_lock, flags); - len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; - - size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, trace_buf, sizeof(u32) * len); - ring_buffer_unlock_commit(tr->buffer, event); - -out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); - -out: - ftrace_preempt_enable(resched); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vbprintk); - -int __trace_bprintk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!fmt) - return 0; - - va_start(ap, fmt); - ret = trace_vbprintk(ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_bprintk); - static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0f5077f8f95..6140922392c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,7 +20,6 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, - TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -120,16 +119,10 @@ struct userstack_entry { */ struct print_entry { struct trace_entry ent; - unsigned long ip; + unsigned long ip; int depth; - char buf[]; -}; - -struct bprintk_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; + const char *fmt; + u32 buf[]; }; #ifdef CONFIG_TRACE_BPRINTK extern int trace_bprintk_enable; @@ -296,7 +289,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ - IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c deleted file mode 100644 index f4c245a5cd3..00000000000 --- a/kernel/trace/trace_bprintk.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * trace binary printk - * - * Copyright (C) 2008 Lai Jiangshan - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -#ifdef CONFIG_MODULES - -/* binary printk basic */ -static DEFINE_MUTEX(btrace_mutex); -/* - * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt - * which are queued on trace_bprintk_fmt_list. - */ -static LIST_HEAD(trace_bprintk_fmt_list); - -struct trace_bprintk_fmt { - struct list_head list; - char fmt[0]; -}; - - -static inline void lock_btrace(void) -{ - mutex_lock(&btrace_mutex); -} - -static inline void unlock_btrace(void) -{ - mutex_unlock(&btrace_mutex); -} - - -static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) -{ - struct trace_bprintk_fmt *pos; - list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { - if (!strcmp(pos->fmt, fmt)) - return pos; - } - return NULL; -} - -static -void hold_module_trace_bprintk_format(const char **start, const char **end) -{ - const char **iter; - lock_btrace(); - for (iter = start; iter < end; iter++) { - struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); - if (tb_fmt) { - *iter = tb_fmt->fmt; - continue; - } - - tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) - + strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(tb_fmt->fmt, *iter); - *iter = tb_fmt->fmt; - } else - *iter = NULL; - } - unlock_btrace(); -} - -static int module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - if (mod->num_trace_bprintk_fmt) { - const char **start = mod->trace_bprintk_fmt_start; - const char **end = start + mod->num_trace_bprintk_fmt; - - if (val == MODULE_STATE_COMING) - hold_module_trace_bprintk_format(start, end); - } - return 0; -} - -#else /* !CONFIG_MODULES */ -__init static int -module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -#endif /* CONFIG_MODULES */ - - -__initdata_or_module static -struct notifier_block module_trace_bprintk_format_nb = { - .notifier_call = module_trace_bprintk_format_notify, -}; - -/* events tracer */ -int trace_bprintk_enable; - -static void start_bprintk_trace(struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); - trace_bprintk_enable = 1; -} - -static void stop_bprintk_trace(struct trace_array *tr) -{ - trace_bprintk_enable = 0; - tracing_reset_online_cpus(tr); -} - -static int init_bprintk_trace(struct trace_array *tr) -{ - start_bprintk_trace(tr); - return 0; -} - -static struct tracer bprintk_trace __read_mostly = -{ - .name = "events", - .init = init_bprintk_trace, - .reset = stop_bprintk_trace, - .start = start_bprintk_trace, - .stop = stop_bprintk_trace, -}; - -static __init int init_bprintk(void) -{ - int ret = register_module_notifier(&module_trace_bprintk_format_nb); - if (ret) - return ret; - - ret = register_tracer(&bprintk_trace); - if (ret) - unregister_module_notifier(&module_trace_bprintk_format_nb); - return ret; -} - -device_initcall(init_bprintk); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e527f2f66c7..453ebd3b636 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -742,7 +742,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, } /* The comment */ - ret = trace_seq_printf(s, "/* %s", trace->buf); + ret = trace_seq_printf(s, "/* "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, trace->fmt, trace->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c401b908e80..23e346a734c 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,15 +254,18 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; - const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, print->fmt, print->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4ab71201862..ef8fd661b21 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,8 +53,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } -static int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; int ret; @@ -834,54 +833,12 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) -{ - struct print_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) -{ - struct print_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, - .trace = trace_print_print, - .raw = trace_print_raw, -}; - -/* TRACE_BPRINTK */ static enum print_line_t -trace_bprintk_print(struct trace_iterator *iter, int flags) +trace_print_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; + struct print_entry *field; trace_assign_type(field, entry); @@ -900,14 +857,13 @@ trace_bprintk_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t -trace_bprintk_raw(struct trace_iterator *iter, int flags) + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) { - struct trace_entry *entry = iter->ent; + struct print_entry *field; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!trace_seq_printf(s, ": %lx : ", field->ip)) goto partial; @@ -921,12 +877,11 @@ trace_bprintk_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_bprintk_event = { - .type = TRACE_BPRINTK, - .trace = trace_bprintk_print, - .raw = trace_bprintk_raw, - .hex = trace_nop_print, - .binary = trace_nop_print, + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .trace = trace_print_print, + .raw = trace_print_raw, }; static struct trace_event *events[] __initdata = { @@ -937,7 +892,6 @@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, - &trace_bprintk_event, NULL }; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 8a34d688ed6..3b90e6ade1a 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -18,6 +18,8 @@ struct trace_event { extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c new file mode 100644 index 00000000000..a50aea22e92 --- /dev/null +++ b/kernel/trace/trace_printk.c @@ -0,0 +1,138 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#ifdef CONFIG_MODULES + +/* + * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +/* serialize accesses to trace_bprintk_fmt_list */ +static DEFINE_MUTEX(btrace_mutex); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) +{ + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; +} + +static +void hold_module_trace_bprintk_format(const char **start, const char **end) +{ + const char **iter; + + mutex_lock(&btrace_mutex); + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } + mutex_unlock(&btrace_mutex); +} + +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + +int __trace_printk(unsigned long ip, const char *fmt, ...) + { + int ret; + va_list ap; + + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) + { + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + + +static __init int init_trace_printk(void) +{ + return register_module_notifier(&module_trace_bprintk_format_nb); +} + +early_initcall(init_trace_printk); -- cgit v1.2.3-70-g09d2 From ee08c6eccb7d1295516f7cf420fddf7b14e9146f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Mar 2009 05:52:59 +0100 Subject: tracing/ftrace: syscall tracing infrastructure, basics Provide basic callbacks to do syscall tracing. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-2-git-send-email-fweisbec@gmail.com> [ simplified it to a trace_printk() for now. ] Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 21 ++++++++ kernel/trace/Kconfig | 10 ++++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 2 + kernel/trace/trace_syscalls.c | 113 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+) create mode 100644 kernel/trace/trace_syscalls.c (limited to 'kernel/trace/Kconfig') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e1583f2639b..c146c1021a2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -503,4 +503,25 @@ static inline void trace_hw_branch_oops(void) {} #endif /* CONFIG_HW_BRANCH_TRACER */ +/* + * A syscall entry in the ftrace syscalls array. + * + * @syscall_nr: syscall number + */ +struct syscall_trace_entry { + int syscall_nr; +}; + +#ifdef CONFIG_FTRACE_SYSCALLS +extern void start_ftrace_syscalls(void); +extern void stop_ftrace_syscalls(void); +extern void ftrace_syscall_enter(struct pt_regs *regs); +extern void ftrace_syscall_exit(struct pt_regs *regs); +#else +static inline void start_ftrace_syscalls(void) { } +static inline void stop_ftrace_syscalls(void) { } +static inline void ftrace_syscall_enter(struct pt_regs *regs) { } +static inline void ftrace_syscall_exit(struct pt_regs *regs) { } +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8e4a2a61cd7..95a0ad191f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -34,6 +34,9 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool +config HAVE_FTRACE_SYSCALLS + bool + config TRACER_MAX_TRACE bool @@ -175,6 +178,13 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. +config FTRACE_SYSCALLS + bool "Trace syscalls" + depends on HAVE_FTRACE_SYSCALLS + select TRACING + help + Basic tracer to catch the syscall entry and exit events. + config BOOT_TRACER bool "Trace boot initcalls" select TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7a2943796e..c3feea01c3e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -43,5 +43,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5e1d8865fe..3d49daae47d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -30,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_SYSCALL_ENTER, + TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c new file mode 100644 index 00000000000..66cf97449af --- /dev/null +++ b/kernel/trace/trace_syscalls.c @@ -0,0 +1,113 @@ +#include +#include + +#include + +#include "trace_output.h" +#include "trace.h" + +static atomic_t refcount; + +void start_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_inc_return(&refcount) != 1) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_dec(&refcount); +} + +void stop_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_dec_return(&refcount)) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_inc(&refcount); +} + +void ftrace_syscall_enter(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d enter\n", syscall_nr); +} + +void ftrace_syscall_exit(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d exit\n", syscall_nr); +} + +static int init_syscall_tracer(struct trace_array *tr) +{ + start_ftrace_syscalls(); + + return 0; +} + +static void reset_syscall_tracer(struct trace_array *tr) +{ + stop_ftrace_syscalls(); +} + +static struct trace_event syscall_enter_event = { + .type = TRACE_SYSCALL_ENTER, +}; + +static struct trace_event syscall_exit_event = { + .type = TRACE_SYSCALL_EXIT, +}; + +static struct tracer syscall_tracer __read_mostly = { + .name = "syscall", + .init = init_syscall_tracer, + .reset = reset_syscall_tracer +}; + +__init int register_ftrace_syscalls(void) +{ + int ret; + + ret = register_ftrace_event(&syscall_enter_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_enter_event.type); + WARN_ON_ONCE(1); + } + + ret = register_ftrace_event(&syscall_exit_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_exit_event.type); + WARN_ON_ONCE(1); + } + + return register_tracer(&syscall_tracer); +} +device_initcall(register_ftrace_syscalls); -- cgit v1.2.3-70-g09d2 From 0ea1c4156bf9e2eb370cc5c6fa6eb112bd844dec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:38 +0100 Subject: tracing/syscalls: select kallsysms Syscall tracing must select kallsysms. The arch code builds a table to find the syscall metadata by syscall number. It needs the syscalls names resolution from the symbol table to know which name found on the syscalls metadatas match a function pointer from the arch sys_call_table. Reported-by: Andrew Morton Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95a0ad191f1..b0a46f88965 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -182,6 +182,7 @@ config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS select TRACING + select KALLSYMS help Basic tracer to catch the syscall entry and exit events. -- cgit v1.2.3-70-g09d2 From 45b9560895b07a4a09d55d49235c984db512c5aa Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 24 Mar 2009 01:07:24 +0300 Subject: tracing: Fix TRACING_SUPPORT dependency for PPC32 commit 40ada30f9621fbd831ac2437b9a2a399aa ("tracing: clean up menu"), despite the "clean up" in its purpose, introduced a behavioural change for Kconfig symbols: we no longer able to select tracing support on PPC32 (because IRQFLAGS_SUPPORT isn't yet implemented). The IRQFLAGS_SUPPORT is not mandatory for most tracers, tracing core has a special case for platforms w/o irqflags (which, by the way, has become useless as of the commit above). Though according to Ingo Molnar, there was periodic build failures on weird, unmaintained architectures that had no irqflags-tracing support and hence didn't know the raw_irqs_save/restore primitives. Thus we'd better not enable irqflags-less tracing for all architectures. This patch restores the old behaviour for PPC32, and thus brings the tracing back. Other architectures can either add themselves to the exception list or (better) implement TRACE_IRQFLAGS_SUPPORT. Signed-off-by: Anton Vorontsov Acked-b: Steven Rostedt Cc: linuxppc-dev@ozlabs.org LKML-Reference: <20090323220724.GA9851@oksana.dev.rtsoft.ru> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/trace/Kconfig') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b0a46f88965..8a4d7293104 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -63,7 +63,11 @@ config TRACING # config TRACING_SUPPORT bool - depends on TRACE_IRQFLAGS_SUPPORT + # PPC32 has no irqflags tracing support, but it can use most of the + # tracers anyway, they were tested to build and work. Note that new + # exceptions to this list aren't welcomed, better implement the + # irqflags tracing for your architecture. + depends on TRACE_IRQFLAGS_SUPPORT || PPC32 depends on STACKTRACE_SUPPORT default y -- cgit v1.2.3-70-g09d2