From ec156764d424dd67283c2cd5e9f6f1b8388364ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Sep 2009 12:12:54 +0200 Subject: perf sched: Import schedbench.c Import the schedbench.c tool that i wrote some time ago to simulate scheduler behavior but never finished. It's a good basis for perf sched nevertheless. Most of its guts are not hooked up to the perf event loop yet - that will be done in the patches to come. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/util/trace-event-parse.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools/perf/util/trace-event-parse.c') diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 629e602d940..16cf2d51c4e 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -1799,7 +1799,7 @@ static int get_common_info(const char *type, int *offset, int *size) return 0; } -static int parse_common_type(void *data) +int trace_parse_common_type(void *data) { static int type_offset; static int type_size; @@ -1832,7 +1832,7 @@ static int parse_common_pid(void *data) return read_size(data + pid_offset, pid_size); } -static struct event *find_event(int id) +struct event *trace_find_event(int id) { struct event *event; @@ -2420,8 +2420,8 @@ get_return_for_leaf(int cpu, int cur_pid, unsigned long long cur_func, int type; int pid; - type = parse_common_type(next->data); - event = find_event(type); + type = trace_parse_common_type(next->data); + event = trace_find_event(type); if (!event) return NULL; @@ -2502,8 +2502,8 @@ print_graph_entry_leaf(struct event *event, void *data, struct record *ret_rec) int type; int i; - type = parse_common_type(ret_rec->data); - ret_event = find_event(type); + type = trace_parse_common_type(ret_rec->data); + ret_event = trace_find_event(type); field = find_field(ret_event, "rettime"); if (!field) @@ -2696,9 +2696,9 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs, nsecs -= secs * NSECS_PER_SEC; usecs = nsecs / NSECS_PER_USEC; - type = parse_common_type(data); + type = trace_parse_common_type(data); - event = find_event(type); + event = trace_find_event(type); if (!event) die("ug! no event found for type %d", type); -- cgit v1.2.3-70-g09d2 From 46538818023e8ea94f656acfa1e38297e2df20e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 12 Sep 2009 02:43:45 +0200 Subject: perf sched: Fix bad event alignment perf sched raises the following error when it meets a sched switch event: perf: builtin-sched.c:286: register_pid: Assertion `!(pid >= 65536)' failed. Abandon Currently in x86-64, the sched switch events have a hole in the middle of the structure: u16 common_type; u8 common_flags; u8 common_preempt_count; u32 common_pid; u32 common_tgid; char prev_comm[16]; u32 prev_pid; u32 prev_prio; <--- there u64 prev_state; char next_comm[16]; u32 next_pid; u32 next_prio; Gcc inserts a 4 bytes hole there for prev_state to be u64 aligned. And the events are exported to userspace with this hole. But in userspace, from perf sched, we fetch it using a structure that has a new field in the beginning: u32 size. This is because our trace is exported with its size as a field. But now that we have this new field, the hole in the middle disappears because it makes prev_state becoming well aligned. And since we are using a pointer to the raw trace using this struct, instead of reading prev_state, we are reading the hole. We could fix it by keeping the size seperate from the struct but actually there a lot of other potential problems: some fields may be saved as long in a 64 bits system and later read as long in a 32 bits system. Also this direct cast doesn't care about the endianness differences between the host traced machine and the machine in which we do the post processing. So instead of using such dangerous direct casts, fetch the values using the trace parsing API that already takes care of all these problems. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 101 +++++++++++++++++++++++++++--------- tools/perf/util/trace-event-parse.c | 23 ++++++++ tools/perf/util/trace-event.h | 3 ++ 3 files changed, 102 insertions(+), 25 deletions(-) (limited to 'tools/perf/util/trace-event-parse.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index de93a260452..0215936696e 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -653,6 +653,30 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) return 0; } + +struct raw_event_sample { + u32 size; + char data[0]; +}; + +#define FILL_FIELD(ptr, field, event, data) \ + ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data) + +#define FILL_ARRAY(ptr, array, event, data) \ +do { \ + void *__array = raw_field_ptr(event, #array, data); \ + memcpy(ptr.array, __array, sizeof(ptr.array)); \ +} while(0) + +#define FILL_COMMON_FIELDS(ptr, event, data) \ +do { \ + FILL_FIELD(ptr, common_type, event, data); \ + FILL_FIELD(ptr, common_flags, event, data); \ + FILL_FIELD(ptr, common_preempt_count, event, data); \ + FILL_FIELD(ptr, common_pid, event, data); \ + FILL_FIELD(ptr, common_tgid, event, data); \ +} while (0) + struct trace_wakeup_event { u32 size; @@ -671,22 +695,32 @@ struct trace_wakeup_event { }; static void -process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *event, +process_sched_wakeup_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { struct task_desc *waker, *wakee; + struct trace_wakeup_event wakeup_event; + + FILL_COMMON_FIELDS(wakeup_event, event, raw->data); + + FILL_ARRAY(wakeup_event, comm, event, raw->data); + FILL_FIELD(wakeup_event, pid, event, raw->data); + FILL_FIELD(wakeup_event, prio, event, raw->data); + FILL_FIELD(wakeup_event, success, event, raw->data); + FILL_FIELD(wakeup_event, cpu, event, raw->data); + if (verbose) { printf("sched_wakeup event %p\n", event); printf(" ... pid %d woke up %s/%d\n", - wakeup_event->common_pid, - wakeup_event->comm, - wakeup_event->pid); + wakeup_event.common_pid, + wakeup_event.comm, + wakeup_event.pid); } - waker = register_pid(wakeup_event->common_pid, ""); - wakee = register_pid(wakeup_event->pid, wakeup_event->comm); + waker = register_pid(wakeup_event.common_pid, ""); + wakee = register_pid(wakeup_event.pid, wakeup_event.comm); add_sched_event_wakeup(waker, timestamp, wakee); } @@ -714,13 +748,24 @@ struct trace_switch_event { unsigned long cpu_last_switched[MAX_CPUS]; static void -process_sched_switch_event(struct trace_switch_event *switch_event, struct event *event, +process_sched_switch_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { + struct trace_switch_event switch_event; struct task_desc *prev, *next; u64 timestamp0; s64 delta; + FILL_COMMON_FIELDS(switch_event, event, raw->data); + + FILL_ARRAY(switch_event, prev_comm, event, raw->data); + FILL_FIELD(switch_event, prev_pid, event, raw->data); + FILL_FIELD(switch_event, prev_prio, event, raw->data); + FILL_FIELD(switch_event, prev_state, event, raw->data); + FILL_ARRAY(switch_event, next_comm, event, raw->data); + FILL_FIELD(switch_event, next_pid, event, raw->data); + FILL_FIELD(switch_event, next_prio, event, raw->data); + if (verbose) printf("sched_switch event %p\n", event); @@ -738,18 +783,18 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event if (verbose) { printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n", - switch_event->prev_comm, switch_event->prev_pid, - switch_event->next_comm, switch_event->next_pid, + switch_event.prev_comm, switch_event.prev_pid, + switch_event.next_comm, switch_event.next_pid, delta); } - prev = register_pid(switch_event->prev_pid, switch_event->prev_comm); - next = register_pid(switch_event->next_pid, switch_event->next_comm); + prev = register_pid(switch_event.prev_pid, switch_event.prev_comm); + next = register_pid(switch_event.next_pid, switch_event.next_comm); cpu_last_switched[cpu] = timestamp; add_sched_event_run(prev, timestamp, delta); - add_sched_event_sleep(prev, timestamp, switch_event->prev_state); + add_sched_event_sleep(prev, timestamp, switch_event.prev_state); } struct trace_fork_event { @@ -768,16 +813,25 @@ struct trace_fork_event { }; static void -process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event, +process_sched_fork_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { + struct trace_fork_event fork_event; + + FILL_COMMON_FIELDS(fork_event, event, raw->data); + + FILL_ARRAY(fork_event, parent_comm, event, raw->data); + FILL_FIELD(fork_event, parent_pid, event, raw->data); + FILL_ARRAY(fork_event, child_comm, event, raw->data); + FILL_FIELD(fork_event, child_pid, event, raw->data); + if (verbose) { printf("sched_fork event %p\n", event); - printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid); - printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid); + printf("... parent: %s/%d\n", fork_event.parent_comm, fork_event.parent_pid); + printf("... child: %s/%d\n", fork_event.child_comm, fork_event.child_pid); } - register_pid(fork_event->parent_pid, fork_event->parent_comm); - register_pid(fork_event->child_pid, fork_event->child_comm); + register_pid(fork_event.parent_pid, fork_event.parent_comm); + register_pid(fork_event.child_pid, fork_event.child_comm); } static void process_sched_exit_event(struct event *event, @@ -791,10 +845,7 @@ static void process_raw_event(event_t *raw_event __used, void *more_data, int cpu, u64 timestamp, struct thread *thread) { - struct { - u32 size; - char data[0]; - } *raw = more_data; + struct raw_event_sample *raw = more_data; struct event *event; int type; @@ -802,13 +853,13 @@ process_raw_event(event_t *raw_event __used, void *more_data, event = trace_find_event(type); if (!strcmp(event->name, "sched_switch")) - process_sched_switch_event(more_data, event, cpu, timestamp, thread); + process_sched_switch_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup")) - process_sched_wakeup_event(more_data, event, cpu, timestamp, thread); + process_sched_wakeup_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup_new")) - process_sched_wakeup_event(more_data, event, cpu, timestamp, thread); + process_sched_wakeup_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_fork")) - process_sched_fork_event(more_data, event, cpu, timestamp, thread); + process_sched_fork_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_exit")) process_sched_exit_event(event, cpu, timestamp, thread); } diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 16cf2d51c4e..64d6e302751 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -1776,6 +1776,29 @@ static unsigned long long read_size(void *ptr, int size) } } +unsigned long long +raw_field_value(struct event *event, const char *name, void *data) +{ + struct format_field *field; + + field = find_any_field(event, name); + if (!field) + return 0ULL; + + return read_size(data + field->offset, field->size); +} + +void *raw_field_ptr(struct event *event, const char *name, void *data) +{ + struct format_field *field; + + field = find_any_field(event, name); + if (!field) + return NULL; + + return data + field->offset; +} + static int get_common_info(const char *type, int *offset, int *size) { struct event *event; diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index bc81612fd24..d35ebf1e29f 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -236,6 +236,9 @@ extern int header_page_data_size; int parse_header_page(char *buf, unsigned long size); int trace_parse_common_type(void *data); struct event *trace_find_event(int id); +unsigned long long +raw_field_value(struct event *event, const char *name, void *data); +void *raw_field_ptr(struct event *event, const char *name, void *data); void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters); -- cgit v1.2.3-70-g09d2 From ea57c4f5203d82c7844c54cdef54e972cf4e9d1f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Sep 2009 18:15:54 +0200 Subject: perf tools: Implement counter output multiplexing Finish the -M/--multiplex option implementation: - separate it out from group_fd - correctly set it via the ioctl and dont mmap counters that are multiplexed - modify the perf record event loop to deal with buffer-less counters. - remove the -g option from perf sched record - account for unordered events in perf sched latency - (add -f to perf sched record to ease measurements) - skip idle threads (pid==0) in latency output The result is better latency output by 'perf sched latency': ----------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | ----------------------------------------------------------------------------------- ksoftirqd/8 | 0.071 ms | 2 | avg: 0.458 ms | max: 0.913 ms | at-spi-registry | 0.609 ms | 19 | avg: 0.013 ms | max: 0.023 ms | perf | 3.316 ms | 16 | avg: 0.013 ms | max: 0.054 ms | Xorg | 0.392 ms | 19 | avg: 0.011 ms | max: 0.018 ms | sleep | 0.537 ms | 2 | avg: 0.009 ms | max: 0.009 ms | ----------------------------------------------------------------------------------- TOTAL: | 4.925 ms | 58 | --------------------------------------------- Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-record.c | 43 ++++++++++++++++++++++--------------- tools/perf/builtin-sched.c | 25 ++++++++++++++++++--- tools/perf/util/trace-event-parse.c | 6 ++++-- 3 files changed, 52 insertions(+), 22 deletions(-) (limited to 'tools/perf/util/trace-event-parse.c') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 79f99dba5be..5f3127e7a61 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -49,6 +49,7 @@ static int inherit_stat = 0; static int no_samples = 0; static int sample_address = 0; static int multiplex = 0; +static int multiplex_fd = -1; static long samples; static struct timeval last_read; @@ -471,23 +472,29 @@ try_again: */ if (group && group_fd == -1) group_fd = fd[nr_cpu][counter]; + if (multiplex && multiplex_fd == -1) + multiplex_fd = fd[nr_cpu][counter]; - event_array[nr_poll].fd = fd[nr_cpu][counter]; - event_array[nr_poll].events = POLLIN; - nr_poll++; - - mmap_array[nr_cpu][counter].counter = counter; - mmap_array[nr_cpu][counter].prev = 0; - mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1; - mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size, - PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0); - if (mmap_array[nr_cpu][counter].base == MAP_FAILED) { - error("failed to mmap with %d (%s)\n", errno, strerror(errno)); - exit(-1); - } + if (multiplex && fd[nr_cpu][counter] != multiplex_fd) { + int ret; - if (multiplex && fd[nr_cpu][counter] != group_fd) - ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, group_fd); + ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd); + assert(ret != -1); + } else { + event_array[nr_poll].fd = fd[nr_cpu][counter]; + event_array[nr_poll].events = POLLIN; + nr_poll++; + + mmap_array[nr_cpu][counter].counter = counter; + mmap_array[nr_cpu][counter].prev = 0; + mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1; + mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size, + PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0); + if (mmap_array[nr_cpu][counter].base == MAP_FAILED) { + error("failed to mmap with %d (%s)\n", errno, strerror(errno)); + exit(-1); + } + } ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE); } @@ -618,8 +625,10 @@ static int __cmd_record(int argc, const char **argv) int hits = samples; for (i = 0; i < nr_cpu; i++) { - for (counter = 0; counter < nr_counters; counter++) - mmap_read(&mmap_array[i][counter]); + for (counter = 0; counter < nr_counters; counter++) { + if (mmap_array[i][counter].base) + mmap_read(&mmap_array[i][counter]); + } } if (hits == samples) { diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 3e003237c42..2ce87ef5a3e 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -116,6 +116,8 @@ static u64 sum_fluct; static u64 run_avg; static unsigned long replay_repeat = 10; +static unsigned long nr_timestamps; +static unsigned long unordered_timestamps; #define TASK_STATE_TO_CHAR_STR "RSDTtZX" @@ -1109,8 +1111,11 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, if (atom->state != THREAD_SLEEPING) return; - if (atom->sched_out_time > timestamp) + nr_timestamps++; + if (atom->sched_out_time > timestamp) { + unordered_timestamps++; return; + } atom->state = THREAD_WAIT_CPU; atom->wake_up_time = timestamp; @@ -1130,6 +1135,11 @@ static void output_lat_thread(struct task_atoms *atom_list) if (!atom_list->nb_atoms) return; + /* + * Ignore idle threads: + */ + if (!atom_list->thread->pid) + return; all_runtime += atom_list->total_runtime; all_count += atom_list->nb_atoms; @@ -1301,8 +1311,16 @@ static void __cmd_lat(void) } printf("-----------------------------------------------------------------------------------\n"); - printf(" TOTAL: |%9.3f ms |%9Ld |\n", + printf(" TOTAL: |%9.3f ms |%9Ld |", (double)all_runtime/1e6, all_count); + + if (unordered_timestamps && nr_timestamps) { + printf(" INFO: %.2f%% unordered events.\n", + (double)unordered_timestamps/(double)nr_timestamps*100.0); + } else { + printf("\n"); + } + printf("---------------------------------------------\n"); } @@ -1667,12 +1685,13 @@ static const char *record_args[] = { "-a", "-R", "-M", - "-g", + "-f", "-c", "1", "-e", "sched:sched_switch:r", "-e", "sched:sched_stat_wait:r", "-e", "sched:sched_stat_sleep:r", "-e", "sched:sched_stat_iowait:r", + "-e", "sched:sched_stat_runtime:r", "-e", "sched:sched_process_exit:r", "-e", "sched:sched_process_fork:r", "-e", "sched:sched_wakeup:r", diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 64d6e302751..f6a8437141c 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -2722,8 +2722,10 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs, type = trace_parse_common_type(data); event = trace_find_event(type); - if (!event) - die("ug! no event found for type %d", type); + if (!event) { + printf("ug! no event found for type %d\n", type); + return; + } pid = parse_common_pid(data); -- cgit v1.2.3-70-g09d2