Diffstat (limited to 'tools/perf/builtin-stat.c')
-rw-r--r-- | tools/perf/builtin-stat.c | 1075 |
1 file changed, 842 insertions(+), 233 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a6b4d44f950..a9f06715e44 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -6,24 +6,28 @@
  *
  * Sample output:
 
-   $ perf stat ~/hackbench 10
-   Time: 0.104
+   $ perf stat ./hackbench 10
 
-   Performance counter stats for '/home/mingo/hackbench':
+   Time: 0.118
 
-      1255.538611  task clock ticks     #      10.143 CPU utilization factor
-            54011  context switches     #       0.043 M/sec
-              385  CPU migrations       #       0.000 M/sec
-            17755  pagefaults           #       0.014 M/sec
-       3808323185  CPU cycles           #    3033.219 M/sec
-       1575111190  instructions         #    1254.530 M/sec
-         17367895  cache references     #      13.833 M/sec
-          7674421  cache misses         #       6.112 M/sec
+   Performance counter stats for './hackbench 10':
 
-   Wall-clock time elapsed:   123.786620 msecs
+       1708.761321 task-clock                #   11.037 CPUs utilized
+            41,190 context-switches          #    0.024 M/sec
+             6,735 CPU-migrations             #    0.004 M/sec
+            17,318 page-faults               #    0.010 M/sec
+     5,205,202,243 cycles                    #    3.046 GHz
+     3,856,436,920 stalled-cycles-frontend   #   74.09% frontend cycles idle
+     1,600,790,871 stalled-cycles-backend    #   30.75% backend cycles idle
+     2,603,501,247 instructions              #    0.50  insns per cycle
+                                             #    1.48  stalled cycles per insn
+       484,357,498 branches                  #  283.455 M/sec
+         6,388,934 branch-misses             #    1.32% of all branches
+
+        0.154822978  seconds time elapsed
  *
- * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
  *
  * Improvements and fixes by:
  *
@@ -43,15 +47,21 @@
 #include "util/parse-options.h"
 #include "util/parse-events.h"
 #include "util/event.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
 #include "util/debug.h"
+#include "util/color.h"
 #include "util/header.h"
 #include "util/cpumap.h"
 #include "util/thread.h"
+#include "util/thread_map.h"
 
 #include <sys/prctl.h>
 #include <math.h>
 #include <locale.h>
 
+#define DEFAULT_SEPARATOR    " "
+
 static struct perf_event_attr default_attrs[] = {
 
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
@@ -60,34 +70,127 @@ static struct perf_event_attr default_attrs[] = {
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
 
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES },
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
 
 };
 
+/*
+ * Detailed stats (-d), covering the L1 and last level data caches:
+ */
+static struct perf_event_attr detailed_attrs[] = {
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_L1D            <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_ACCESS  << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_L1D            <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_MISS    << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_LL             <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_ACCESS  << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_LL             <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_MISS    << 16) },
+};
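For readers who have not memorized the PERF_TYPE_HW_CACHE ABI: each .config above packs three enums into a single u64 -- the cache in bits 0-7, the operation in bits 8-15, and the access result in bits 16-23 -- which is exactly what the shifted-OR initializers spell out. A minimal standalone sketch of that encoding (the CACHE_CONFIG helper is ours, not the kernel's):

    #include <linux/perf_event.h>

    /* Pack (cache, op, result) the way perf_event_attr.config expects. */
    #define CACHE_CONFIG(cache, op, result) \
            ((cache) | ((op) << 8) | ((result) << 16))

    /* L1-dcache read misses, equivalent to the second detailed_attrs entry: */
    static struct perf_event_attr l1d_read_miss = {
            .type   = PERF_TYPE_HW_CACHE,
            .config = CACHE_CONFIG(PERF_COUNT_HW_CACHE_L1D,
                                   PERF_COUNT_HW_CACHE_OP_READ,
                                   PERF_COUNT_HW_CACHE_RESULT_MISS),
    };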
+/*
+ * Very detailed stats (-d -d), covering the instruction cache and the TLB caches:
+ */
+static struct perf_event_attr very_detailed_attrs[] = {
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_L1I            <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_ACCESS  << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_L1I            <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_MISS    << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_DTLB           <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_ACCESS  << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_DTLB           <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_MISS    << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_ITLB           <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_ACCESS  << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_ITLB           <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_READ        <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_MISS    << 16) },
+
+};
+
+/*
+ * Very, very detailed stats (-d -d -d), adding prefetch events:
+ */
+static struct perf_event_attr very_very_detailed_attrs[] = {
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_L1D            <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_PREFETCH    <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_ACCESS  << 16) },
+
+  { .type = PERF_TYPE_HW_CACHE,
+    .config =
+     PERF_COUNT_HW_CACHE_L1D            <<  0  |
+    (PERF_COUNT_HW_CACHE_OP_PREFETCH    <<  8) |
+    (PERF_COUNT_HW_CACHE_RESULT_MISS    << 16) },
+};
+
+
+
+struct perf_evlist    *evsel_list;
+
 static bool    system_wide    = false;
 
-static int     nr_cpus        = 0;
 static int     run_idx        = 0;
 
 static int     run_count      = 1;
 static bool    no_inherit     = false;
 static bool    scale          = true;
+static bool    no_aggr        = false;
 static pid_t   target_pid     = -1;
 static pid_t   target_tid     = -1;
-static pid_t   *all_tids      = NULL;
-static int     thread_num     = 0;
 static pid_t   child_pid      = -1;
 static bool    null_run       = false;
-static bool    big_num        = false;
+static int     detailed_run   = 0;
+static bool    sync_run       = false;
+static bool    big_num        = true;
+static int     big_num_opt    = -1;
 static const char *cpu_list;
-
-
-static int     *fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static int     event_scaled[MAX_COUNTERS];
+static const char *csv_sep    = NULL;
+static bool    csv_output     = false;
 
 static volatile int done = 0;
 
@@ -96,6 +199,22 @@ struct stats
     double n, mean, M2;
 };
 
+struct perf_stat {
+    struct stats res_stats[3];
+};
+
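The struct stats triple (n, mean, M2) above is Welford's streaming mean/variance recurrence, which update_stats()/stddev_stats() further down maintain across --repeat runs. A self-contained sketch of what those two helpers compute, assuming nothing beyond libc (names ours):

    #include <math.h>

    struct stats { double n, mean, M2; };

    /* Welford's online update: one pass, no stored samples. */
    static void stats_update(struct stats *s, double val)
    {
            double delta;

            s->n += 1.0;
            delta = val - s->mean;
            s->mean += delta / s->n;
            s->M2 += delta * (val - s->mean);
    }

    /* Standard deviation of the mean over n runs: sqrt(M2 / (n - 1) / n). */
    static double stats_stddev_mean(struct stats *s)
    {
            return sqrt(s->M2 / (s->n - 1.0) / s->n);
    }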
+static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
+{
+    evsel->priv = zalloc(sizeof(struct perf_stat));
+    return evsel->priv == NULL ? -ENOMEM : 0;
+}
+
+static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
+{
+    free(evsel->priv);
+    evsel->priv = NULL;
+}
+
 static void update_stats(struct stats *stats, u64 val)
 {
     double delta;
@@ -135,154 +254,143 @@ static double stddev_stats(struct stats *stats)
     return sqrt(variance_mean);
 }
 
-struct stats    event_res_stats[MAX_COUNTERS][3];
-struct stats    runtime_nsecs_stats;
+struct stats    runtime_nsecs_stats[MAX_NR_CPUS];
+struct stats    runtime_cycles_stats[MAX_NR_CPUS];
+struct stats    runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
+struct stats    runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
+struct stats    runtime_branches_stats[MAX_NR_CPUS];
+struct stats    runtime_cacherefs_stats[MAX_NR_CPUS];
+struct stats    runtime_l1_dcache_stats[MAX_NR_CPUS];
+struct stats    runtime_l1_icache_stats[MAX_NR_CPUS];
+struct stats    runtime_ll_cache_stats[MAX_NR_CPUS];
+struct stats    runtime_itlb_cache_stats[MAX_NR_CPUS];
+struct stats    runtime_dtlb_cache_stats[MAX_NR_CPUS];
 struct stats    walltime_nsecs_stats;
-struct stats    runtime_cycles_stats;
-struct stats    runtime_branches_stats;
-
-#define MATCH_EVENT(t, c, counter)            \
-    (attrs[counter].type == PERF_TYPE_##t &&  \
-     attrs[counter].config == PERF_COUNT_##c)
 
-#define ERR_PERF_OPEN \
-"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
-
-static int create_perf_stat_counter(int counter)
+static int create_perf_stat_counter(struct perf_evsel *evsel)
 {
-    struct perf_event_attr *attr = attrs + counter;
-    int thread;
-    int ncreated = 0;
+    struct perf_event_attr *attr = &evsel->attr;
 
     if (scale)
         attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                             PERF_FORMAT_TOTAL_TIME_RUNNING;
 
-    if (system_wide) {
-        int cpu;
-
-        for (cpu = 0; cpu < nr_cpus; cpu++) {
-            fd[cpu][counter][0] = sys_perf_event_open(attr,
-                    -1, cpumap[cpu], -1, 0);
-            if (fd[cpu][counter][0] < 0)
-                pr_debug(ERR_PERF_OPEN, counter,
-                         fd[cpu][counter][0], strerror(errno));
-            else
-                ++ncreated;
-        }
-    } else {
-        attr->inherit = !no_inherit;
-        if (target_pid == -1 && target_tid == -1) {
-            attr->disabled = 1;
-            attr->enable_on_exec = 1;
-        }
-        for (thread = 0; thread < thread_num; thread++) {
-            fd[0][counter][thread] = sys_perf_event_open(attr,
-                all_tids[thread], -1, -1, 0);
-            if (fd[0][counter][thread] < 0)
-                pr_debug(ERR_PERF_OPEN, counter,
-                         fd[0][counter][thread],
-                         strerror(errno));
-            else
-                ++ncreated;
-        }
+    attr->inherit = !no_inherit;
+
+    if (system_wide)
+        return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false);
+
+    if (target_pid == -1 && target_tid == -1) {
+        attr->disabled = 1;
+        attr->enable_on_exec = 1;
     }
 
-    return ncreated;
+    return perf_evsel__open_per_thread(evsel, evsel_list->threads, false);
 }
 
 /*
  * Does the counter have nsecs as a unit?
  */
-static inline int nsec_counter(int counter)
+static inline int nsec_counter(struct perf_evsel *evsel)
 {
-    if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) ||
-        MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
+    if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
+        perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
         return 1;
 
     return 0;
 }
 
 /*
+ * Update various tracking values we maintain to print
+ * more semantic information such as miss/hit ratios,
+ * instruction rates, etc:
+ */
+static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
+{
+    if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+        update_stats(&runtime_nsecs_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
+        update_stats(&runtime_cycles_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
+        update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
+        update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
+        update_stats(&runtime_branches_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
+        update_stats(&runtime_cacherefs_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
+        update_stats(&runtime_l1_dcache_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
+        update_stats(&runtime_l1_icache_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
+        update_stats(&runtime_ll_cache_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
+        update_stats(&runtime_dtlb_cache_stats[0], count[0]);
+    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
+        update_stats(&runtime_itlb_cache_stats[0], count[0]);
+}
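These shadow totals exist only so later printouts can turn raw counts into rates: instructions get divided by the cycles recorded here, cycles by task-clock nanoseconds, and each miss count by its matching access count. A toy illustration of the first two derivations, reusing the numbers from the sample output in the header comment:

    #include <stdio.h>

    int main(void)
    {
            /* Totals as update_shadow_stats() would have recorded them: */
            double instructions  = 2603501247.0;
            double cycles        = 5205202243.0;
            double task_clock_ns = 1708761321.0;

            /* The headline ratios perf stat prints next to the raw counts: */
            printf("%.2f insns per cycle\n", instructions / cycles); /* 0.50  */
            printf("%.3f GHz\n", cycles / task_clock_ns);            /* 3.046 */
            return 0;
    }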
 
+/*
  * Read out the results of a single counter:
+ * aggregate counts across CPUs in system-wide mode
  */
-static void read_counter(int counter)
+static int read_counter_aggr(struct perf_evsel *counter)
 {
-    u64 count[3], single_count[3];
-    int cpu;
-    size_t res, nv;
-    int scaled;
-    int i, thread;
+    struct perf_stat *ps = counter->priv;
+    u64 *count = counter->counts->aggr.values;
+    int i;
 
-    count[0] = count[1] = count[2] = 0;
+    if (__perf_evsel__read(counter, evsel_list->cpus->nr,
+                           evsel_list->threads->nr, scale) < 0)
+        return -1;
 
-    nv = scale ? 3 : 1;
-    for (cpu = 0; cpu < nr_cpus; cpu++) {
-        for (thread = 0; thread < thread_num; thread++) {
-            if (fd[cpu][counter][thread] < 0)
-                continue;
+    for (i = 0; i < 3; i++)
+        update_stats(&ps->res_stats[i], count[i]);
 
-            res = read(fd[cpu][counter][thread],
-                    single_count, nv * sizeof(u64));
-            assert(res == nv * sizeof(u64));
+    if (verbose) {
+        fprintf(stderr, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
+            event_name(counter), count[0], count[1], count[2]);
+    }
 
-            close(fd[cpu][counter][thread]);
-            fd[cpu][counter][thread] = -1;
+    /*
+     * Save the full runtime - to allow normalization during printout:
+     */
+    update_shadow_stats(counter, count);
 
-            count[0] += single_count[0];
-            if (scale) {
-                count[1] += single_count[1];
-                count[2] += single_count[2];
-            }
-        }
-    }
+    return 0;
+}
 
-    scaled = 0;
-    if (scale) {
-        if (count[2] == 0) {
-            event_scaled[counter] = -1;
-            count[0] = 0;
-            return;
-        }
+/*
+ * Read out the results of a single counter:
+ * do not aggregate counts across CPUs in system-wide mode
+ */
+static int read_counter(struct perf_evsel *counter)
+{
+    u64 *count;
+    int cpu;
 
-        if (count[2] < count[1]) {
-            event_scaled[counter] = 1;
-            count[0] = (unsigned long long)
-                ((double)count[0] * count[1] / count[2] + 0.5);
-        }
-    }
+    for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
+        if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
+            return -1;
 
-    for (i = 0; i < 3; i++)
-        update_stats(&event_res_stats[counter][i], count[i]);
+        count = counter->counts->cpu[cpu].values;
 
-    if (verbose) {
-        fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter),
-                count[0], count[1], count[2]);
+        update_shadow_stats(counter, count);
     }
 
-    /*
-     * Save the full runtime - to allow normalization during printout:
-     */
-    if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
-        update_stats(&runtime_nsecs_stats, count[0]);
-    if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
-        update_stats(&runtime_cycles_stats, count[0]);
-    if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
-        update_stats(&runtime_branches_stats, count[0]);
+    return 0;
 }
 
 static int run_perf_stat(int argc __used, const char **argv)
 {
     unsigned long long t0, t1;
+    struct perf_evsel *counter;
     int status = 0;
-    int counter, ncreated = 0;
     int child_ready_pipe[2], go_pipe[2];
     const bool forks = (argc > 0);
     char buf;
 
-    if (!system_wide)
-        nr_cpus = 1;
-
     if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
         perror("failed to create pipes");
         exit(1);
@@ -322,7 +430,7 @@ static int run_perf_stat(int argc __used, const char **argv)
     }
 
     if (target_tid == -1 && target_pid == -1 && !system_wide)
-        all_tids[0] = child_pid;
+        evsel_list->threads->map[0] = child_pid;
 
     /*
      * Wait for the child to be ready to exec.
@@ -334,15 +442,35 @@ static int run_perf_stat(int argc __used, const char **argv)
         close(child_ready_pipe[0]);
     }
 
-    for (counter = 0; counter < nr_counters; counter++)
-        ncreated += create_perf_stat_counter(counter);
+    list_for_each_entry(counter, &evsel_list->entries, node) {
+        if (create_perf_stat_counter(counter) < 0) {
+            if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) {
+                if (verbose)
+                    ui__warning("%s event is not supported by the kernel.\n",
+                            event_name(counter));
+                continue;
+            }
+
+            if (errno == EPERM || errno == EACCES) {
+                error("You may not have permission to collect %sstats.\n"
+                      "\t Consider tweaking"
+                      " /proc/sys/kernel/perf_event_paranoid or running as root.",
+                      system_wide ? "system-wide " : "");
+            } else {
+                error("open_counter returned with %d (%s). "
+                      "/bin/dmesg may provide additional information.\n",
+                      errno, strerror(errno));
+            }
+            if (child_pid != -1)
+                kill(child_pid, SIGTERM);
+            die("Not all events could be opened.\n");
+            return -1;
+        }
+    }
 
-    if (ncreated == 0) {
-        pr_err("No permission to collect %sstats.\n"
-               "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n",
-               system_wide ? "system-wide " : "");
-        if (child_pid != -1)
-            kill(child_pid, SIGTERM);
+    if (perf_evlist__set_filters(evsel_list)) {
+        error("failed to set filter with %d (%s)\n", errno,
+            strerror(errno));
         return -1;
     }
 
@@ -362,136 +490,501 @@ static int run_perf_stat(int argc __used, const char **argv)
 
     update_stats(&walltime_nsecs_stats, t1 - t0);
 
-    for (counter = 0; counter < nr_counters; counter++)
-        read_counter(counter);
+    if (no_aggr) {
+        list_for_each_entry(counter, &evsel_list->entries, node) {
+            read_counter(counter);
+            perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1);
+        }
+    } else {
+        list_for_each_entry(counter, &evsel_list->entries, node) {
+            read_counter_aggr(counter);
+            perf_evsel__close_fd(counter, evsel_list->cpus->nr,
+                                 evsel_list->threads->nr);
+        }
+    }
 
     return WEXITSTATUS(status);
 }
 
-static void print_noise(int counter, double avg)
+static void print_noise_pct(double total, double avg)
+{
+    double pct = 0.0;
+
+    if (avg)
+        pct = 100.0*total/avg;
+
+    fprintf(stderr, "  ( +-%6.2f%% )", pct);
+}
+
+static void print_noise(struct perf_evsel *evsel, double avg)
 {
+    struct perf_stat *ps;
+
     if (run_count == 1)
         return;
 
-    fprintf(stderr, "   ( +- %7.3f%% )",
-            100 * stddev_stats(&event_res_stats[counter][0]) / avg);
+    ps = evsel->priv;
+    print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
 }
 
-static void nsec_printout(int counter, double avg)
+static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
 {
     double msecs = avg / 1e6;
+    char cpustr[16] = { '\0', };
+    const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";
 
-    fprintf(stderr, " %18.6f  %-24s", msecs, event_name(counter));
+    if (no_aggr)
+        sprintf(cpustr, "CPU%*d%s",
+            csv_output ? 0 : -4,
+            evsel_list->cpus->map[cpu], csv_sep);
 
-    if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
-        fprintf(stderr, " # %10.3f CPUs ",
-                avg / avg_stats(&walltime_nsecs_stats));
-    }
+    fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
+
+    if (evsel->cgrp)
+        fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
+
+    if (csv_output)
+        return;
+
+    if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
+        fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats));
+}
+
+static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
+{
+    double total, ratio = 0.0;
+    const char *color;
+
+    total = avg_stats(&runtime_cycles_stats[cpu]);
+
+    if (total)
+        ratio = avg / total * 100.0;
+
+    color = PERF_COLOR_NORMAL;
+    if (ratio > 50.0)
+        color = PERF_COLOR_RED;
+    else if (ratio > 30.0)
+        color = PERF_COLOR_MAGENTA;
+    else if (ratio > 10.0)
+        color = PERF_COLOR_YELLOW;
+
+    fprintf(stderr, " #  ");
+    color_fprintf(stderr, color, "%6.2f%%", ratio);
+    fprintf(stderr, " frontend cycles idle ");
+}
+
+static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg)
+{
+    double total, ratio = 0.0;
+    const char *color;
+
+    total = avg_stats(&runtime_cycles_stats[cpu]);
+
+    if (total)
+        ratio = avg / total * 100.0;
+
+    color = PERF_COLOR_NORMAL;
+    if (ratio > 75.0)
+        color = PERF_COLOR_RED;
+    else if (ratio > 50.0)
+        color = PERF_COLOR_MAGENTA;
+    else if (ratio > 20.0)
+        color = PERF_COLOR_YELLOW;
+
+    fprintf(stderr, " #  ");
+    color_fprintf(stderr, color, "%6.2f%%", ratio);
+    fprintf(stderr, " backend cycles idle ");
+}
+
+static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg)
+{
+    double total, ratio = 0.0;
+    const char *color;
+
+    total = avg_stats(&runtime_branches_stats[cpu]);
+
+    if (total)
+        ratio = avg / total * 100.0;
+
+    color = PERF_COLOR_NORMAL;
+    if (ratio > 20.0)
+        color = PERF_COLOR_RED;
+    else if (ratio > 10.0)
+        color = PERF_COLOR_MAGENTA;
+    else if (ratio > 5.0)
+        color = PERF_COLOR_YELLOW;
+
+    fprintf(stderr, " #  ");
+    color_fprintf(stderr, color, "%6.2f%%", ratio);
+    fprintf(stderr, " of all branches ");
+}
+
+static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
+{
+    double total, ratio = 0.0;
+    const char *color;
+
+    total = avg_stats(&runtime_l1_dcache_stats[cpu]);
+
+    if (total)
+        ratio = avg / total * 100.0;
+
+    color = PERF_COLOR_NORMAL;
+    if (ratio > 20.0)
+        color = PERF_COLOR_RED;
+    else if (ratio > 10.0)
+        color = PERF_COLOR_MAGENTA;
+    else if (ratio > 5.0)
+        color = PERF_COLOR_YELLOW;
+
+    fprintf(stderr, " #  ");
+    color_fprintf(stderr, color, "%6.2f%%", ratio);
+    fprintf(stderr, " of all L1-dcache hits ");
 }
 
-static void abs_printout(int counter, double avg)
+static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
 {
     double total, ratio = 0.0;
+    const char *color;
 
-    if (big_num)
-        fprintf(stderr, " %'18.0f  %-24s", avg, event_name(counter));
+    total = avg_stats(&runtime_l1_icache_stats[cpu]);
+
+    if (total)
+        ratio = avg / total * 100.0;
+
+    color = PERF_COLOR_NORMAL;
+    if (ratio > 20.0)
+        color = PERF_COLOR_RED;
+    else if (ratio > 10.0)
+        color = PERF_COLOR_MAGENTA;
+    else if (ratio > 5.0)
+        color = PERF_COLOR_YELLOW;
+
+    fprintf(stderr, " #  ");
+    color_fprintf(stderr, color, "%6.2f%%", ratio);
+    fprintf(stderr, " of all L1-icache hits ");
+}
+
+static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
+{
+    double total, ratio = 0.0;
+    const char *color;
+
+    total = avg_stats(&runtime_dtlb_cache_stats[cpu]);
+
+    if (total)
+        ratio = avg / total * 100.0;
+
+    color = PERF_COLOR_NORMAL;
+    if (ratio > 20.0)
+        color = PERF_COLOR_RED;
+    else if (ratio > 10.0)
+        color = PERF_COLOR_MAGENTA;
+    else if (ratio > 5.0)
+        color = PERF_COLOR_YELLOW;
+
+    fprintf(stderr, " #  ");
+    color_fprintf(stderr, color, "%6.2f%%", ratio);
+    fprintf(stderr, " of all dTLB cache hits ");
+}
+
+static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
+{
+    double total, ratio = 0.0;
+    const char *color;
+
+    total = avg_stats(&runtime_itlb_cache_stats[cpu]);
+
+    if (total)
+        ratio = avg / total * 100.0;
+
+    color = PERF_COLOR_NORMAL;
+    if (ratio > 20.0)
+        color = PERF_COLOR_RED;
+    else if (ratio > 10.0)
+        color = PERF_COLOR_MAGENTA;
+    else if (ratio > 5.0)
+        color = PERF_COLOR_YELLOW;
+
+    fprintf(stderr, " #  ");
+    color_fprintf(stderr, color, "%6.2f%%", ratio);
+    fprintf(stderr, " of all iTLB cache hits ");
+}
+
+static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
+{
+    double total, ratio = 0.0;
+    const char *color;
+
+    total = avg_stats(&runtime_ll_cache_stats[cpu]);
+
+    if (total)
+        ratio = avg / total * 100.0;
+
+    color = PERF_COLOR_NORMAL;
+    if (ratio > 20.0)
+        color = PERF_COLOR_RED;
+    else if (ratio > 10.0)
+        color = PERF_COLOR_MAGENTA;
+    else if (ratio > 5.0)
+        color = PERF_COLOR_YELLOW;
+
+    fprintf(stderr, " #  ");
+    color_fprintf(stderr, color, "%6.2f%%", ratio);
+    fprintf(stderr, " of all LL-cache hits ");
+}
+
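Editorial aside on the seven print_*_misses()/print_stalled_* helpers above: they differ only in which runtime_*_stats array they consult, the three color thresholds, and the trailing label. Nothing in the patch requires it, but the repetition invites a table-driven form; a hypothetical consolidation (struct and function names ours), reusing the file's avg_stats(), color_fprintf() and PERF_COLOR_* helpers:

    struct ratio_printer {
            struct stats *stats;               /* per-CPU stats base array */
            double       red, magenta, yellow; /* percent thresholds       */
            const char   *label;
    };

    static void print_ratio(const struct ratio_printer *p, int cpu, double avg)
    {
            double total = avg_stats(&p->stats[cpu]);
            double ratio = total ? avg / total * 100.0 : 0.0;
            const char *color = PERF_COLOR_NORMAL;

            if (ratio > p->red)
                    color = PERF_COLOR_RED;
            else if (ratio > p->magenta)
                    color = PERF_COLOR_MAGENTA;
            else if (ratio > p->yellow)
                    color = PERF_COLOR_YELLOW;

            fprintf(stderr, " #  ");
            color_fprintf(stderr, color, "%6.2f%%", ratio);
            fprintf(stderr, " %s ", p->label);
    }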
+static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
+{
+    double total, ratio = 0.0;
+    char cpustr[16] = { '\0', };
+    const char *fmt;
+
+    if (csv_output)
+        fmt = "%s%.0f%s%s";
+    else if (big_num)
+        fmt = "%s%'18.0f%s%-25s";
     else
-        fprintf(stderr, " %18.0f  %-24s", avg, event_name(counter));
+        fmt = "%s%18.0f%s%-25s";
 
-    if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
-        total = avg_stats(&runtime_cycles_stats);
+    if (no_aggr)
+        sprintf(cpustr, "CPU%*d%s",
+            csv_output ? 0 : -4,
+            evsel_list->cpus->map[cpu], csv_sep);
+    else
+        cpu = 0;
+
+    fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));
+
+    if (evsel->cgrp)
+        fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
+
+    if (csv_output)
+        return;
+
+    if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
+        total = avg_stats(&runtime_cycles_stats[cpu]);
 
         if (total)
             ratio = avg / total;
 
-        fprintf(stderr, " # %10.3f IPC  ", ratio);
+        fprintf(stderr, " #   %5.2f  insns per cycle ", ratio);
+
+        total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
+        total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
+
+        if (total && avg) {
+            ratio = total / avg;
+            fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio);
+        }
+
-    } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
-            runtime_branches_stats.n != 0) {
-        total = avg_stats(&runtime_branches_stats);
+    } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
+            runtime_branches_stats[cpu].n != 0) {
+        print_branch_misses(cpu, evsel, avg);
+    } else if (
+        evsel->attr.type == PERF_TYPE_HW_CACHE &&
+        evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D |
+                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+            runtime_l1_dcache_stats[cpu].n != 0) {
+        print_l1_dcache_misses(cpu, evsel, avg);
+    } else if (
+        evsel->attr.type == PERF_TYPE_HW_CACHE &&
+        evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I |
+                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+            runtime_l1_icache_stats[cpu].n != 0) {
+        print_l1_icache_misses(cpu, evsel, avg);
+    } else if (
+        evsel->attr.type == PERF_TYPE_HW_CACHE &&
+        evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB |
+                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+            runtime_dtlb_cache_stats[cpu].n != 0) {
+        print_dtlb_cache_misses(cpu, evsel, avg);
+    } else if (
+        evsel->attr.type == PERF_TYPE_HW_CACHE &&
+        evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB |
+                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+            runtime_itlb_cache_stats[cpu].n != 0) {
+        print_itlb_cache_misses(cpu, evsel, avg);
+    } else if (
+        evsel->attr.type == PERF_TYPE_HW_CACHE &&
+        evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL |
+                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+            runtime_ll_cache_stats[cpu].n != 0) {
+        print_ll_cache_misses(cpu, evsel, avg);
+    } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
+            runtime_cacherefs_stats[cpu].n != 0) {
+        total = avg_stats(&runtime_cacherefs_stats[cpu]);
 
         if (total)
             ratio = avg * 100 / total;
 
-        fprintf(stderr, " # %10.3f %%    ", ratio);
+        fprintf(stderr, " # %8.3f %% of all cache refs ", ratio);
+
+    } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
+        print_stalled_cycles_frontend(cpu, evsel, avg);
+    } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
+        print_stalled_cycles_backend(cpu, evsel, avg);
+    } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
+        total = avg_stats(&runtime_nsecs_stats[cpu]);
+
+        if (total)
+            ratio = 1.0 * avg / total;
 
-    } else if (runtime_nsecs_stats.n != 0) {
-        total = avg_stats(&runtime_nsecs_stats);
+        fprintf(stderr, " # %8.3f GHz ", ratio);
+    } else if (runtime_nsecs_stats[cpu].n != 0) {
+        total = avg_stats(&runtime_nsecs_stats[cpu]);
 
         if (total)
             ratio = 1000.0 * avg / total;
 
-        fprintf(stderr, " # %10.3f M/sec", ratio);
+        fprintf(stderr, " # %8.3f M/sec ", ratio);
+    } else {
+        fprintf(stderr, "                                   ");
     }
 }
 
 /*
  * Print out the results of a single counter:
+ * aggregated counts in system-wide mode
  */
-static void print_counter(int counter)
+static void print_counter_aggr(struct perf_evsel *counter)
 {
-    double avg = avg_stats(&event_res_stats[counter][0]);
-    int scaled = event_scaled[counter];
+    struct perf_stat *ps = counter->priv;
+    double avg = avg_stats(&ps->res_stats[0]);
+    int scaled = counter->counts->scaled;
 
     if (scaled == -1) {
-        fprintf(stderr, " %18s  %-24s\n",
-                "<not counted>", event_name(counter));
+        fprintf(stderr, "%*s%s%*s",
+            csv_output ? 0 : 18,
+            "<not counted>",
+            csv_sep,
+            csv_output ? 0 : -24,
+            event_name(counter));
+
+        if (counter->cgrp)
+            fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
+
+        fputc('\n', stderr);
         return;
     }
 
     if (nsec_counter(counter))
-        nsec_printout(counter, avg);
+        nsec_printout(-1, counter, avg);
     else
-        abs_printout(counter, avg);
+        abs_printout(-1, counter, avg);
+
+    if (csv_output) {
+        fputc('\n', stderr);
+        return;
+    }
 
     print_noise(counter, avg);
 
     if (scaled) {
         double avg_enabled, avg_running;
 
-        avg_enabled = avg_stats(&event_res_stats[counter][1]);
-        avg_running = avg_stats(&event_res_stats[counter][2]);
+        avg_enabled = avg_stats(&ps->res_stats[1]);
+        avg_running = avg_stats(&ps->res_stats[2]);
 
-        fprintf(stderr, "  (scaled from %.2f%%)",
-                100 * avg_running / avg_enabled);
+        fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled);
     }
-    fprintf(stderr, "\n");
 }
 
+/*
+ * Print out the results of a single counter:
+ * does not use aggregated count in system-wide
+ */
+static void print_counter(struct perf_evsel *counter)
+{
+    u64 ena, run, val;
+    int cpu;
+
+    for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
+        val = counter->counts->cpu[cpu].val;
+        ena = counter->counts->cpu[cpu].ena;
+        run = counter->counts->cpu[cpu].run;
+        if (run == 0 || ena == 0) {
+            fprintf(stderr, "CPU%*d%s%*s%s%*s",
+                csv_output ? 0 : -4,
+                evsel_list->cpus->map[cpu], csv_sep,
+                csv_output ? 0 : 18,
+                "<not counted>", csv_sep,
+                csv_output ? 0 : -24,
+                event_name(counter));
+
+            if (counter->cgrp)
+                fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
+
+            fputc('\n', stderr);
+            continue;
+        }
+
+        if (nsec_counter(counter))
+            nsec_printout(cpu, counter, val);
+        else
+            abs_printout(cpu, counter, val);
+
+        if (!csv_output) {
+            print_noise(counter, 1.0);
+
+            if (run != ena)
+                fprintf(stderr, "  (%.2f%%)", 100.0 * run / ena);
+        }
+        fputc('\n', stderr);
+    }
+}
+
 static void print_stat(int argc, const char **argv)
 {
-    int i, counter;
+    struct perf_evsel *counter;
+    int i;
 
     fflush(stdout);
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, " Performance counter stats for ");
-    if(target_pid == -1 && target_tid == -1) {
-        fprintf(stderr, "\'%s", argv[0]);
-        for (i = 1; i < argc; i++)
-            fprintf(stderr, " %s", argv[i]);
-    } else if (target_pid != -1)
-        fprintf(stderr, "process id \'%d", target_pid);
-    else
-        fprintf(stderr, "thread id \'%d", target_tid);
-
-    fprintf(stderr, "\'");
-    if (run_count > 1)
-        fprintf(stderr, " (%d runs)", run_count);
-    fprintf(stderr, ":\n\n");
+    if (!csv_output) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, " Performance counter stats for ");
+        if(target_pid == -1 && target_tid == -1) {
+            fprintf(stderr, "\'%s", argv[0]);
+            for (i = 1; i < argc; i++)
+                fprintf(stderr, " %s", argv[i]);
+        } else if (target_pid != -1)
+            fprintf(stderr, "process id \'%d", target_pid);
+        else
+            fprintf(stderr, "thread id \'%d", target_tid);
+
+        fprintf(stderr, "\'");
+        if (run_count > 1)
+            fprintf(stderr, " (%d runs)", run_count);
+        fprintf(stderr, ":\n\n");
+    }
 
-    for (counter = 0; counter < nr_counters; counter++)
-        print_counter(counter);
+    if (no_aggr) {
+        list_for_each_entry(counter, &evsel_list->entries, node)
+            print_counter(counter);
+    } else {
+        list_for_each_entry(counter, &evsel_list->entries, node)
+            print_counter_aggr(counter);
+    }
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, " %18.9f  seconds time elapsed",
-            avg_stats(&walltime_nsecs_stats)/1e9);
-    if (run_count > 1) {
-        fprintf(stderr, "   ( +- %7.3f%% )",
-                100*stddev_stats(&walltime_nsecs_stats) /
-                avg_stats(&walltime_nsecs_stats));
+    if (!csv_output) {
+        if (!null_run)
+            fprintf(stderr, "\n");
+        fprintf(stderr, " %17.9f  seconds time elapsed",
+                avg_stats(&walltime_nsecs_stats)/1e9);
+        if (run_count > 1) {
+            fprintf(stderr, " ");
+            print_noise_pct(stddev_stats(&walltime_nsecs_stats),
+                    avg_stats(&walltime_nsecs_stats));
+        }
+        fprintf(stderr, "\n\n");
     }
-    fprintf(stderr, "\n\n");
 }
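With -x/--field-separator the same data becomes machine-parseable: print_counter_aggr() and print_counter() emit one value<sep>event row per counter, with a leading CPU column under -A, a trailing cgroup column under -G, and the noise/scaling annotations suppressed. An illustrative aggregate run, borrowing the counts from the sample output in the header (the exact rows depend on the event list):

    $ perf stat -x, ./hackbench 10
    1708.761321,task-clock
    41190,context-switches
    6735,CPU-migrations
    17318,page-faults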
 
 static volatile int signr = -1;
 
@@ -521,10 +1014,19 @@ static const char * const stat_usage[] = {
     NULL
 };
 
+static int stat__set_big_num(const struct option *opt __used,
+                 const char *s __used, int unset)
+{
+    big_num_opt = unset ? 0 : 1;
+    return 0;
+}
+
 static const struct option options[] = {
-    OPT_CALLBACK('e', "event", NULL, "event",
+    OPT_CALLBACK('e', "event", &evsel_list, "event",
              "event selector. use 'perf list' to list available events",
              parse_events),
+    OPT_CALLBACK(0, "filter", &evsel_list, "filter",
+             "event filter", parse_filter),
     OPT_BOOLEAN('i', "no-inherit", &no_inherit,
             "child tasks do not inherit counters"),
     OPT_INTEGER('p', "pid", &target_pid,
@@ -541,64 +1043,162 @@ static const struct option options[] = {
             "repeat command and print average + stddev (max: 100)"),
     OPT_BOOLEAN('n', "null", &null_run,
             "null run - dont start any counters"),
-    OPT_BOOLEAN('B', "big-num", &big_num,
-            "print large numbers with thousands\' separators"),
+    OPT_INCR('d', "detailed", &detailed_run,
+         "detailed run - start a lot of events"),
+    OPT_BOOLEAN('S', "sync", &sync_run,
+            "call sync() before starting a run"),
+    OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
+               "print large numbers with thousands\' separators",
+               stat__set_big_num),
     OPT_STRING('C', "cpu", &cpu_list, "cpu",
             "list of cpus to monitor in system-wide"),
+    OPT_BOOLEAN('A', "no-aggr", &no_aggr,
+            "disable CPU count aggregation"),
+    OPT_STRING('x', "field-separator", &csv_sep, "separator",
+           "print counts with custom separator"),
+    OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
+             "monitor event in cgroup name only",
+             parse_cgroups),
     OPT_END()
 };
 
+/*
+ * Add default attributes, if there were no attributes specified or
+ * if -d/--detailed, -d -d or -d -d -d is used:
+ */
+static int add_default_attributes(void)
+{
+    struct perf_evsel *pos;
+    size_t attr_nr = 0;
+    size_t c;
+
+    /* Set attrs if no event is selected and !null_run: */
+    if (null_run)
+        return 0;
+
+    if (!evsel_list->nr_entries) {
+        for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
+            pos = perf_evsel__new(default_attrs + c, c + attr_nr);
+            if (pos == NULL)
+                return -1;
+            perf_evlist__add(evsel_list, pos);
+        }
+        attr_nr += c;
+    }
+
+    /* Detailed events get appended to the event list: */
+
+    if (detailed_run < 1)
+        return 0;
+
+    /* Append detailed run extra attributes: */
+    for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
+        pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
+        if (pos == NULL)
+            return -1;
+        perf_evlist__add(evsel_list, pos);
+    }
+    attr_nr += c;
+
+    if (detailed_run < 2)
+        return 0;
+
+    /* Append very detailed run extra attributes: */
+    for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
+        pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
+        if (pos == NULL)
+            return -1;
+        perf_evlist__add(evsel_list, pos);
+    }
+
+    if (detailed_run < 3)
+        return 0;
+
+    /* Append very, very detailed run extra attributes: */
+    for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
+        pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
+        if (pos == NULL)
+            return -1;
+        perf_evlist__add(evsel_list, pos);
+    }
+
+
+    return 0;
+}
+
 int cmd_stat(int argc, const char **argv, const char *prefix __used)
 {
-    int status;
-    int i,j;
+    struct perf_evsel *pos;
+    int status = -ENOMEM;
 
     setlocale(LC_ALL, "");
 
+    evsel_list = perf_evlist__new(NULL, NULL);
+    if (evsel_list == NULL)
+        return -ENOMEM;
+
     argc = parse_options(argc, argv, options, stat_usage,
         PARSE_OPT_STOP_AT_NON_OPTION);
+
+    if (csv_sep)
+        csv_output = true;
+    else
+        csv_sep = DEFAULT_SEPARATOR;
+
+    /*
+     * let the spreadsheet do the pretty-printing
+     */
+    if (csv_output) {
+        /* User explicitly passed -B? */
+        if (big_num_opt == 1) {
+            fprintf(stderr, "-B option not supported with -x\n");
+            usage_with_options(stat_usage, options);
+        } else /* Nope, so disable big number formatting */
+            big_num = false;
+    } else if (big_num_opt == 0) /* User passed --no-big-num */
+        big_num = false;
+
     if (!argc && target_pid == -1 && target_tid == -1)
         usage_with_options(stat_usage, options);
     if (run_count <= 0)
         usage_with_options(stat_usage, options);
 
-    /* Set attrs and nr_counters if no event is selected and !null_run */
-    if (!null_run && !nr_counters) {
-        memcpy(attrs, default_attrs, sizeof(default_attrs));
-        nr_counters = ARRAY_SIZE(default_attrs);
+    /* no_aggr, cgroup are for system-wide only */
+    if ((no_aggr || nr_cgroups) && !system_wide) {
+        fprintf(stderr, "both cgroup and no-aggregation "
+            "modes only available in system-wide mode\n");
+
+        usage_with_options(stat_usage, options);
     }
 
-    if (system_wide)
-        nr_cpus = read_cpu_map(cpu_list);
-    else
-        nr_cpus = 1;
+    if (add_default_attributes())
+        goto out;
 
-    if (nr_cpus < 1)
+    if (target_pid != -1)
+        target_tid = target_pid;
+
+    evsel_list->threads = thread_map__new(target_pid, target_tid);
+    if (evsel_list->threads == NULL) {
+        pr_err("Problems finding threads of monitor\n");
         usage_with_options(stat_usage, options);
+    }
 
-    if (target_pid != -1) {
-        target_tid = target_pid;
-        thread_num = find_all_tid(target_pid, &all_tids);
-        if (thread_num <= 0) {
-            fprintf(stderr, "Can't find all threads of pid %d\n",
-                    target_pid);
-            usage_with_options(stat_usage, options);
-        }
-    } else {
-        all_tids=malloc(sizeof(pid_t));
-        if (!all_tids)
-            return -ENOMEM;
+    if (system_wide)
+        evsel_list->cpus = cpu_map__new(cpu_list);
+    else
+        evsel_list->cpus = cpu_map__dummy_new();
 
-        all_tids[0] = target_tid;
-        thread_num = 1;
+    if (evsel_list->cpus == NULL) {
+        perror("failed to parse CPUs map");
+        usage_with_options(stat_usage, options);
+        return -1;
     }
 
-    for (i = 0; i < MAX_NR_CPUS; i++) {
-        for (j = 0; j < MAX_COUNTERS; j++) {
-            fd[i][j] = malloc(sizeof(int)*thread_num);
-            if (!fd[i][j])
-                return -ENOMEM;
-        }
+    list_for_each_entry(pos, &evsel_list->entries, node) {
+        if (perf_evsel__alloc_stat_priv(pos) < 0 ||
+            perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 ||
+            perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
+            goto out_free_fd;
     }
 
     /*
@@ -616,11 +1216,20 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 
     for (run_idx = 0; run_idx < run_count; run_idx++) {
         if (run_count != 1 && verbose)
             fprintf(stderr, "[ perf stat: executing run #%d ... ]\n",
                 run_idx + 1);
+
+        if (sync_run)
+            sync();
+
         status = run_perf_stat(argc, argv);
     }
 
     if (status != -1)
         print_stat(argc, argv);
-
+out_free_fd:
+    list_for_each_entry(pos, &evsel_list->entries, node)
+        perf_evsel__free_stat_priv(pos);
+    perf_evlist__delete_maps(evsel_list);
+out:
+    perf_evlist__delete(evsel_list);
     return status;
 }
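Taken together, cmd_stat() composes the new knobs roughly like this: each -d appends one more attribute block from the arrays defined at the top of the file before the counters are created. Hypothetical invocations:

    $ perf stat -d ./hackbench 10         # defaults + L1d/LL data cache events
    $ perf stat -d -d ./hackbench 10      # ... + L1i, dTLB and iTLB events
    $ perf stat -d -d -d ./hackbench 10   # ... + L1d prefetch events

--repeat folds every run into the struct stats accumulators, and -S/--sync calls sync() before each run so buffered I/O from a previous run is less likely to pollute the next measurement.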