diff options
Diffstat (limited to 'tools')
-rw-r--r-- | tools/perf/Documentation/perf-stat.txt | 5 | ||||
-rw-r--r-- | tools/perf/Makefile | 15 | ||||
-rw-r--r-- | tools/perf/bench/mem-memcpy-arch.h | 12 | ||||
-rw-r--r-- | tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 4 | ||||
-rw-r--r-- | tools/perf/bench/mem-memcpy-x86-64-asm.S | 2 | ||||
-rw-r--r-- | tools/perf/bench/mem-memcpy.c | 219 | ||||
-rw-r--r-- | tools/perf/builtin-record.c | 2 | ||||
-rw-r--r-- | tools/perf/builtin-stat.c | 207 | ||||
-rw-r--r-- | tools/perf/builtin-top.c | 6 | ||||
-rw-r--r-- | tools/perf/feature-tests.mak | 4 | ||||
-rw-r--r-- | tools/perf/util/include/asm/cpufeature.h | 9 | ||||
-rw-r--r-- | tools/perf/util/include/asm/dwarf2.h | 11 | ||||
-rw-r--r-- | tools/perf/util/include/linux/linkage.h | 13 | ||||
-rw-r--r-- | tools/perf/util/probe-finder.h | 6 |
14 files changed, 409 insertions, 106 deletions
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 4b3a2d46b43..c405bcad6ac 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -53,6 +53,11 @@ comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2 In per-thread mode, this option is ignored. The -a option is still necessary to activate system-wide monitoring. Default is to count on all CPUs. +-A:: +--no-aggr:: +Do not aggregate counts across all monitored CPUs in system-wide mode (-a). +This option is only valid in system-wide mode. + EXAMPLES -------- diff --git a/tools/perf/Makefile b/tools/perf/Makefile index d1db0f676a4..e0db1978c85 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -185,7 +185,10 @@ ifeq ($(ARCH),i386) ARCH := x86 endif ifeq ($(ARCH),x86_64) + RAW_ARCH := x86_64 ARCH := x86 + ARCH_CFLAGS := -DARCH_X86_64 + ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S endif # CFLAGS and LDFLAGS are for the users to override from the command line. @@ -375,6 +378,7 @@ LIB_H += util/include/linux/prefetch.h LIB_H += util/include/linux/rbtree.h LIB_H += util/include/linux/string.h LIB_H += util/include/linux/types.h +LIB_H += util/include/linux/linkage.h LIB_H += util/include/asm/asm-offsets.h LIB_H += util/include/asm/bug.h LIB_H += util/include/asm/byteorder.h @@ -383,6 +387,8 @@ LIB_H += util/include/asm/swab.h LIB_H += util/include/asm/system.h LIB_H += util/include/asm/uaccess.h LIB_H += util/include/dwarf-regs.h +LIB_H += util/include/asm/dwarf2.h +LIB_H += util/include/asm/cpufeature.h LIB_H += perf.h LIB_H += util/cache.h LIB_H += util/callchain.h @@ -417,6 +423,7 @@ LIB_H += util/probe-finder.h LIB_H += util/probe-event.h LIB_H += util/pstack.h LIB_H += util/cpumap.h +LIB_H += $(ARCH_INCLUDE) LIB_OBJS += $(OUTPUT)util/abspath.o LIB_OBJS += $(OUTPUT)util/alias.o @@ -472,6 +479,9 @@ BUILTIN_OBJS += $(OUTPUT)builtin-bench.o # Benchmark modules BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o +ifeq ($(RAW_ARCH),x86_64) +BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o +endif BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o BUILTIN_OBJS += $(OUTPUT)builtin-diff.o @@ -507,7 +517,7 @@ PERFLIBS = $(LIB_FILE) -include config.mak ifndef NO_DWARF -FLAGS_DWARF=$(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS) +FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS) ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y) msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev); NO_DWARF := 1 @@ -554,7 +564,7 @@ ifndef NO_DWARF ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined) msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled); else - BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT + BASIC_CFLAGS += -DDWARF_SUPPORT EXTLIBS += -lelf -ldw LIB_OBJS += $(OUTPUT)util/probe-finder.o endif # PERF_HAVE_DWARF_REGS @@ -898,6 +908,7 @@ BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \ LIB_OBJS += $(COMPAT_OBJS) ALL_CFLAGS += $(BASIC_CFLAGS) +ALL_CFLAGS += $(ARCH_CFLAGS) ALL_LDFLAGS += $(BASIC_LDFLAGS) export TAR INSTALL DESTDIR SHELL_PATH diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h new file mode 100644 index 00000000000..a72e36cb539 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-arch.h @@ -0,0 +1,12 @@ + +#ifdef ARCH_X86_64 + +#define MEMCPY_FN(fn, name, desc) \ + extern void *fn(void *, const void *, size_t); + +#include "mem-memcpy-x86-64-asm-def.h" + +#undef MEMCPY_FN + +#endif + diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h new file mode 100644 index 00000000000..d588b87696f --- /dev/null +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -0,0 +1,4 @@ + +MEMCPY_FN(__memcpy, + "x86-64-unrolled", + "unrolled memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S new file mode 100644 index 00000000000..a57b66e853c --- /dev/null +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -0,0 +1,2 @@ + +#include "../../../arch/x86/lib/memcpy_64.S" diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index 38dae746514..db82021f4b9 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -12,6 +12,7 @@ #include "../util/parse-options.h" #include "../util/header.h" #include "bench.h" +#include "mem-memcpy-arch.h" #include <stdio.h> #include <stdlib.h> @@ -23,8 +24,10 @@ static const char *length_str = "1MB"; static const char *routine = "default"; -static bool use_clock = false; +static bool use_clock; static int clock_fd; +static bool only_prefault; +static bool no_prefault; static const struct option options[] = { OPT_STRING('l', "length", &length_str, "1MB", @@ -34,19 +37,33 @@ static const struct option options[] = { "Specify routine to copy"), OPT_BOOLEAN('c', "clock", &use_clock, "Use CPU clock for measuring"), + OPT_BOOLEAN('o', "only-prefault", &only_prefault, + "Show only the result with page faults before memcpy()"), + OPT_BOOLEAN('n', "no-prefault", &no_prefault, + "Show only the result without page faults before memcpy()"), OPT_END() }; +typedef void *(*memcpy_t)(void *, const void *, size_t); + struct routine { const char *name; const char *desc; - void * (*fn)(void *dst, const void *src, size_t len); + memcpy_t fn; }; struct routine routines[] = { { "default", "Default memcpy() provided by glibc", memcpy }, +#ifdef ARCH_X86_64 + +#define MEMCPY_FN(fn, name, desc) { name, desc, fn }, +#include "mem-memcpy-x86-64-asm-def.h" +#undef MEMCPY_FN + +#endif + { NULL, NULL, NULL } @@ -89,29 +106,98 @@ static double timeval2double(struct timeval *ts) (double)ts->tv_usec / (double)1000000; } +static void alloc_mem(void **dst, void **src, size_t length) +{ + *dst = zalloc(length); + if (!dst) + die("memory allocation failed - maybe length is too large?\n"); + + *src = zalloc(length); + if (!src) + die("memory allocation failed - maybe length is too large?\n"); +} + +static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault) +{ + u64 clock_start = 0ULL, clock_end = 0ULL; + void *src = NULL, *dst = NULL; + + alloc_mem(&src, &dst, len); + + if (prefault) + fn(dst, src, len); + + clock_start = get_clock(); + fn(dst, src, len); + clock_end = get_clock(); + + free(src); + free(dst); + return clock_end - clock_start; +} + +static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault) +{ + struct timeval tv_start, tv_end, tv_diff; + void *src = NULL, *dst = NULL; + + alloc_mem(&src, &dst, len); + + if (prefault) + fn(dst, src, len); + + BUG_ON(gettimeofday(&tv_start, NULL)); + fn(dst, src, len); + BUG_ON(gettimeofday(&tv_end, NULL)); + + timersub(&tv_end, &tv_start, &tv_diff); + + free(src); + free(dst); + return (double)((double)len / timeval2double(&tv_diff)); +} + +#define pf (no_prefault ? 0 : 1) + +#define print_bps(x) do { \ + if (x < K) \ + printf(" %14lf B/Sec", x); \ + else if (x < K * K) \ + printf(" %14lfd KB/Sec", x / K); \ + else if (x < K * K * K) \ + printf(" %14lf MB/Sec", x / K / K); \ + else \ + printf(" %14lf GB/Sec", x / K / K / K); \ + } while (0) + int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used) { int i; - void *dst, *src; - size_t length; - double bps = 0.0; - struct timeval tv_start, tv_end, tv_diff; - u64 clock_start, clock_end, clock_diff; + size_t len; + double result_bps[2]; + u64 result_clock[2]; - clock_start = clock_end = clock_diff = 0ULL; argc = parse_options(argc, argv, options, bench_mem_memcpy_usage, 0); - tv_diff.tv_sec = 0; - tv_diff.tv_usec = 0; - length = (size_t)perf_atoll((char *)length_str); + if (use_clock) + init_clock(); + + len = (size_t)perf_atoll((char *)length_str); - if ((s64)length <= 0) { + result_clock[0] = result_clock[1] = 0ULL; + result_bps[0] = result_bps[1] = 0.0; + + if ((s64)len <= 0) { fprintf(stderr, "Invalid length:%s\n", length_str); return 1; } + /* same to without specifying either of prefault and no-prefault */ + if (only_prefault && no_prefault) + only_prefault = no_prefault = false; + for (i = 0; routines[i].name; i++) { if (!strcmp(routines[i].name, routine)) break; @@ -126,61 +212,80 @@ int bench_mem_memcpy(int argc, const char **argv, return 1; } - dst = zalloc(length); - if (!dst) - die("memory allocation failed - maybe length is too large?\n"); - - src = zalloc(length); - if (!src) - die("memory allocation failed - maybe length is too large?\n"); - - if (bench_format == BENCH_FORMAT_DEFAULT) { - printf("# Copying %s Bytes from %p to %p ...\n\n", - length_str, src, dst); - } - - if (use_clock) { - init_clock(); - clock_start = get_clock(); - } else { - BUG_ON(gettimeofday(&tv_start, NULL)); - } - - routines[i].fn(dst, src, length); + if (bench_format == BENCH_FORMAT_DEFAULT) + printf("# Copying %s Bytes ...\n\n", length_str); - if (use_clock) { - clock_end = get_clock(); - clock_diff = clock_end - clock_start; + if (!only_prefault && !no_prefault) { + /* show both of results */ + if (use_clock) { + result_clock[0] = + do_memcpy_clock(routines[i].fn, len, false); + result_clock[1] = + do_memcpy_clock(routines[i].fn, len, true); + } else { + result_bps[0] = + do_memcpy_gettimeofday(routines[i].fn, + len, false); + result_bps[1] = + do_memcpy_gettimeofday(routines[i].fn, + len, true); + } } else { - BUG_ON(gettimeofday(&tv_end, NULL)); - timersub(&tv_end, &tv_start, &tv_diff); - bps = (double)((double)length / timeval2double(&tv_diff)); + if (use_clock) { + result_clock[pf] = + do_memcpy_clock(routines[i].fn, + len, only_prefault); + } else { + result_bps[pf] = + do_memcpy_gettimeofday(routines[i].fn, + len, only_prefault); + } } switch (bench_format) { case BENCH_FORMAT_DEFAULT: - if (use_clock) { - printf(" %14lf Clock/Byte\n", - (double)clock_diff / (double)length); - } else { - if (bps < K) - printf(" %14lf B/Sec\n", bps); - else if (bps < K * K) - printf(" %14lfd KB/Sec\n", bps / 1024); - else if (bps < K * K * K) - printf(" %14lf MB/Sec\n", bps / 1024 / 1024); - else { - printf(" %14lf GB/Sec\n", - bps / 1024 / 1024 / 1024); + if (!only_prefault && !no_prefault) { + if (use_clock) { + printf(" %14lf Clock/Byte\n", + (double)result_clock[0] + / (double)len); + printf(" %14lf Clock/Byte (with prefault)\n", + (double)result_clock[1] + / (double)len); + } else { + print_bps(result_bps[0]); + printf("\n"); + print_bps(result_bps[1]); + printf(" (with prefault)\n"); } + } else { + if (use_clock) { + printf(" %14lf Clock/Byte", + (double)result_clock[pf] + / (double)len); + } else + print_bps(result_bps[pf]); + + printf("%s\n", only_prefault ? " (with prefault)" : ""); } break; case BENCH_FORMAT_SIMPLE: - if (use_clock) { - printf("%14lf\n", - (double)clock_diff / (double)length); - } else - printf("%lf\n", bps); + if (!only_prefault && !no_prefault) { + if (use_clock) { + printf("%lf %lf\n", + (double)result_clock[0] / (double)len, + (double)result_clock[1] / (double)len); + } else { + printf("%lf %lf\n", + result_bps[0], result_bps[1]); + } + } else { + if (use_clock) { + printf("%lf\n", (double)result_clock[pf] + / (double)len); + } else + printf("%lf\n", result_bps[pf]); + } break; default: /* reaching this means there's some disaster: */ diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index e2c2de201ee..3d2cb489980 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -326,7 +326,7 @@ try_again: goto try_again; } printf("\n"); - error("perfcounter syscall returned with %d (%s)\n", + error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n", fd[nr_cpu][counter][thread_index], strerror(err)); #if defined(__i386__) || defined(__x86_64__) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a6b4d44f950..970a7f2a083 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -75,6 +75,7 @@ static int run_idx = 0; static int run_count = 1; static bool no_inherit = false; static bool scale = true; +static bool no_aggr = false; static pid_t target_pid = -1; static pid_t target_tid = -1; static pid_t *all_tids = NULL; @@ -89,6 +90,12 @@ static int *fd[MAX_NR_CPUS][MAX_COUNTERS]; static int event_scaled[MAX_COUNTERS]; +static struct { + u64 val; + u64 ena; + u64 run; +} cpu_counts[MAX_NR_CPUS][MAX_COUNTERS]; + static volatile int done = 0; struct stats @@ -136,19 +143,19 @@ static double stddev_stats(struct stats *stats) } struct stats event_res_stats[MAX_COUNTERS][3]; -struct stats runtime_nsecs_stats; +struct stats runtime_nsecs_stats[MAX_NR_CPUS]; +struct stats runtime_cycles_stats[MAX_NR_CPUS]; +struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; -struct stats runtime_cycles_stats; -struct stats runtime_branches_stats; #define MATCH_EVENT(t, c, counter) \ (attrs[counter].type == PERF_TYPE_##t && \ attrs[counter].config == PERF_COUNT_##c) #define ERR_PERF_OPEN \ -"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n" +"counter %d, sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information." -static int create_perf_stat_counter(int counter) +static int create_perf_stat_counter(int counter, bool *perm_err) { struct perf_event_attr *attr = attrs + counter; int thread; @@ -164,11 +171,14 @@ static int create_perf_stat_counter(int counter) for (cpu = 0; cpu < nr_cpus; cpu++) { fd[cpu][counter][0] = sys_perf_event_open(attr, -1, cpumap[cpu], -1, 0); - if (fd[cpu][counter][0] < 0) - pr_debug(ERR_PERF_OPEN, counter, + if (fd[cpu][counter][0] < 0) { + if (errno == EPERM || errno == EACCES) + *perm_err = true; + error(ERR_PERF_OPEN, counter, fd[cpu][counter][0], strerror(errno)); - else + } else { ++ncreated; + } } } else { attr->inherit = !no_inherit; @@ -179,12 +189,15 @@ static int create_perf_stat_counter(int counter) for (thread = 0; thread < thread_num; thread++) { fd[0][counter][thread] = sys_perf_event_open(attr, all_tids[thread], -1, -1, 0); - if (fd[0][counter][thread] < 0) - pr_debug(ERR_PERF_OPEN, counter, + if (fd[0][counter][thread] < 0) { + if (errno == EPERM || errno == EACCES) + *perm_err = true; + error(ERR_PERF_OPEN, counter, fd[0][counter][thread], strerror(errno)); - else + } else { ++ncreated; + } } } @@ -205,8 +218,9 @@ static inline int nsec_counter(int counter) /* * Read out the results of a single counter: + * aggregate counts across CPUs in system-wide mode */ -static void read_counter(int counter) +static void read_counter_aggr(int counter) { u64 count[3], single_count[3]; int cpu; @@ -264,11 +278,58 @@ static void read_counter(int counter) * Save the full runtime - to allow normalization during printout: */ if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) - update_stats(&runtime_nsecs_stats, count[0]); + update_stats(&runtime_nsecs_stats[0], count[0]); if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) - update_stats(&runtime_cycles_stats, count[0]); + update_stats(&runtime_cycles_stats[0], count[0]); if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) - update_stats(&runtime_branches_stats, count[0]); + update_stats(&runtime_branches_stats[0], count[0]); +} + +/* + * Read out the results of a single counter: + * do not aggregate counts across CPUs in system-wide mode + */ +static void read_counter(int counter) +{ + u64 count[3]; + int cpu; + size_t res, nv; + + count[0] = count[1] = count[2] = 0; + + nv = scale ? 3 : 1; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + + if (fd[cpu][counter][0] < 0) + continue; + + res = read(fd[cpu][counter][0], count, nv * sizeof(u64)); + + assert(res == nv * sizeof(u64)); + + close(fd[cpu][counter][0]); + fd[cpu][counter][0] = -1; + + if (scale) { + if (count[2] == 0) { + count[0] = 0; + } else if (count[2] < count[1]) { + count[0] = (unsigned long long) + ((double)count[0] * count[1] / count[2] + 0.5); + } + } + cpu_counts[cpu][counter].val = count[0]; /* scaled count */ + cpu_counts[cpu][counter].ena = count[1]; + cpu_counts[cpu][counter].run = count[2]; + + if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) + update_stats(&runtime_nsecs_stats[cpu], count[0]); + if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) + update_stats(&runtime_cycles_stats[cpu], count[0]); + if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) + update_stats(&runtime_branches_stats[cpu], count[0]); + } } static int run_perf_stat(int argc __used, const char **argv) @@ -277,6 +338,7 @@ static int run_perf_stat(int argc __used, const char **argv) int status = 0; int counter, ncreated = 0; int child_ready_pipe[2], go_pipe[2]; + bool perm_err = false; const bool forks = (argc > 0); char buf; @@ -335,12 +397,15 @@ static int run_perf_stat(int argc __used, const char **argv) } for (counter = 0; counter < nr_counters; counter++) - ncreated += create_perf_stat_counter(counter); - - if (ncreated == 0) { - pr_err("No permission to collect %sstats.\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n", - system_wide ? "system-wide " : ""); + ncreated += create_perf_stat_counter(counter, &perm_err); + + if (ncreated < nr_counters) { + if (perm_err) + error("You may not have permission to collect %sstats.\n" + "\t Consider tweaking" + " /proc/sys/kernel/perf_event_paranoid or running as root.", + system_wide ? "system-wide " : ""); + die("Not all events could be opened.\n"); if (child_pid != -1) kill(child_pid, SIGTERM); return -1; @@ -362,9 +427,13 @@ static int run_perf_stat(int argc __used, const char **argv) update_stats(&walltime_nsecs_stats, t1 - t0); - for (counter = 0; counter < nr_counters; counter++) - read_counter(counter); - + if (no_aggr) { + for (counter = 0; counter < nr_counters; counter++) + read_counter(counter); + } else { + for (counter = 0; counter < nr_counters; counter++) + read_counter_aggr(counter); + } return WEXITSTATUS(status); } @@ -377,11 +446,15 @@ static void print_noise(int counter, double avg) 100 * stddev_stats(&event_res_stats[counter][0]) / avg); } -static void nsec_printout(int counter, double avg) +static void nsec_printout(int cpu, int counter, double avg) { double msecs = avg / 1e6; - fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter)); + if (no_aggr) + fprintf(stderr, "CPU%-4d %18.6f %-24s", + cpumap[cpu], msecs, event_name(counter)); + else + fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter)); if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { fprintf(stderr, " # %10.3f CPUs ", @@ -389,33 +462,41 @@ static void nsec_printout(int counter, double avg) } } -static void abs_printout(int counter, double avg) +static void abs_printout(int cpu, int counter, double avg) { double total, ratio = 0.0; + char cpustr[16] = { '\0', }; + + if (no_aggr) + sprintf(cpustr, "CPU%-4d", cpumap[cpu]); + else + cpu = 0; if (big_num) - fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter)); + fprintf(stderr, "%s %'18.0f %-24s", + cpustr, avg, event_name(counter)); else - fprintf(stderr, " %18.0f %-24s", avg, event_name(counter)); + fprintf(stderr, "%s %18.0f %-24s", + cpustr, avg, event_name(counter)); if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) { - total = avg_stats(&runtime_cycles_stats); + total = avg_stats(&runtime_cycles_stats[cpu]); if (total) ratio = avg / total; fprintf(stderr, " # %10.3f IPC ", ratio); } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) && - runtime_branches_stats.n != 0) { - total = avg_stats(&runtime_branches_stats); + runtime_branches_stats[cpu].n != 0) { + total = avg_stats(&runtime_branches_stats[cpu]); if (total) ratio = avg * 100 / total; fprintf(stderr, " # %10.3f %% ", ratio); - } else if (runtime_nsecs_stats.n != 0) { - total = avg_stats(&runtime_nsecs_stats); + } else if (runtime_nsecs_stats[cpu].n != 0) { + total = avg_stats(&runtime_nsecs_stats[cpu]); if (total) ratio = 1000.0 * avg / total; @@ -426,8 +507,9 @@ static void abs_printout(int counter, double avg) /* * Print out the results of a single counter: + * aggregated counts in system-wide mode */ -static void print_counter(int counter) +static void print_counter_aggr(int counter) { double avg = avg_stats(&event_res_stats[counter][0]); int scaled = event_scaled[counter]; @@ -439,9 +521,9 @@ static void print_counter(int counter) } if (nsec_counter(counter)) - nsec_printout(counter, avg); + nsec_printout(-1, counter, avg); else - abs_printout(counter, avg); + abs_printout(-1, counter, avg); print_noise(counter, avg); @@ -458,6 +540,42 @@ static void print_counter(int counter) fprintf(stderr, "\n"); } +/* + * Print out the results of a single counter: + * does not use aggregated count in system-wide + */ +static void print_counter(int counter) +{ + u64 ena, run, val; + int cpu; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + val = cpu_counts[cpu][counter].val; + ena = cpu_counts[cpu][counter].ena; + run = cpu_counts[cpu][counter].run; + if (run == 0 || ena == 0) { + fprintf(stderr, "CPU%-4d %18s %-24s", cpumap[cpu], + "<not counted>", event_name(counter)); + + fprintf(stderr, "\n"); + continue; + } + + if (nsec_counter(counter)) + nsec_printout(cpu, counter, val); + else + abs_printout(cpu, counter, val); + + print_noise(counter, 1.0); + + if (run != ena) { + fprintf(stderr, " (scaled from %.2f%%)", + 100.0 * run / ena); + } + fprintf(stderr, "\n"); + } +} + static void print_stat(int argc, const char **argv) { int i, counter; @@ -480,8 +598,13 @@ static void print_stat(int argc, const char **argv) fprintf(stderr, " (%d runs)", run_count); fprintf(stderr, ":\n\n"); - for (counter = 0; counter < nr_counters; counter++) - print_counter(counter); + if (no_aggr) { + for (counter = 0; counter < nr_counters; counter++) + print_counter(counter); + } else { + for (counter = 0; counter < nr_counters; counter++) + print_counter_aggr(counter); + } fprintf(stderr, "\n"); fprintf(stderr, " %18.9f seconds time elapsed", @@ -545,6 +668,8 @@ static const struct option options[] = { "print large numbers with thousands\' separators"), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to monitor in system-wide"), + OPT_BOOLEAN('A', "no-aggr", &no_aggr, + "disable CPU count aggregation"), OPT_END() }; @@ -562,6 +687,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) if (run_count <= 0) usage_with_options(stat_usage, options); + /* no_aggr is for system-wide only */ + if (no_aggr && !system_wide) + usage_with_options(stat_usage, options); + /* Set attrs and nr_counters if no event is selected and !null_run */ if (!null_run && !nr_counters) { memcpy(attrs, default_attrs, sizeof(default_attrs)); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index dd625808c2a..3d2b47d5121 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1214,7 +1214,9 @@ try_again: int err = errno; if (err == EPERM || err == EACCES) - die("No permission - are you root?\n"); + die("Permission error - are you root?\n" + "\t Consider tweaking" + " /proc/sys/kernel/perf_event_paranoid.\n"); /* * If it's cycles then fall back to hrtimer * based cpu-clock-tick sw counter, which @@ -1231,7 +1233,7 @@ try_again: goto try_again; } printf("\n"); - error("perfcounter syscall returned with %d (%s)\n", + error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n", fd[i][counter][thread_index], strerror(err)); die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak index b253db634f0..b041ca67a2c 100644 --- a/tools/perf/feature-tests.mak +++ b/tools/perf/feature-tests.mak @@ -9,8 +9,8 @@ endef ifndef NO_DWARF define SOURCE_DWARF #include <dwarf.h> -#include <libdw.h> -#include <version.h> +#include <elfutils/libdw.h> +#include <elfutils/version.h> #ifndef _ELFUTILS_PREREQ #error #endif diff --git a/tools/perf/util/include/asm/cpufeature.h b/tools/perf/util/include/asm/cpufeature.h new file mode 100644 index 00000000000..acffd5e4d1d --- /dev/null +++ b/tools/perf/util/include/asm/cpufeature.h @@ -0,0 +1,9 @@ + +#ifndef PERF_CPUFEATURE_H +#define PERF_CPUFEATURE_H + +/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */ + +#define X86_FEATURE_REP_GOOD 0 + +#endif /* PERF_CPUFEATURE_H */ diff --git a/tools/perf/util/include/asm/dwarf2.h b/tools/perf/util/include/asm/dwarf2.h new file mode 100644 index 00000000000..bb4198e7837 --- /dev/null +++ b/tools/perf/util/include/asm/dwarf2.h @@ -0,0 +1,11 @@ + +#ifndef PERF_DWARF2_H +#define PERF_DWARF2_H + +/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */ + +#define CFI_STARTPROC +#define CFI_ENDPROC + +#endif /* PERF_DWARF2_H */ + diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h new file mode 100644 index 00000000000..06387cffe12 --- /dev/null +++ b/tools/perf/util/include/linux/linkage.h @@ -0,0 +1,13 @@ + +#ifndef PERF_LINUX_LINKAGE_H_ +#define PERF_LINUX_LINKAGE_H_ + +/* linkage.h ... for including arch/x86/lib/memcpy_64.S */ + +#define ENTRY(name) \ + .globl name; \ + name: + +#define ENDPROC(name) + +#endif /* PERF_LINUX_LINKAGE_H_ */ diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index bba69d45569..beaefc3c122 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -34,9 +34,9 @@ extern int find_available_vars_at(int fd, struct perf_probe_event *pev, bool externs); #include <dwarf.h> -#include <libdw.h> -#include <libdwfl.h> -#include <version.h> +#include <elfutils/libdw.h> +#include <elfutils/libdwfl.h> +#include <elfutils/version.h> struct probe_finder { struct perf_probe_event *pev; /* Target probe event */ |