summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rw-r--r--tools/perf/Documentation/perf-stat.txt5
-rw-r--r--tools/perf/Makefile15
-rw-r--r--tools/perf/bench/mem-memcpy-arch.h12
-rw-r--r--tools/perf/bench/mem-memcpy-x86-64-asm-def.h4
-rw-r--r--tools/perf/bench/mem-memcpy-x86-64-asm.S2
-rw-r--r--tools/perf/bench/mem-memcpy.c219
-rw-r--r--tools/perf/builtin-record.c2
-rw-r--r--tools/perf/builtin-stat.c207
-rw-r--r--tools/perf/builtin-top.c6
-rw-r--r--tools/perf/feature-tests.mak4
-rw-r--r--tools/perf/util/include/asm/cpufeature.h9
-rw-r--r--tools/perf/util/include/asm/dwarf2.h11
-rw-r--r--tools/perf/util/include/linux/linkage.h13
-rw-r--r--tools/perf/util/probe-finder.h6
14 files changed, 409 insertions, 106 deletions
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 4b3a2d46b43..c405bcad6ac 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -53,6 +53,11 @@ comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2
In per-thread mode, this option is ignored. The -a option is still necessary
to activate system-wide monitoring. Default is to count on all CPUs.
+-A::
+--no-aggr::
+Do not aggregate counts across all monitored CPUs in system-wide mode (-a).
+This option is only valid in system-wide mode.
+
EXAMPLES
--------
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index d1db0f676a4..e0db1978c85 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -185,7 +185,10 @@ ifeq ($(ARCH),i386)
ARCH := x86
endif
ifeq ($(ARCH),x86_64)
+ RAW_ARCH := x86_64
ARCH := x86
+ ARCH_CFLAGS := -DARCH_X86_64
+ ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
endif
# CFLAGS and LDFLAGS are for the users to override from the command line.
@@ -375,6 +378,7 @@ LIB_H += util/include/linux/prefetch.h
LIB_H += util/include/linux/rbtree.h
LIB_H += util/include/linux/string.h
LIB_H += util/include/linux/types.h
+LIB_H += util/include/linux/linkage.h
LIB_H += util/include/asm/asm-offsets.h
LIB_H += util/include/asm/bug.h
LIB_H += util/include/asm/byteorder.h
@@ -383,6 +387,8 @@ LIB_H += util/include/asm/swab.h
LIB_H += util/include/asm/system.h
LIB_H += util/include/asm/uaccess.h
LIB_H += util/include/dwarf-regs.h
+LIB_H += util/include/asm/dwarf2.h
+LIB_H += util/include/asm/cpufeature.h
LIB_H += perf.h
LIB_H += util/cache.h
LIB_H += util/callchain.h
@@ -417,6 +423,7 @@ LIB_H += util/probe-finder.h
LIB_H += util/probe-event.h
LIB_H += util/pstack.h
LIB_H += util/cpumap.h
+LIB_H += $(ARCH_INCLUDE)
LIB_OBJS += $(OUTPUT)util/abspath.o
LIB_OBJS += $(OUTPUT)util/alias.o
@@ -472,6 +479,9 @@ BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
# Benchmark modules
BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+endif
BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
@@ -507,7 +517,7 @@ PERFLIBS = $(LIB_FILE)
-include config.mak
ifndef NO_DWARF
-FLAGS_DWARF=$(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
+FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y)
msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
NO_DWARF := 1
@@ -554,7 +564,7 @@ ifndef NO_DWARF
ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
else
- BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT
+ BASIC_CFLAGS += -DDWARF_SUPPORT
EXTLIBS += -lelf -ldw
LIB_OBJS += $(OUTPUT)util/probe-finder.o
endif # PERF_HAVE_DWARF_REGS
@@ -898,6 +908,7 @@ BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
LIB_OBJS += $(COMPAT_OBJS)
ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_CFLAGS += $(ARCH_CFLAGS)
ALL_LDFLAGS += $(BASIC_LDFLAGS)
export TAR INSTALL DESTDIR SHELL_PATH
diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h
new file mode 100644
index 00000000000..a72e36cb539
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-arch.h
@@ -0,0 +1,12 @@
+
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) \
+ extern void *fn(void *, const void *, size_t);
+
+#include "mem-memcpy-x86-64-asm-def.h"
+
+#undef MEMCPY_FN
+
+#endif
+
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
new file mode 100644
index 00000000000..d588b87696f
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -0,0 +1,4 @@
+
+MEMCPY_FN(__memcpy,
+ "x86-64-unrolled",
+ "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
new file mode 100644
index 00000000000..a57b66e853c
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -0,0 +1,2 @@
+
+#include "../../../arch/x86/lib/memcpy_64.S"
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 38dae746514..db82021f4b9 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -12,6 +12,7 @@
#include "../util/parse-options.h"
#include "../util/header.h"
#include "bench.h"
+#include "mem-memcpy-arch.h"
#include <stdio.h>
#include <stdlib.h>
@@ -23,8 +24,10 @@
static const char *length_str = "1MB";
static const char *routine = "default";
-static bool use_clock = false;
+static bool use_clock;
static int clock_fd;
+static bool only_prefault;
+static bool no_prefault;
static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
@@ -34,19 +37,33 @@ static const struct option options[] = {
"Specify routine to copy"),
OPT_BOOLEAN('c', "clock", &use_clock,
"Use CPU clock for measuring"),
+ OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+ "Show only the result with page faults before memcpy()"),
+ OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+ "Show only the result without page faults before memcpy()"),
OPT_END()
};
+typedef void *(*memcpy_t)(void *, const void *, size_t);
+
struct routine {
const char *name;
const char *desc;
- void * (*fn)(void *dst, const void *src, size_t len);
+ memcpy_t fn;
};
struct routine routines[] = {
{ "default",
"Default memcpy() provided by glibc",
memcpy },
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
+#include "mem-memcpy-x86-64-asm-def.h"
+#undef MEMCPY_FN
+
+#endif
+
{ NULL,
NULL,
NULL }
@@ -89,29 +106,98 @@ static double timeval2double(struct timeval *ts)
(double)ts->tv_usec / (double)1000000;
}
+static void alloc_mem(void **dst, void **src, size_t length)
+{
+ *dst = zalloc(length);
+ if (!dst)
+ die("memory allocation failed - maybe length is too large?\n");
+
+ *src = zalloc(length);
+ if (!src)
+ die("memory allocation failed - maybe length is too large?\n");
+}
+
+static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
+{
+ u64 clock_start = 0ULL, clock_end = 0ULL;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ clock_start = get_clock();
+ fn(dst, src, len);
+ clock_end = get_clock();
+
+ free(src);
+ free(dst);
+ return clock_end - clock_start;
+}
+
+static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
+{
+ struct timeval tv_start, tv_end, tv_diff;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ BUG_ON(gettimeofday(&tv_start, NULL));
+ fn(dst, src, len);
+ BUG_ON(gettimeofday(&tv_end, NULL));
+
+ timersub(&tv_end, &tv_start, &tv_diff);
+
+ free(src);
+ free(dst);
+ return (double)((double)len / timeval2double(&tv_diff));
+}
+
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do { \
+ if (x < K) \
+ printf(" %14lf B/Sec", x); \
+ else if (x < K * K) \
+ printf(" %14lfd KB/Sec", x / K); \
+ else if (x < K * K * K) \
+ printf(" %14lf MB/Sec", x / K / K); \
+ else \
+ printf(" %14lf GB/Sec", x / K / K / K); \
+ } while (0)
+
int bench_mem_memcpy(int argc, const char **argv,
const char *prefix __used)
{
int i;
- void *dst, *src;
- size_t length;
- double bps = 0.0;
- struct timeval tv_start, tv_end, tv_diff;
- u64 clock_start, clock_end, clock_diff;
+ size_t len;
+ double result_bps[2];
+ u64 result_clock[2];
- clock_start = clock_end = clock_diff = 0ULL;
argc = parse_options(argc, argv, options,
bench_mem_memcpy_usage, 0);
- tv_diff.tv_sec = 0;
- tv_diff.tv_usec = 0;
- length = (size_t)perf_atoll((char *)length_str);
+ if (use_clock)
+ init_clock();
+
+ len = (size_t)perf_atoll((char *)length_str);
- if ((s64)length <= 0) {
+ result_clock[0] = result_clock[1] = 0ULL;
+ result_bps[0] = result_bps[1] = 0.0;
+
+ if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}
+ /* same to without specifying either of prefault and no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
@@ -126,61 +212,80 @@ int bench_mem_memcpy(int argc, const char **argv,
return 1;
}
- dst = zalloc(length);
- if (!dst)
- die("memory allocation failed - maybe length is too large?\n");
-
- src = zalloc(length);
- if (!src)
- die("memory allocation failed - maybe length is too large?\n");
-
- if (bench_format == BENCH_FORMAT_DEFAULT) {
- printf("# Copying %s Bytes from %p to %p ...\n\n",
- length_str, src, dst);
- }
-
- if (use_clock) {
- init_clock();
- clock_start = get_clock();
- } else {
- BUG_ON(gettimeofday(&tv_start, NULL));
- }
-
- routines[i].fn(dst, src, length);
+ if (bench_format == BENCH_FORMAT_DEFAULT)
+ printf("# Copying %s Bytes ...\n\n", length_str);
- if (use_clock) {
- clock_end = get_clock();
- clock_diff = clock_end - clock_start;
+ if (!only_prefault && !no_prefault) {
+ /* show both of results */
+ if (use_clock) {
+ result_clock[0] =
+ do_memcpy_clock(routines[i].fn, len, false);
+ result_clock[1] =
+ do_memcpy_clock(routines[i].fn, len, true);
+ } else {
+ result_bps[0] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, false);
+ result_bps[1] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, true);
+ }
} else {
- BUG_ON(gettimeofday(&tv_end, NULL));
- timersub(&tv_end, &tv_start, &tv_diff);
- bps = (double)((double)length / timeval2double(&tv_diff));
+ if (use_clock) {
+ result_clock[pf] =
+ do_memcpy_clock(routines[i].fn,
+ len, only_prefault);
+ } else {
+ result_bps[pf] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, only_prefault);
+ }
}
switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
- if (use_clock) {
- printf(" %14lf Clock/Byte\n",
- (double)clock_diff / (double)length);
- } else {
- if (bps < K)
- printf(" %14lf B/Sec\n", bps);
- else if (bps < K * K)
- printf(" %14lfd KB/Sec\n", bps / 1024);
- else if (bps < K * K * K)
- printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
- else {
- printf(" %14lf GB/Sec\n",
- bps / 1024 / 1024 / 1024);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte\n",
+ (double)result_clock[0]
+ / (double)len);
+ printf(" %14lf Clock/Byte (with prefault)\n",
+ (double)result_clock[1]
+ / (double)len);
+ } else {
+ print_bps(result_bps[0]);
+ printf("\n");
+ print_bps(result_bps[1]);
+ printf(" (with prefault)\n");
}
+ } else {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte",
+ (double)result_clock[pf]
+ / (double)len);
+ } else
+ print_bps(result_bps[pf]);
+
+ printf("%s\n", only_prefault ? " (with prefault)" : "");
}
break;
case BENCH_FORMAT_SIMPLE:
- if (use_clock) {
- printf("%14lf\n",
- (double)clock_diff / (double)length);
- } else
- printf("%lf\n", bps);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf("%lf %lf\n",
+ (double)result_clock[0] / (double)len,
+ (double)result_clock[1] / (double)len);
+ } else {
+ printf("%lf %lf\n",
+ result_bps[0], result_bps[1]);
+ }
+ } else {
+ if (use_clock) {
+ printf("%lf\n", (double)result_clock[pf]
+ / (double)len);
+ } else
+ printf("%lf\n", result_bps[pf]);
+ }
break;
default:
/* reaching this means there's some disaster: */
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e2c2de201ee..3d2cb489980 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -326,7 +326,7 @@ try_again:
goto try_again;
}
printf("\n");
- error("perfcounter syscall returned with %d (%s)\n",
+ error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
fd[nr_cpu][counter][thread_index], strerror(err));
#if defined(__i386__) || defined(__x86_64__)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a6b4d44f950..970a7f2a083 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -75,6 +75,7 @@ static int run_idx = 0;
static int run_count = 1;
static bool no_inherit = false;
static bool scale = true;
+static bool no_aggr = false;
static pid_t target_pid = -1;
static pid_t target_tid = -1;
static pid_t *all_tids = NULL;
@@ -89,6 +90,12 @@ static int *fd[MAX_NR_CPUS][MAX_COUNTERS];
static int event_scaled[MAX_COUNTERS];
+static struct {
+ u64 val;
+ u64 ena;
+ u64 run;
+} cpu_counts[MAX_NR_CPUS][MAX_COUNTERS];
+
static volatile int done = 0;
struct stats
@@ -136,19 +143,19 @@ static double stddev_stats(struct stats *stats)
}
struct stats event_res_stats[MAX_COUNTERS][3];
-struct stats runtime_nsecs_stats;
+struct stats runtime_nsecs_stats[MAX_NR_CPUS];
+struct stats runtime_cycles_stats[MAX_NR_CPUS];
+struct stats runtime_branches_stats[MAX_NR_CPUS];
struct stats walltime_nsecs_stats;
-struct stats runtime_cycles_stats;
-struct stats runtime_branches_stats;
#define MATCH_EVENT(t, c, counter) \
(attrs[counter].type == PERF_TYPE_##t && \
attrs[counter].config == PERF_COUNT_##c)
#define ERR_PERF_OPEN \
-"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
+"counter %d, sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information."
-static int create_perf_stat_counter(int counter)
+static int create_perf_stat_counter(int counter, bool *perm_err)
{
struct perf_event_attr *attr = attrs + counter;
int thread;
@@ -164,11 +171,14 @@ static int create_perf_stat_counter(int counter)
for (cpu = 0; cpu < nr_cpus; cpu++) {
fd[cpu][counter][0] = sys_perf_event_open(attr,
-1, cpumap[cpu], -1, 0);
- if (fd[cpu][counter][0] < 0)
- pr_debug(ERR_PERF_OPEN, counter,
+ if (fd[cpu][counter][0] < 0) {
+ if (errno == EPERM || errno == EACCES)
+ *perm_err = true;
+ error(ERR_PERF_OPEN, counter,
fd[cpu][counter][0], strerror(errno));
- else
+ } else {
++ncreated;
+ }
}
} else {
attr->inherit = !no_inherit;
@@ -179,12 +189,15 @@ static int create_perf_stat_counter(int counter)
for (thread = 0; thread < thread_num; thread++) {
fd[0][counter][thread] = sys_perf_event_open(attr,
all_tids[thread], -1, -1, 0);
- if (fd[0][counter][thread] < 0)
- pr_debug(ERR_PERF_OPEN, counter,
+ if (fd[0][counter][thread] < 0) {
+ if (errno == EPERM || errno == EACCES)
+ *perm_err = true;
+ error(ERR_PERF_OPEN, counter,
fd[0][counter][thread],
strerror(errno));
- else
+ } else {
++ncreated;
+ }
}
}
@@ -205,8 +218,9 @@ static inline int nsec_counter(int counter)
/*
* Read out the results of a single counter:
+ * aggregate counts across CPUs in system-wide mode
*/
-static void read_counter(int counter)
+static void read_counter_aggr(int counter)
{
u64 count[3], single_count[3];
int cpu;
@@ -264,11 +278,58 @@ static void read_counter(int counter)
* Save the full runtime - to allow normalization during printout:
*/
if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
- update_stats(&runtime_nsecs_stats, count[0]);
+ update_stats(&runtime_nsecs_stats[0], count[0]);
if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
- update_stats(&runtime_cycles_stats, count[0]);
+ update_stats(&runtime_cycles_stats[0], count[0]);
if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
- update_stats(&runtime_branches_stats, count[0]);
+ update_stats(&runtime_branches_stats[0], count[0]);
+}
+
+/*
+ * Read out the results of a single counter:
+ * do not aggregate counts across CPUs in system-wide mode
+ */
+static void read_counter(int counter)
+{
+ u64 count[3];
+ int cpu;
+ size_t res, nv;
+
+ count[0] = count[1] = count[2] = 0;
+
+ nv = scale ? 3 : 1;
+
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+
+ if (fd[cpu][counter][0] < 0)
+ continue;
+
+ res = read(fd[cpu][counter][0], count, nv * sizeof(u64));
+
+ assert(res == nv * sizeof(u64));
+
+ close(fd[cpu][counter][0]);
+ fd[cpu][counter][0] = -1;
+
+ if (scale) {
+ if (count[2] == 0) {
+ count[0] = 0;
+ } else if (count[2] < count[1]) {
+ count[0] = (unsigned long long)
+ ((double)count[0] * count[1] / count[2] + 0.5);
+ }
+ }
+ cpu_counts[cpu][counter].val = count[0]; /* scaled count */
+ cpu_counts[cpu][counter].ena = count[1];
+ cpu_counts[cpu][counter].run = count[2];
+
+ if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
+ update_stats(&runtime_nsecs_stats[cpu], count[0]);
+ if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
+ update_stats(&runtime_cycles_stats[cpu], count[0]);
+ if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
+ update_stats(&runtime_branches_stats[cpu], count[0]);
+ }
}
static int run_perf_stat(int argc __used, const char **argv)
@@ -277,6 +338,7 @@ static int run_perf_stat(int argc __used, const char **argv)
int status = 0;
int counter, ncreated = 0;
int child_ready_pipe[2], go_pipe[2];
+ bool perm_err = false;
const bool forks = (argc > 0);
char buf;
@@ -335,12 +397,15 @@ static int run_perf_stat(int argc __used, const char **argv)
}
for (counter = 0; counter < nr_counters; counter++)
- ncreated += create_perf_stat_counter(counter);
-
- if (ncreated == 0) {
- pr_err("No permission to collect %sstats.\n"
- "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n",
- system_wide ? "system-wide " : "");
+ ncreated += create_perf_stat_counter(counter, &perm_err);
+
+ if (ncreated < nr_counters) {
+ if (perm_err)
+ error("You may not have permission to collect %sstats.\n"
+ "\t Consider tweaking"
+ " /proc/sys/kernel/perf_event_paranoid or running as root.",
+ system_wide ? "system-wide " : "");
+ die("Not all events could be opened.\n");
if (child_pid != -1)
kill(child_pid, SIGTERM);
return -1;
@@ -362,9 +427,13 @@ static int run_perf_stat(int argc __used, const char **argv)
update_stats(&walltime_nsecs_stats, t1 - t0);
- for (counter = 0; counter < nr_counters; counter++)
- read_counter(counter);
-
+ if (no_aggr) {
+ for (counter = 0; counter < nr_counters; counter++)
+ read_counter(counter);
+ } else {
+ for (counter = 0; counter < nr_counters; counter++)
+ read_counter_aggr(counter);
+ }
return WEXITSTATUS(status);
}
@@ -377,11 +446,15 @@ static void print_noise(int counter, double avg)
100 * stddev_stats(&event_res_stats[counter][0]) / avg);
}
-static void nsec_printout(int counter, double avg)
+static void nsec_printout(int cpu, int counter, double avg)
{
double msecs = avg / 1e6;
- fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter));
+ if (no_aggr)
+ fprintf(stderr, "CPU%-4d %18.6f %-24s",
+ cpumap[cpu], msecs, event_name(counter));
+ else
+ fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter));
if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
fprintf(stderr, " # %10.3f CPUs ",
@@ -389,33 +462,41 @@ static void nsec_printout(int counter, double avg)
}
}
-static void abs_printout(int counter, double avg)
+static void abs_printout(int cpu, int counter, double avg)
{
double total, ratio = 0.0;
+ char cpustr[16] = { '\0', };
+
+ if (no_aggr)
+ sprintf(cpustr, "CPU%-4d", cpumap[cpu]);
+ else
+ cpu = 0;
if (big_num)
- fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter));
+ fprintf(stderr, "%s %'18.0f %-24s",
+ cpustr, avg, event_name(counter));
else
- fprintf(stderr, " %18.0f %-24s", avg, event_name(counter));
+ fprintf(stderr, "%s %18.0f %-24s",
+ cpustr, avg, event_name(counter));
if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
- total = avg_stats(&runtime_cycles_stats);
+ total = avg_stats(&runtime_cycles_stats[cpu]);
if (total)
ratio = avg / total;
fprintf(stderr, " # %10.3f IPC ", ratio);
} else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
- runtime_branches_stats.n != 0) {
- total = avg_stats(&runtime_branches_stats);
+ runtime_branches_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_branches_stats[cpu]);
if (total)
ratio = avg * 100 / total;
fprintf(stderr, " # %10.3f %% ", ratio);
- } else if (runtime_nsecs_stats.n != 0) {
- total = avg_stats(&runtime_nsecs_stats);
+ } else if (runtime_nsecs_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_nsecs_stats[cpu]);
if (total)
ratio = 1000.0 * avg / total;
@@ -426,8 +507,9 @@ static void abs_printout(int counter, double avg)
/*
* Print out the results of a single counter:
+ * aggregated counts in system-wide mode
*/
-static void print_counter(int counter)
+static void print_counter_aggr(int counter)
{
double avg = avg_stats(&event_res_stats[counter][0]);
int scaled = event_scaled[counter];
@@ -439,9 +521,9 @@ static void print_counter(int counter)
}
if (nsec_counter(counter))
- nsec_printout(counter, avg);
+ nsec_printout(-1, counter, avg);
else
- abs_printout(counter, avg);
+ abs_printout(-1, counter, avg);
print_noise(counter, avg);
@@ -458,6 +540,42 @@ static void print_counter(int counter)
fprintf(stderr, "\n");
}
+/*
+ * Print out the results of a single counter:
+ * does not use aggregated count in system-wide
+ */
+static void print_counter(int counter)
+{
+ u64 ena, run, val;
+ int cpu;
+
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ val = cpu_counts[cpu][counter].val;
+ ena = cpu_counts[cpu][counter].ena;
+ run = cpu_counts[cpu][counter].run;
+ if (run == 0 || ena == 0) {
+ fprintf(stderr, "CPU%-4d %18s %-24s", cpumap[cpu],
+ "<not counted>", event_name(counter));
+
+ fprintf(stderr, "\n");
+ continue;
+ }
+
+ if (nsec_counter(counter))
+ nsec_printout(cpu, counter, val);
+ else
+ abs_printout(cpu, counter, val);
+
+ print_noise(counter, 1.0);
+
+ if (run != ena) {
+ fprintf(stderr, " (scaled from %.2f%%)",
+ 100.0 * run / ena);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
static void print_stat(int argc, const char **argv)
{
int i, counter;
@@ -480,8 +598,13 @@ static void print_stat(int argc, const char **argv)
fprintf(stderr, " (%d runs)", run_count);
fprintf(stderr, ":\n\n");
- for (counter = 0; counter < nr_counters; counter++)
- print_counter(counter);
+ if (no_aggr) {
+ for (counter = 0; counter < nr_counters; counter++)
+ print_counter(counter);
+ } else {
+ for (counter = 0; counter < nr_counters; counter++)
+ print_counter_aggr(counter);
+ }
fprintf(stderr, "\n");
fprintf(stderr, " %18.9f seconds time elapsed",
@@ -545,6 +668,8 @@ static const struct option options[] = {
"print large numbers with thousands\' separators"),
OPT_STRING('C', "cpu", &cpu_list, "cpu",
"list of cpus to monitor in system-wide"),
+ OPT_BOOLEAN('A', "no-aggr", &no_aggr,
+ "disable CPU count aggregation"),
OPT_END()
};
@@ -562,6 +687,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
if (run_count <= 0)
usage_with_options(stat_usage, options);
+ /* no_aggr is for system-wide only */
+ if (no_aggr && !system_wide)
+ usage_with_options(stat_usage, options);
+
/* Set attrs and nr_counters if no event is selected and !null_run */
if (!null_run && !nr_counters) {
memcpy(attrs, default_attrs, sizeof(default_attrs));
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index dd625808c2a..3d2b47d5121 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1214,7 +1214,9 @@ try_again:
int err = errno;
if (err == EPERM || err == EACCES)
- die("No permission - are you root?\n");
+ die("Permission error - are you root?\n"
+ "\t Consider tweaking"
+ " /proc/sys/kernel/perf_event_paranoid.\n");
/*
* If it's cycles then fall back to hrtimer
* based cpu-clock-tick sw counter, which
@@ -1231,7 +1233,7 @@ try_again:
goto try_again;
}
printf("\n");
- error("perfcounter syscall returned with %d (%s)\n",
+ error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
fd[i][counter][thread_index], strerror(err));
die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
exit(-1);
diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak
index b253db634f0..b041ca67a2c 100644
--- a/tools/perf/feature-tests.mak
+++ b/tools/perf/feature-tests.mak
@@ -9,8 +9,8 @@ endef
ifndef NO_DWARF
define SOURCE_DWARF
#include <dwarf.h>
-#include <libdw.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
#ifndef _ELFUTILS_PREREQ
#error
#endif
diff --git a/tools/perf/util/include/asm/cpufeature.h b/tools/perf/util/include/asm/cpufeature.h
new file mode 100644
index 00000000000..acffd5e4d1d
--- /dev/null
+++ b/tools/perf/util/include/asm/cpufeature.h
@@ -0,0 +1,9 @@
+
+#ifndef PERF_CPUFEATURE_H
+#define PERF_CPUFEATURE_H
+
+/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define X86_FEATURE_REP_GOOD 0
+
+#endif /* PERF_CPUFEATURE_H */
diff --git a/tools/perf/util/include/asm/dwarf2.h b/tools/perf/util/include/asm/dwarf2.h
new file mode 100644
index 00000000000..bb4198e7837
--- /dev/null
+++ b/tools/perf/util/include/asm/dwarf2.h
@@ -0,0 +1,11 @@
+
+#ifndef PERF_DWARF2_H
+#define PERF_DWARF2_H
+
+/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define CFI_STARTPROC
+#define CFI_ENDPROC
+
+#endif /* PERF_DWARF2_H */
+
diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h
new file mode 100644
index 00000000000..06387cffe12
--- /dev/null
+++ b/tools/perf/util/include/linux/linkage.h
@@ -0,0 +1,13 @@
+
+#ifndef PERF_LINUX_LINKAGE_H_
+#define PERF_LINUX_LINKAGE_H_
+
+/* linkage.h ... for including arch/x86/lib/memcpy_64.S */
+
+#define ENTRY(name) \
+ .globl name; \
+ name:
+
+#define ENDPROC(name)
+
+#endif /* PERF_LINUX_LINKAGE_H_ */
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index bba69d45569..beaefc3c122 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -34,9 +34,9 @@ extern int find_available_vars_at(int fd, struct perf_probe_event *pev,
bool externs);
#include <dwarf.h>
-#include <libdw.h>
-#include <libdwfl.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/libdwfl.h>
+#include <elfutils/version.h>
struct probe_finder {
struct perf_probe_event *pev; /* Target probe event */