From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Dec 2008 10:39:53 +0100 Subject: performance counters: x86 support Implement performance counters for x86 Intel CPUs. It's simplified right now: the PERFMON CPU feature is assumed, which is available in Core2 and later Intel CPUs. The design is flexible to be extended to more CPU types as well. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S | 3 +- arch/x86/include/asm/hardirq_32.h | 1 + arch/x86/include/asm/hw_irq.h | 2 + arch/x86/include/asm/intel_arch_perfmon.h | 34 +- arch/x86/include/asm/irq_vectors.h | 5 + arch/x86/include/asm/mach-default/entry_arch.h | 5 + arch/x86/include/asm/pda.h | 1 + arch/x86/include/asm/thread_info.h | 4 +- arch/x86/include/asm/unistd_32.h | 1 + arch/x86/include/asm/unistd_64.h | 3 +- arch/x86/kernel/apic.c | 2 + arch/x86/kernel/cpu/Makefile | 12 +- arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/perf_counter.c | 571 +++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 + arch/x86/kernel/irq.c | 5 + arch/x86/kernel/irqinit_32.c | 3 + arch/x86/kernel/irqinit_64.c | 5 + arch/x86/kernel/signal.c | 7 +- arch/x86/kernel/syscall_table_32.S | 1 + 21 files changed, 652 insertions(+), 21 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_counter.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d4d4cb7629e..f2fdc186724 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -643,6 +643,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) + select HAVE_PERF_COUNTERS config X86_IO_APIC def_bool y diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 256b00b6189..3c14ed07dc4 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -823,7 +823,8 @@ ia32_sys_call_table: .quad compat_sys_signalfd4 .quad sys_eventfd2 .quad sys_epoll_create1 - .quad sys_dup3 /* 330 */ + .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad sys_perf_counter_open ia32_syscall_end: diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h index 5ca135e72f2..b3e475dc933 100644 --- a/arch/x86/include/asm/hardirq_32.h +++ b/arch/x86/include/asm/hardirq_32.h @@ -9,6 +9,7 @@ typedef struct { unsigned long idle_timestamp; unsigned int __nmi_count; /* arch dependent */ unsigned int apic_timer_irqs; /* arch dependent */ + unsigned int apic_perf_irqs; /* arch dependent */ unsigned int irq0_irqs; unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 8de644b6b95..aa93e53b85e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -30,6 +30,8 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); extern void error_interrupt(void); +extern void perf_counter_interrupt(void); + extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h index fa0fd068bc2..71598a9eab6 100644 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ b/arch/x86/include/asm/intel_arch_perfmon.h @@ -1,22 +1,24 @@ #ifndef _ASM_X86_INTEL_ARCH_PERFMON_H #define _ASM_X86_INTEL_ARCH_PERFMON_H -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 +#define 
MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 union cpuid10_eax { struct { @@ -28,4 +30,12 @@ union cpuid10_eax { unsigned int full; }; +#ifdef CONFIG_PERF_COUNTERS +extern void init_hw_perf_counters(void); +extern void perf_counters_lapic_init(int nmi); +#else +static inline void init_hw_perf_counters(void) { } +static inline void perf_counters_lapic_init(int nmi) { } +#endif + #endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f94..b8d277f1252 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -86,6 +86,11 @@ */ #define LOCAL_TIMER_VECTOR 0xef +/* + * Performance monitoring interrupt vector: + */ +#define LOCAL_PERF_VECTOR 0xee + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h index 6b1add8e31d..ad31e5d90e9 100644 --- a/arch/x86/include/asm/mach-default/entry_arch.h +++ b/arch/x86/include/asm/mach-default/entry_arch.h @@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) * a much simpler SMP time architecture: */ #ifdef CONFIG_X86_LOCAL_APIC + BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +#ifdef CONFIG_PERF_COUNTERS +BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) +#endif + #ifdef CONFIG_X86_MCE_P4THERMAL BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h index 2fbfff88df3..90a8d9d4206 100644 --- a/arch/x86/include/asm/pda.h +++ b/arch/x86/include/asm/pda.h @@ -30,6 +30,7 @@ struct x8664_pda { short isidle; struct mm_struct *active_mm; unsigned apic_timer_irqs; + unsigned apic_perf_irqs; unsigned irq0_irqs; unsigned irq_resched_count; unsigned irq_call_count; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad..810bf266d13 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -80,6 +80,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an 
MCE */ +#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -103,6 +104,7 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -135,7 +137,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index f2bba78430a..7e47658b0a6 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -338,6 +338,7 @@ #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 +#define __NR_perf_counter_open 333 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d2e415e6666..53025feaf88 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3) __SYSCALL(__NR_pipe2, sys_pipe2) #define __NR_inotify_init1 294 __SYSCALL(__NR_inotify_init1, sys_inotify_init1) - +#define __NR_perf_counter_open 295 +__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b52..8ab8c185867 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif + perf_counters_lapic_init(0); preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c05..89e53361fe2 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@ # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks # obj-y := intel_cacheinfo.o addon_cpuid_features.o @@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_X86_MCE) += mcheck/ -obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE) += mcheck/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a..4461011db47 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -750,6 +751,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + init_hw_perf_counters(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c 
b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 00000000000..82440cbed0e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,571 @@ +/* + * Performance counter x86 architecture code + * + * Copyright(C) 2008 Thomas Gleixner + * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static bool perf_counters_initialized __read_mostly; + +/* + * Number of (generic) HW counters: + */ +static int nr_hw_counters __read_mostly; +static u32 perf_counter_mask __read_mostly; + +/* No support for fixed function counters yet */ + +#define MAX_HW_COUNTERS 8 + +struct cpu_hw_counters { + struct perf_counter *counters[MAX_HW_COUNTERS]; + unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + int enable_all; +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); + +const int intel_perfmon_event_map[] = +{ + [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_CACHE_MISSES] = 0x412e, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); + +/* + * Setup the hardware configuration for a given hw_event_type + */ +int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +{ + struct hw_perf_counter *hwc = &counter->hw; + + if (unlikely(!perf_counters_initialized)) + return -EINVAL; + + /* + * Count user events, and generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + + /* + * If privileged enough, count OS events too, and allow + * NMI events as well: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN)) { + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + if (hw_event_type & PERF_COUNT_NMI) + hwc->nmi = 1; + } + + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + + hwc->irq_period = counter->__irq_period; + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + if (!hwc->irq_period) + hwc->irq_period = 0x7FFFFFFF; + + hwc->next_count = -((s32) hwc->irq_period); + + /* + * Negative event types mean raw encoded event+umask values: + */ + if (hw_event_type < 0) { + counter->hw_event_type = -hw_event_type; + counter->hw_event_type &= ~PERF_COUNT_NMI; + } else { + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type >= max_intel_perfmon_events) + return -EINVAL; + /* + * The generic map: + */ + counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + } + hwc->config |= counter->hw_event_type; + counter->wakeup_pending = 0; + + return 0; +} + +static void __hw_perf_enable_all(void) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); +} + +void hw_perf_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 1; + __hw_perf_enable_all(); +} + +void hw_perf_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 0; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); +} + +static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + 
per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + + wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +void hw_perf_counter_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + /* Try to get the previous counter again */ + if (test_and_set_bit(idx, cpuc->used)) { + idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + + perf_counters_lapic_init(hwc->nmi); + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + cpuc->counters[idx] = counter; + counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + __hw_perf_counter_enable(hwc, idx); +} + +#ifdef CONFIG_X86_64 +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} +#else +/* + * Todo: add proper atomic64_t support to 32-bit x86: + */ +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} +#endif + +static void __hw_perf_save_counter(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + s64 raw = -1; + s64 delta; + int err; + + /* + * Get the raw hw counter value: + */ + err = rdmsrl_safe(hwc->counter_base + idx, &raw); + WARN_ON_ONCE(err); + + /* + * Rebase it to zero (it started counting at -irq_period), + * to see the delta since ->prev_count: + */ + delta = (s64)hwc->irq_period + (s64)(s32)raw; + + atomic64_counter_set(counter, hwc->prev_count + delta); + + /* + * Adjust the ->prev_count offset - if we went beyond + * irq_period of units, then we got an IRQ and the counter + * was set back to -irq_period: + */ + while (delta >= (s64)hwc->irq_period) { + hwc->prev_count += hwc->irq_period; + delta -= (s64)hwc->irq_period; + } + + /* + * Calculate the next raw counter value we'll write into + * the counter at the next sched-in time: + */ + delta -= (s64)hwc->irq_period; + + hwc->next_count = (s32)delta; +} + +void perf_counter_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + int cpu, err, idx; + + local_irq_disable(); + + cpu = smp_processor_id(); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); + WARN_ON_ONCE(err); + + printk(KERN_INFO "\n"); + printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); + printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); + printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + + for (idx = 0; idx < nr_hw_counters; idx++) { + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); + WARN_ON_ONCE(err); + + next_count = per_cpu(prev_next_count[idx], cpu); + + printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + printk(KERN_INFO "CPU#%d: PMC%d 
count: %016llx\n", + cpu, idx, pmc_count); + printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", + cpu, idx, next_count); + } + local_irq_enable(); +} + +void hw_perf_counter_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + unsigned int idx = hwc->idx; + + counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(hwc->config_base + idx, hwc->config, 0); + + clear_bit(idx, cpuc->used); + cpuc->counters[idx] = NULL; + __hw_perf_save_counter(counter, hwc, idx); +} + +void hw_perf_counter_read(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + unsigned long addr = hwc->counter_base + hwc->idx; + s64 offs, val = -1LL; + s32 val32; + int err; + + /* Careful: NMI might modify the counter offset */ + do { + offs = hwc->prev_count; + err = rdmsrl_safe(addr, &val); + WARN_ON_ONCE(err); + } while (offs != hwc->prev_count); + + val32 = (s32) val; + val = (s64)hwc->irq_period + (s64)val32; + atomic64_counter_set(counter, hwc->prev_count + val); +} + +static void perf_store_irq_data(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_save_and_restart(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_enable(hwc, idx); + } +} + +static void +perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +{ + struct perf_counter_context *ctx = leader->ctx; + struct perf_counter *counter; + int bit; + + list_for_each_entry(counter, &ctx->counters, list) { + if (counter->record_type != PERF_RECORD_SIMPLE || + counter == leader) + continue; + + if (counter->active) { + /* + * When counter was not in the overflow mask, we have to + * read it from hardware. We read it as well, when it + * has not been read yet and clear the bit in the + * status mask. 
+ */ + bit = counter->hw.idx; + if (!test_bit(bit, (unsigned long *) overflown) || + test_bit(bit, (unsigned long *) status)) { + clear_bit(bit, (unsigned long *) status); + perf_save_and_restart(counter); + } + } + perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, atomic64_counter_read(counter)); + } +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +{ + int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) { + ack_APIC_irq(); + return; + } + + /* Disable counters globally */ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + ack_APIC_irq(); + + cpuc = &per_cpu(cpu_hw_counters, cpu); + +again: + ack = status; + for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!counter) + continue; + + perf_save_and_restart(counter); + + switch (counter->record_type) { + case PERF_RECORD_SIMPLE: + continue; + case PERF_RECORD_IRQ: + perf_store_irq_data(counter, instruction_pointer(regs)); + break; + case PERF_RECORD_GROUP: + perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + atomic64_counter_read(counter)); + perf_handle_group(counter, &status, &ack); + break; + } + /* + * From NMI context we cannot call into the scheduler to + * do a task wakeup - but we mark these counters as + * wakeup_pending and initate a wakeup callback: + */ + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else { + wake_up(&counter->waitq); + } + } + + wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + + /* + * Repeat if there is more work to be done: + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (status) + goto again; + + /* + * Do not reenable when global enable is off: + */ + if (cpuc->enable_all) + __hw_perf_enable_all(); +} + +void smp_perf_counter_interrupt(struct pt_regs *regs) +{ + irq_enter(); +#ifdef CONFIG_X86_64 + add_pda(apic_perf_irqs, 1); +#else + per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++; +#endif + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + __smp_perf_counter_interrupt(regs, 0); + + irq_exit(); +} + +/* + * This handler is triggered by NMI contexts: + */ +void perf_counter_notify(struct pt_regs *regs) +{ + struct cpu_hw_counters *cpuc; + unsigned long flags; + int bit, cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + for_each_bit(bit, cpuc->used, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + if (!counter) + continue; + + if (counter->wakeup_pending) { + counter->wakeup_pending = 0; + wake_up(&counter->waitq); + } + } + + local_irq_restore(flags); +} + +void __cpuinit perf_counters_lapic_init(int nmi) +{ + u32 apic_val; + + if (!perf_counters_initialized) + return; + /* + * Enable the performance counter vector in the APIC LVT: + */ + apic_val = apic_read(APIC_LVTERR); + + apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); + if (nmi) + apic_write(APIC_LVTPC, APIC_DM_NMI); + else + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + apic_write(APIC_LVTERR, apic_val); +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (likely(cmd != 
DIE_NMI_IPI)) + return NOTIFY_DONE; + + regs = args->regs; + + apic_write(APIC_LVTPC, APIC_DM_NMI); + __smp_perf_counter_interrupt(regs, 1); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { + .notifier_call = perf_counter_nmi_handler +}; + +void __init init_hw_perf_counters(void) +{ + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired Event or not. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return; + + printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); + + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + nr_hw_counters = eax.split.num_counters; + if (nr_hw_counters > MAX_HW_COUNTERS) { + nr_hw_counters = MAX_HW_COUNTERS; + WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", + nr_hw_counters, MAX_HW_COUNTERS); + } + perf_counter_mask = (1 << nr_hw_counters) - 1; + perf_max_counters = nr_hw_counters; + + printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + + perf_counters_lapic_init(0); + register_die_notifier(&perf_counter_nmi_notifier); + + perf_counters_initialized = true; +} diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3194636a429..fc013cfde30 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PERF_VECTOR \ + perf_counter_interrupt smp_perf_counter_interrupt +#endif + /* * Exception entry points. 
*/ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f64..d92bc71e41a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); + seq_printf(p, "CNT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance counter interrupts\n"); #endif #ifdef CONFIG_SMP seq_printf(p, "RES: "); @@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->apic_perf_irqs; #endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 607db63044a..6a33b5e3016 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -160,6 +160,9 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +# endif #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8670b3ce626..91d785c25ad 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -138,6 +138,11 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupt: */ +#ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +#endif } void __init native_init_IRQ(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b1cc6da6420..dee553c503d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,7 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ - +#include #include #include #include @@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } + if (thread_info_flags & _TIF_PERF_COUNTERS) { + clear_thread_flag(TIF_PERF_COUNTERS); + perf_counter_notify(regs); + } + #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395ff34c..496726ddcea 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,4 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_perf_counter_open -- cgit v1.2.3-70-g09d2 From 87b9cf4623ad4e5fc009e48c020593dffd5d3793 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Dec 2008 14:20:16 +0100 Subject: x86, perfcounters: read out MSR_CORE_PERF_GLOBAL_STATUS with counters disabled Impact: make perfcounter NMI and IRQ sequence more robust Make __smp_perf_counter_interrupt() a bit more conservative: first disable all counters, then read out the status. Most invocations are because there are real events, so there's no performance impact. Code flow gets a bit simpler as well this way. 
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82440cbed0e..615e953208e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc; u64 ack, status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - if (!status) { - ack_APIC_irq(); - return; - } - /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) + goto out; + again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { @@ -440,7 +438,7 @@ again: rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (status) goto again; - +out: /* * Do not reenable when global enable is off: */ -- cgit v1.2.3-70-g09d2 From 7e2ae34749edf19e76e594b9c4b2cdde1066afc5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 11:40:46 +0100 Subject: perfcounters, x86: simplify disable/enable of counters Impact: fix spurious missed counter wakeups In the case of NMI events, close a race window that can occur if an NMI hits counter code that temporarily disables+enables a counter, and the NMI leaks into the disabled section. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 40 ++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 615e953208e..7d528ffc2d2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -136,14 +136,25 @@ void hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } +static inline void +__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +{ + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) { per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; wrmsr(hwc->counter_base + idx, hwc->next_count, 0); - wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } void hw_perf_counter_enable(struct perf_counter *counter) @@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __hw_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; - counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + __hw_perf_counter_set_period(hwc, idx); __hw_perf_counter_enable(hwc, idx); } @@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(hwc->config_base + idx, hwc->config, 0); + __hw_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -328,18 +338,24 @@ static void perf_store_irq_data(struct 
perf_counter *counter, u64 data) } } +/* + * NMI-safe enable method: + */ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + u64 pmc_ctrl; + int err; - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); - if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { - __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_set_period(hwc, idx); + + if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) __hw_perf_counter_enable(hwc, idx); - } } static void -- cgit v1.2.3-70-g09d2 From 1e12567678054bc1d4c944ecfad17624b3e49345 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:18:18 +0100 Subject: perfcounters, x86: clean up debug code Impact: cleanup Get rid of unused debug code. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d528ffc2d2..919ec46679b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter, { s64 raw = -1; s64 delta; - int err; /* * Get the raw hw counter value: */ - err = rdmsrl_safe(hwc->counter_base + idx, &raw); - WARN_ON_ONCE(err); + rdmsrl(hwc->counter_base + idx, raw); /* * Rebase it to zero (it started counting at -irq_period), @@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter, void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; - int cpu, err, idx; + int cpu, idx; + + if (!nr_hw_counters) + return; local_irq_disable(); cpu = smp_processor_id(); - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); - WARN_ON_ONCE(err); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); @@ -273,11 +269,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); for (idx = 0; idx < nr_hw_counters; idx++) { - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); - WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); + rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); next_count = per_cpu(prev_next_count[idx], cpu); @@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter) unsigned long addr = hwc->counter_base + hwc->idx; s64 offs, val = -1LL; s32 val32; - int err; /* Careful: NMI might modify the counter offset */ do { offs = hwc->prev_count; - err = rdmsrl_safe(addr, &val); - WARN_ON_ONCE(err); + rdmsrl(addr, val); } while (offs != hwc->prev_count); val32 = (s32) val; @@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; u64 pmc_ctrl; - int err; - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - 
WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); __hw_perf_save_counter(counter, hwc, idx); __hw_perf_counter_set_period(hwc, idx); -- cgit v1.2.3-70-g09d2 From 43874d238d5f208854a73c3225ca2a22833eec8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:23:59 +0100 Subject: perfcounters: consolidate global-disable codepaths Impact: cleanup Simplify global disable handling. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 919ec46679b..6a93d1f04d9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[MAX_HW_COUNTERS]; unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; - int enable_all; }; /* @@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) return 0; } -static void __hw_perf_enable_all(void) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); -} - void hw_perf_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 1; - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } void hw_perf_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 0; wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } @@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); + u64 ack, status, saved_global; struct cpu_hw_counters *cpuc; - u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); @@ -445,10 +435,9 @@ again: goto again; out: /* - * Do not reenable when global enable is off: + * Restore - do not reenable when global enable is off: */ - if (cpuc->enable_all) - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3-70-g09d2 From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Dec 2008 21:43:39 +0100 Subject: perf counters: protect them against CSTATE transitions Impact: fix rare lost events problem There are CPUs whose performance counters misbehave on CSTATE transitions, so provide a way to just disable/enable them around deep idle methods. (hw_perf_enable_all() is cheap on x86.) 
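A minimal sketch (editor's illustration, not part of the patch) of the bracket pattern these helpers enable: hw_perf_disable_all() returns the saved MSR_CORE_PERF_GLOBAL_CTRL value and hw_perf_restore_ctrl() writes it back. enter_cstate() below is a hypothetical stand-in for the architectural idle call; the real users are the acpi_cstate_enter()/acpi_idle_do_entry() hunks further down.

#include <linux/types.h>
#include <linux/perf_counter.h>

static void idle_with_counters_quiesced(void (*enter_cstate)(void))
{
	u64 pctrl;

	pctrl = hw_perf_disable_all();	/* saves and clears MSR_CORE_PERF_GLOBAL_CTRL */
	enter_cstate();			/* counters stay quiet across the CSTATE transition */
	hw_perf_restore_ctrl(pctrl);	/* restore the previous global-enable mask */
}
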
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++++++++- drivers/acpi/processor_idle.c | 8 ++++++++ include/linux/perf_counter.h | 4 ++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6a93d1f04d9..0a7f3bea2dc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -119,10 +120,21 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_disable_all(void) +void hw_perf_restore_ctrl(u64 ctrl) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); + +u64 hw_perf_disable_all(void) +{ + u64 ctrl; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } +EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5f8d746a9b8..cca804e6f1d 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -270,8 +270,11 @@ static atomic_t c3_cpu_count; /* Common C-state entry for C2, C3, .. */ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) { + u64 pctrl; + /* Don't trace irqs off for idle */ stop_critical_timings(); + pctrl = hw_perf_disable_all(); if (cstate->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cstate); @@ -284,6 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + hw_perf_restore_ctrl(pctrl); start_critical_timings(); } #endif /* !CONFIG_CPU_IDLE */ @@ -1425,8 +1429,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr, */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { + u64 pctrl; + /* Don't trace irqs off for idle */ stop_critical_timings(); + pctrl = hw_perf_disable_all(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1441,6 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + hw_perf_restore_ctrl(pctrl); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 22c4469abf4..5031b5614f2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void hw_perf_restore_ctrl(u64 ctrl); +extern u64 hw_perf_disable_all(void); #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -166,6 +168,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void hw_perf_restore_ctrl(u64 ctrl) { } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ -- cgit v1.2.3-70-g09d2 From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++----------- include/linux/perf_counter.h | 4 +--- kernel/perf_counter.c | 10 +++++----- 3 files changed, 17 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0a7f3bea2dc..30e7ebf7827 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +int hw_perf_counter_init(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->__irq_period; + hwc->irq_period = counter->event.hw_event_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->next_count = -((s32) hwc->irq_period); /* - * Negative event types mean raw encoded event+umask values: + * Raw event type provide the config in the event structure */ - if (hw_event_type < 0) { - counter->hw_event_type = -hw_event_type; - counter->hw_event_type &= ~PERF_COUNT_NMI; + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type == PERF_COUNT_RAW) { + hwc->config |= counter->event.hw_raw_ctrl; } else { - hw_event_type &= ~PERF_COUNT_NMI; if (hw_event_type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event_type]; } - hwc->config |= 
counter->hw_event_type; counter->wakeup_pending = 0; return 0; @@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, counter->event.hw_event_type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -418,7 +417,8 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + counter->event.hw_event_type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index daedd7d87c2..1f0017673e7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -96,8 +96,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - u64 __irq_period; - + struct perf_counter_event event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -111,7 +110,6 @@ struct perf_counter { int oncpu; int cpu; - s32 hw_event_type; enum perf_record_type record_type; /* read() / irq related data */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 96c333a5b0f..2557c670a3b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,7 +37,7 @@ static DEFINE_MUTEX(perf_resource_mutex); * Architecture provided APIs - weak aliases: */ -int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) +int __weak hw_perf_counter_init(struct perf_counter *counter) { return -EINVAL; } @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -722,7 +722,7 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->record_type = record_type; - counter->__irq_period = hw_event_period; + counter->event = *event; counter->wakeup_pending = 0; return counter; @@ -750,11 +750,11 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); + counter = perf_counter_alloc(&event, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, event.hw_event_type); + ret = hw_perf_counter_init(counter); if (ret) goto err_free_put_context; -- cgit v1.2.3-70-g09d2 From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. 
Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. ) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++----- include/linux/perf_counter.h | 98 ++++++++++++++++++++++++-------------- include/linux/syscalls.h | 12 +++-- kernel/perf_counter.c | 38 ++++++++------- 4 files changed, 106 insertions(+), 71 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 30e7ebf7827..ef1936a871a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); */ int hw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; - u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 0; if (capable(CAP_SYS_ADMIN)) { hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event_type & PERF_COUNT_NMI) + if (hw_event->nmi) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->event.hw_event_period; + hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter) if (!hwc->irq_period) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -((s32) hwc->irq_period); + hwc->next_count = -(s32)hwc->irq_period; /* * Raw event type provide the config in the event structure */ - hw_event_type &= ~PERF_COUNT_NMI; - if (hw_event_type == PERF_COUNT_RAW) { - hwc->config |= counter->event.hw_raw_ctrl; + if (hw_event->raw) { + hwc->config |= hw_event->type; } else { - if (hw_event_type >= max_intel_perfmon_events) + if (hw_event->type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event->type]; } counter->wakeup_pending = 0; @@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) int bit; list_for_each_entry(counter, &ctx->counters, list) { - if (counter->record_type != PERF_RECORD_SIMPLE || + if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || counter == leader) continue; @@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->event.hw_event_type); + perf_store_irq_data(leader, counter->hw_event.type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -410,7 +409,7 @@ again: perf_save_and_restart(counter); - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: continue; case PERF_RECORD_IRQ: @@ -418,7 +417,7 @@ again: break; case PERF_RECORD_GROUP: perf_store_irq_data(counter, - counter->event.hw_event_type); + counter->hw_event.type); perf_store_irq_data(counter, atomic64_counter_read(counter)); 
perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1f0017673e7..a2b4852e2d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,65 +24,93 @@ struct task_struct; /* - * Generalized hardware event types, used by the hw_event_type parameter - * of the sys_perf_counter_open() syscall: + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: */ enum hw_event_types { - PERF_COUNT_CYCLES, - PERF_COUNT_INSTRUCTIONS, - PERF_COUNT_CACHE_REFERENCES, - PERF_COUNT_CACHE_MISSES, - PERF_COUNT_BRANCH_INSTRUCTIONS, - PERF_COUNT_BRANCH_MISSES, /* - * If this bit is set in the type, then trigger NMI sampling: + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_NMI = (1 << 30), - PERF_COUNT_RAW = (1 << 31), + PERF_COUNT_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* * IRQ-notification data record type: */ -enum perf_record_type { - PERF_RECORD_SIMPLE, - PERF_RECORD_IRQ, - PERF_RECORD_GROUP, +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; -struct perf_counter_event { - u32 hw_event_type; - u32 hw_event_period; - u64 hw_raw_ctrl; +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + u64 type; + + u64 irq_period; + u32 record_type; + + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + __reserved_1 : 29; + + u64 __reserved_2; }; +/* + * Kernel-internal data types: + */ + /** - * struct hw_perf_counter - performance counter hardware details + * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - u64 prev_count; - s32 next_count; - u64 irq_period; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + u64 irq_period; + s32 next_count; }; /* * Hardcoded buffer length limit for now, for IRQ-fed events: */ -#define PERF_DATA_BUFLEN 2048 +#define PERF_DATA_BUFLEN 2048 /** * struct perf_data - performance counter IRQ data sampling ... 
*/ struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; }; /** @@ -96,7 +124,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -110,8 +138,6 @@ struct perf_counter { int oncpu; int cpu; - enum perf_record_type record_type; - /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3ecd73d03da..a549678b7c3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,7 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; -struct perf_counter_event; +struct perf_counter_hw_event; #include #include @@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd); + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2557c670a3b..0d323ceda3a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: return perf_read_hw(counter, buf, count); @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) INIT_LIST_HEAD(&counter->list); init_waitqueue_head(&counter->waitq); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; - counter->cpu = cpu; - counter->record_type = record_type; - counter->event = *event; - counter->wakeup_pending = 0; + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->hw_event = *hw_event; + counter->wakeup_pending = 0; return counter; } /** - * sys_perf_task_open - open a performance counter associate it to a task - * @hw_event_type: event type for monitoring/sampling... 
+ * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * + * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd) +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd) + { struct perf_counter_context *ctx; - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct perf_counter *counter; int ret; - if (copy_from_user(&event, uevent, sizeof(event)) != 0) + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; ctx = find_get_context(pid, cpu); @@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&event, cpu, record_type); + counter = perf_counter_alloc(&hw_event, cpu); if (!counter) goto err_put_context; -- cgit v1.2.3-70-g09d2 From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++-- include/linux/perf_counter.h | 8 +- kernel/perf_counter.c | 282 ++++++++++++++++++++++++++++--------- 3 files changed, 236 insertions(+), 82 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ef1936a871a..54b4ad0cce6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter) } static void -perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { - struct perf_counter_context *ctx = leader->ctx; - struct perf_counter *counter; + struct perf_counter *counter, *group_leader = sibling->group_leader; int bit; - list_for_each_entry(counter, &ctx->counters, list) { - if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || - counter == leader) - continue; + /* + * Store the counter's own timestamp first: + */ + perf_store_irq_data(sibling, sibling->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(sibling)); - if (counter->active) { + /* + * Then store sibling timestamps (if any): + */ + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + if (!counter->active) { /* * When counter was not in the overflow mask, we have to * read it from hardware. 
We read it as well, when it @@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event.type); - perf_store_irq_data(leader, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(counter)); } } @@ -416,10 +420,6 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, - counter->hw_event.type); - perf_store_irq_data(counter, - atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); break; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a2b4852e2d7..7af7d896546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -117,7 +117,10 @@ struct perf_data { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { - struct list_head list; + struct list_head list_entry; + struct list_head sibling_list; + struct perf_counter *group_leader; + int active; #if BITS_PER_LONG == 64 atomic64_t count; @@ -158,7 +161,8 @@ struct perf_counter_context { * Protect the list of counters: */ spinlock_t lock; - struct list_head counters; + + struct list_head counter_list; int nr_counters; int nr_active; struct task_struct *task; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0d323ceda3a..fa59fe8c02d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,7 @@ void __weak hw_perf_counter_setup(void) { } * Read the cached counter in counter safe against cross CPU / NMI * modifications. 64 bit version - no complications. */ -static inline u64 perf_read_counter_safe(struct perf_counter *counter) +static inline u64 perf_counter_read_safe(struct perf_counter *counter) { return (u64) atomic64_read(&counter->count); } @@ -66,7 +67,7 @@ static inline u64 perf_read_counter_safe(struct perf_counter *counter) * Read the cached counter in counter safe against cross CPU / NMI * modifications. 32 bit version. 
*/ -static u64 perf_read_counter_safe(struct perf_counter *counter) +static u64 perf_counter_read_safe(struct perf_counter *counter) { u32 cntl, cnth; @@ -83,13 +84,55 @@ static u64 perf_read_counter_safe(struct perf_counter *counter) #endif +static void +list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *group_leader = counter->group_leader; + + /* + * Depending on whether it is a standalone or sibling counter, + * add it straight to the context's counter list, or to the group + * leader's sibling list: + */ + if (counter->group_leader == counter) + list_add_tail(&counter->list_entry, &ctx->counter_list); + else + list_add_tail(&counter->list_entry, &group_leader->sibling_list); +} + +static void +list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *sibling, *tmp; + + list_del_init(&counter->list_entry); + + if (list_empty(&counter->sibling_list)) + return; + + /* + * If this was a group counter with sibling counters then + * upgrade the siblings to singleton counters by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, + &counter->sibling_list, list_entry) { + + list_del_init(&sibling->list_entry); + list_add_tail(&sibling->list_entry, &ctx->counter_list); + WARN_ON_ONCE(!sibling->group_leader); + WARN_ON_ONCE(sibling->group_leader == sibling); + sibling->group_leader = sibling; + } +} + /* * Cross CPU call to remove a performance counter * * We disable the counter on the hardware level first. After that we * remove it from the context list. */ -static void __perf_remove_from_context(void *info) +static void __perf_counter_remove_from_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; @@ -119,7 +162,7 @@ static void __perf_remove_from_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_del_init(&counter->list); + list_del_counter(counter, ctx); hw_perf_enable_all(); if (!ctx->task) { @@ -144,7 +187,7 @@ static void __perf_remove_from_context(void *info) * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. */ -static void perf_remove_from_context(struct perf_counter *counter) +static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; @@ -155,32 +198,32 @@ static void perf_remove_from_context(struct perf_counter *counter) * the removal is always sucessful. */ smp_call_function_single(counter->cpu, - __perf_remove_from_context, + __perf_counter_remove_from_context, counter, 1); return; } retry: - task_oncpu_function_call(task, __perf_remove_from_context, + task_oncpu_function_call(task, __perf_counter_remove_from_context, counter); spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ - if (ctx->nr_active && !list_empty(&counter->list)) { + if (ctx->nr_active && !list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if it the call above did not + * can remove the counter safely, if the call above did not * succeed. 
*/ - if (!list_empty(&counter->list)) { + if (!list_empty(&counter->list_entry)) { ctx->nr_counters--; - list_del_init(&counter->list); + list_del_counter(counter, ctx); counter->task = NULL; } spin_unlock_irq(&ctx->lock); @@ -211,7 +254,7 @@ static void __perf_install_in_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_add_tail(&counter->list, &ctx->counters); + list_add_counter(counter, ctx); hw_perf_enable_all(); ctx->nr_counters++; @@ -268,7 +311,7 @@ retry: * If the context is active and the counter has not been added * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list)) { + if (ctx->nr_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -278,13 +321,45 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list)) { - list_add_tail(&counter->list, &ctx->counters); + if (list_empty(&counter->list_entry)) { + list_add_counter(counter, ctx); ctx->nr_counters++; } spin_unlock_irq(&ctx->lock); } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (!counter->active) + return; + + hw_perf_counter_disable(counter); + counter->active = 0; + counter->oncpu = -1; + + cpuctx->active_oncpu--; + ctx->nr_active--; +} + +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -306,21 +381,48 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { - if (!ctx->nr_active) - break; - if (counter->active) { - hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; - ctx->nr_active--; - cpuctx->active_oncpu--; - } + if (ctx->nr_active) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) + group_sched_out(counter, cpuctx, ctx); } spin_unlock(&ctx->lock); cpuctx->task_ctx = NULL; } +static void +counter_sched_in(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + if (!counter->active) + return; + + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + + cpuctx->active_oncpu++; + ctx->nr_active++; +} + +static void +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter; + + counter_sched_in(group_counter, cpuctx, ctx, cpu); + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_in(counter, cpuctx, ctx, cpu); +} + /* * Called from scheduler to add the counters of the current task * with interrupts disabled. 
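As a usage illustration of the group_fd semantics introduced by this patch, here is a small user-space sketch (not part of the patch itself): the perf_counter_open() wrapper, the __NR_perf_counter_open syscall number and the assumption that the definitions from include/linux/perf_counter.h are visible to user space are all illustrative; the struct fields and enum values follow the header changes above.

	#include <unistd.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <linux/perf_counter.h>

	static int perf_counter_open(struct perf_counter_hw_event *hw_event,
				     pid_t pid, int cpu, int group_fd)
	{
		/* __NR_perf_counter_open is assumed to be wired up on this arch */
		return syscall(__NR_perf_counter_open, hw_event, pid, cpu, group_fd);
	}

	static int open_cycles_insns_group(pid_t pid)
	{
		struct perf_counter_hw_event hw_event;
		int leader_fd, sibling_fd;

		memset(&hw_event, 0, sizeof(hw_event));
		hw_event.type = PERF_COUNT_CYCLES;

		/* group_fd == -1: this counter becomes its own group leader */
		/* cpu == -1: count the task wherever it runs */
		leader_fd = perf_counter_open(&hw_event, pid, -1, -1);
		if (leader_fd < 0)
			return leader_fd;

		/* siblings are scheduled in and out atomically with the leader */
		hw_event.type = PERF_COUNT_INSTRUCTIONS;
		sibling_fd = perf_counter_open(&hw_event, pid, -1, leader_fd);

		return sibling_fd < 0 ? sibling_fd : leader_fd;
	}

Both file descriptors can then be read() for a u64 count each; setting hw_event.record_type to PERF_RECORD_GROUP on the leader (together with an irq_period) is what makes an overflow IRQ record the whole group via perf_handle_group(), as described in the changelog above.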
@@ -342,19 +444,21 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (ctx->nr_active == cpuctx->max_pertask) break; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of counters: + */ if (counter->cpu != -1 && counter->cpu != cpu) continue; - hw_perf_counter_enable(counter); - counter->active = 1; - counter->oncpu = cpu; - ctx->nr_active++; - cpuctx->active_oncpu++; + group_sched_in(counter, cpuctx, ctx, cpu); } spin_unlock(&ctx->lock); + cpuctx->task_ctx = ctx; } @@ -371,12 +475,12 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) spin_lock(&ctx->lock); /* - * Rotate the first entry last: + * Rotate the first entry last (works just fine for group counters too): */ hw_perf_disable_all(); - list_for_each_entry(counter, &ctx->counters, list) { - list_del(&counter->list); - list_add_tail(&counter->list, &ctx->counters); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + list_del(&counter->list_entry); + list_add_tail(&counter->list_entry, &ctx->counter_list); break; } hw_perf_enable_all(); @@ -386,17 +490,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->nr_counters = 0; + ctx->task = task; +} /* * Initialize the perf_counter context in task_struct */ void perf_counter_init_task(struct task_struct *task) { - struct perf_counter_context *ctx = &task->perf_counter_ctx; - - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counters); - ctx->nr_counters = 0; - ctx->task = task; + __perf_counter_init_context(&task->perf_counter_ctx, task); } /* @@ -407,7 +518,7 @@ static void __hw_perf_counter_read(void *info) hw_perf_counter_read(info); } -static u64 perf_read_counter(struct perf_counter *counter) +static u64 perf_counter_read(struct perf_counter *counter) { /* * If counter is enabled and currently active on a CPU, update the @@ -418,7 +529,7 @@ static u64 perf_read_counter(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_read_counter_safe(counter); + return perf_counter_read_safe(counter); } /* @@ -555,7 +666,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); @@ -577,7 +688,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return -EINVAL; mutex_lock(&counter->mutex); - cntval = perf_read_counter(counter); + cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); return put_user(cntval, (u64 __user *) buf) ? 
-EFAULT : sizeof(cntval); @@ -707,15 +818,25 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, + int cpu, + struct perf_counter *group_leader) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; + /* + * Single counters are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = counter; + mutex_init(&counter->mutex); - INIT_LIST_HEAD(&counter->list); + INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); counter->irqdata = &counter->data[0]; @@ -723,6 +844,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; + counter->group_leader = group_leader; return counter; } @@ -743,20 +865,45 @@ asmlinkage int sys_perf_counter_open( int group_fd) { - struct perf_counter_context *ctx; + struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; - struct perf_counter *counter; + struct perf_counter_context *ctx; + struct file *group_file = NULL; + int fput_needed = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; + /* + * Look up the group leader: + */ + group_leader = NULL; + if (group_fd != -1) { + ret = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto out_fput; + if (group_file->f_op != &perf_fops) + goto out_fput; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy: + */ + if (group_leader->group_leader) + goto out_fput; + } + + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&hw_event, cpu); + counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; @@ -770,11 +917,14 @@ asmlinkage int sys_perf_counter_open( if (ret < 0) goto err_remove_free_put_context; +out_fput: + fput_light(group_file, fput_needed); + return ret; err_remove_free_put_context: mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); err_free_put_context: @@ -783,40 +933,40 @@ err_free_put_context: err_put_context: put_context(ctx); - return ret; + goto out_fput; } -static void __cpuinit perf_init_cpu(int cpu) +static void __cpuinit perf_counter_init_cpu(int cpu) { - struct perf_cpu_context *ctx; + struct perf_cpu_context *cpuctx; - ctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_init(&ctx->ctx.lock); - INIT_LIST_HEAD(&ctx->ctx.counters); + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_counter_init_context(&cpuctx->ctx, NULL); mutex_lock(&perf_resource_mutex); - ctx->max_pertask = perf_max_counters - perf_reserved_percpu; + cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; mutex_unlock(&perf_resource_mutex); + hw_perf_counter_setup(); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_exit_cpu(void *info) +static void __perf_counter_exit_cpu(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = &cpuctx->ctx; struct perf_counter *counter, *tmp; - list_for_each_entry_safe(counter, tmp, &ctx->counters, 
list) - __perf_remove_from_context(counter); + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) + __perf_counter_remove_from_context(counter); } -static void perf_exit_cpu(int cpu) +static void perf_counter_exit_cpu(int cpu) { - smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); + smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); } #else -static inline void perf_exit_cpu(int cpu) { } +static inline void perf_counter_exit_cpu(int cpu) { } #endif static int __cpuinit @@ -828,12 +978,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - perf_init_cpu(cpu); + perf_counter_init_cpu(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - perf_exit_cpu(cpu); + perf_counter_exit_cpu(cpu); break; default: -- cgit v1.2.3-70-g09d2 From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++++++++++++++--------- include/linux/perf_counter.h | 15 +++++++++++++ kernel/perf_counter.c | 45 ++++++++++++++++++++------------------ 3 files changed, 66 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 54b4ad0cce6..718b635dece 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; @@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void) EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void -__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) { wrmsr(hwc->config_base + idx, hwc->config, 0); } @@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) wrmsr(hwc->counter_base + idx, hwc->next_count, 0); } -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -void hw_perf_counter_enable(struct perf_counter *counter) +static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - 
__hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; __hw_perf_counter_set_period(hwc, idx); - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } #ifdef CONFIG_X86_64 @@ -282,20 +282,20 @@ void perf_counter_print_debug(void) local_irq_enable(); } -void hw_perf_counter_disable(struct perf_counter *counter) +static void x86_perf_counter_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; __hw_perf_save_counter(counter, hwc, idx); } -void hw_perf_counter_read(struct perf_counter *counter) +static void x86_perf_counter_read(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; unsigned long addr = hwc->counter_base + hwc->idx; @@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } static void @@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } + +static struct hw_perf_counter_ops x86_perf_counter_ops = { + .hw_perf_counter_enable = x86_perf_counter_enable, + .hw_perf_counter_disable = x86_perf_counter_disable, + .hw_perf_counter_read = x86_perf_counter_read, +}; + +struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return NULL; + + return &x86_perf_counter_ops; +} diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7af7d896546..27385641ecb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -113,6 +113,17 @@ struct perf_data { u8 data[PERF_DATA_BUFLEN]; }; +struct perf_counter; + +/** + * struct hw_perf_counter_ops - performance counter hw ops + */ +struct hw_perf_counter_ops { + void (*hw_perf_counter_enable) (struct perf_counter *counter); + void (*hw_perf_counter_disable) (struct perf_counter *counter); + void (*hw_perf_counter_read) (struct perf_counter *counter); +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -120,6 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; + struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -185,6 +197,9 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter); + extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 278209c547a..e6e41ca9546 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,18 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ - -int __weak hw_perf_counter_init(struct perf_counter *counter) +extern __weak struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { - return -EINVAL; + return ERR_PTR(-EINVAL); } -void __weak 
hw_perf_counter_enable(struct perf_counter *counter) { } -void __weak hw_perf_counter_disable(struct perf_counter *counter) { } -void __weak hw_perf_counter_read(struct perf_counter *counter) { } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +void __weak hw_perf_disable_all(void) { } +void __weak hw_perf_enable_all(void) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -146,7 +143,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); if (counter->active) { - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; ctx->nr_active--; cpuctx->active_oncpu--; @@ -257,7 +254,7 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; ctx->nr_active++; @@ -333,7 +330,7 @@ counter_sched_out(struct perf_counter *counter, if (!counter->active) return; - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; counter->oncpu = -1; @@ -392,7 +389,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -509,7 +506,9 @@ void perf_counter_init_task(struct task_struct *task) */ static void __hw_perf_counter_read(void *info) { - hw_perf_counter_read(info); + struct perf_counter *counter = info; + + counter->hw_ops->hw_perf_counter_read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -816,8 +815,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); + struct hw_perf_counter_ops *hw_ops; + struct perf_counter *counter; + counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; @@ -839,6 +840,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->hw_event = *hw_event; counter->wakeup_pending = 0; counter->group_leader = group_leader; + counter->hw_ops = NULL; + + hw_ops = hw_perf_counter_init(counter); + if (!hw_ops) { + kfree(counter); + return NULL; + } + counter->hw_ops = hw_ops; return counter; } @@ -908,10 +917,6 @@ asmlinkage int sys_perf_counter_open( if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter); - if (ret) - goto err_free_put_context; - perf_install_in_context(ctx, counter, cpu); ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -927,8 +932,6 @@ err_remove_free_put_context: mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); - -err_free_put_context: kfree(counter); err_put_context: -- cgit v1.2.3-70-g09d2 From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. 
(regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 ++------------- include/linux/perf_counter.h | 9 ++-- kernel/perf_counter.c | 95 ++++++++++++++++++++++++++++++++------ 3 files changed, 92 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 718b635dece..43c8e9a38b4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter) __x86_perf_counter_enable(hwc, idx); } -#ifdef CONFIG_X86_64 -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} -#else -/* - * Todo: add proper atomic64_t support to 32-bit x86: - */ -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} -#endif - static void __hw_perf_save_counter(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter) } while (offs != hwc->prev_count); val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; + val = (s64)hwc->irq_period + (s64)val32; atomic64_counter_set(counter, hwc->prev_count + val); } @@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } -static struct hw_perf_counter_ops x86_perf_counter_ops = { +static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, .hw_perf_counter_read = x86_perf_counter_read, }; -struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { int err; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 27385641ecb..9a1713a1be2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -131,7 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -197,7 +197,7 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern struct hw_perf_counter_ops * +extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); @@ -208,6 +208,9 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern void hw_perf_restore_ctrl(u64 ctrl); extern u64 hw_perf_disable_all(void); +extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); +extern u64 
atomic64_counter_read(struct perf_counter *counter); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -219,7 +222,7 @@ static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e6e41ca9546..506286e5ba6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,15 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak struct hw_perf_counter_ops * +extern __weak const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { return ERR_PTR(-EINVAL); } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +u64 __weak hw_perf_disable_all(void) { return 0; } +void __weak hw_perf_restore_ctrl(u64 ctrl) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -58,6 +58,16 @@ static inline u64 perf_counter_read_safe(struct perf_counter *counter) return (u64) atomic64_read(&counter->count); } +void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} + #else /* @@ -79,6 +89,20 @@ static u64 perf_counter_read_safe(struct perf_counter *counter) return cntl | ((u64) cnth) << 32; } +void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} + #endif static void @@ -131,6 +155,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -155,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_del_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); if (!ctx->task) { /* @@ -232,6 +257,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -247,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_add_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); ctx->nr_counters++; @@ -457,6 +483,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + u64 perf_flags; if (likely(!ctx->nr_counters)) return; @@ -468,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); spin_unlock(&ctx->lock); @@ -807,6 +834,42 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; +static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_read(struct perf_counter *counter) +{ + int cpu = raw_smp_processor_id(); + + atomic64_counter_set(counter, cpu_clock(cpu)); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_clock = { + .hw_perf_counter_enable = cpu_clock_perf_counter_enable, + .hw_perf_counter_disable = cpu_clock_perf_counter_disable, + .hw_perf_counter_read = cpu_clock_perf_counter_read, +}; + +static const struct hw_perf_counter_ops * +sw_perf_counter_init(struct perf_counter *counter) +{ + const struct hw_perf_counter_ops *hw_ops = NULL; + + switch (counter->hw_event.type) { + case PERF_COUNT_CPU_CLOCK: + hw_ops = &perf_ops_cpu_clock; + break; + default: + break; + } + return hw_ops; +} + /* * Allocate and initialize a counter structure */ @@ -815,7 +878,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -842,7 +905,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; - hw_ops = hw_perf_counter_init(counter); + hw_ops = NULL; + if (!hw_event->raw && hw_event->type < 0) + hw_ops = sw_perf_counter_init(counter); + if (!hw_ops) { + hw_ops = hw_perf_counter_init(counter); + } + if (!hw_ops) { kfree(counter); return NULL; @@ -912,7 +981,7 @@ asmlinkage int sys_perf_counter_open( goto err_put_context; } - ret = -ENOMEM; + ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; -- cgit v1.2.3-70-g09d2 From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- drivers/acpi/processor_idle.c | 10 +++++----- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 16 ++++++++-------- 4 files changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c 
b/arch/x86/kernel/cpu/perf_counter.c index 43c8e9a38b4..3e1dbebe22b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -118,13 +118,13 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore_ctrl(u64 ctrl) +void hw_perf_restore(u64 ctrl) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } -EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); +EXPORT_SYMBOL_GPL(hw_perf_restore); -u64 hw_perf_disable_all(void) +u64 hw_perf_save_disable(void) { u64 ctrl; @@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); return ctrl; } -EXPORT_SYMBOL_GPL(hw_perf_disable_all); +EXPORT_SYMBOL_GPL(hw_perf_save_disable); static inline void __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index cca804e6f1d..a3e66a33b7a 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -270,11 +270,11 @@ static atomic_t c3_cpu_count; /* Common C-state entry for C2, C3, .. */ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) { - u64 pctrl; + u64 perf_flags; /* Don't trace irqs off for idle */ stop_critical_timings(); - pctrl = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); if (cstate->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cstate); @@ -287,7 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore_ctrl(pctrl); + hw_perf_restore(perf_flags); start_critical_timings(); } #endif /* !CONFIG_CPU_IDLE */ @@ -1433,7 +1433,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) /* Don't trace irqs off for idle */ stop_critical_timings(); - pctrl = hw_perf_disable_all(); + pctrl = hw_perf_save_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1448,7 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore_ctrl(pctrl); + hw_perf_restore(pctrl); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9a1713a1be2..68f6e3ad531 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -67,7 +67,7 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - u64 type; + s64 type; u64 irq_period; u32 record_type; @@ -206,8 +206,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); -extern void hw_perf_restore_ctrl(u64 ctrl); -extern u64 hw_perf_disable_all(void); +extern u64 hw_perf_save_disable(void); +extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); @@ -221,8 +221,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } -static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline void hw_perf_restore(u64 ctrl) { } +static inline u64 hw_perf_save_disable(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 506286e5ba6..0e93fea1712 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -43,8 +43,8 @@ hw_perf_counter_init(struct perf_counter *counter) return ERR_PTR(-EINVAL); } -u64 __weak hw_perf_disable_all(void) { return 0; } -void __weak hw_perf_restore_ctrl(u64 ctrl) { } +u64 __weak hw_perf_save_disable(void) { return 0; } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -180,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_del_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); if (!ctx->task) { /* @@ -273,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_add_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); ctx->nr_counters++; @@ -495,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); -- cgit v1.2.3-70-g09d2 From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- include/linux/perf_counter.h | 11 ++++++++++- kernel/perf_counter.c | 29 ++++++++++++++--------------- 3 files changed, 25 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3e1dbebe22b..4854cca7fff 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Then store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { /* * When counter was not in the overflow mask, we have to * read it from hardware. 
We read it as well, when it diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 97d86c293ee..8cb095fa442 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,6 +127,15 @@ struct hw_perf_counter_ops { void (*hw_perf_counter_read) (struct perf_counter *counter); }; +/** + * enum perf_counter_active_state - the states of a counter + */ +enum perf_counter_active_state { + PERF_COUNTER_STATE_OFF = -1, + PERF_COUNTER_STATE_INACTIVE = 0, + PERF_COUNTER_STATE_ACTIVE = 1, +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -136,7 +145,7 @@ struct perf_counter { struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; - int active; + enum perf_counter_active_state state; #if BITS_PER_LONG == 64 atomic64_t count; #else diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4e679b91d8b..559130b8774 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -167,9 +167,9 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; counter->task = NULL; @@ -281,7 +281,7 @@ static void __perf_install_in_context(void *info) if (cpuctx->active_oncpu < perf_max_counters) { counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; @@ -328,7 +328,6 @@ retry: spin_lock_irq(&ctx->lock); /* - * If the context is active and the counter has not been added * we need to retry the smp call. 
*/ if (ctx->nr_active && list_empty(&counter->list_entry)) { @@ -353,12 +352,12 @@ counter_sched_out(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx) { - if (!counter->active) + if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->oncpu = -1; cpuctx->active_oncpu--; ctx->nr_active--; @@ -415,11 +414,11 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->active == -1) + if (counter->state == PERF_COUNTER_STATE_OFF) return; counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; @@ -506,8 +505,8 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->active == 1); - counter->active = -1; + WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + counter->state = PERF_COUNTER_STATE_OFF; } hw_perf_restore(perf_flags); @@ -540,9 +539,9 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->active != -1) + if (counter->state != PERF_COUNTER_STATE_OFF) continue; - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; } hw_perf_restore(perf_flags); @@ -620,7 +619,7 @@ static u64 perf_counter_read(struct perf_counter *counter) * If counter is enabled and currently active on a CPU, update the * value in the counter structure: */ - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __hw_perf_counter_read, counter, 1); } @@ -673,7 +672,7 @@ static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) retry: spin_lock_irq(&ctx->lock); - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { counter->irqdata = counter->usrdata; counter->usrdata = oldirqdata; spin_unlock_irq(&ctx->lock); -- cgit v1.2.3-70-g09d2 From 9b194e831fb2c322ed81a373e49620f34edc2778 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 20:22:35 +0100 Subject: x86: implement atomic64_t on 32-bit Impact: new API Implement the atomic64_t APIs on 32-bit as well. Will be used by the performance counters code. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 218 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index ad5b9f6ecdd..9927e01b03c 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -255,5 +255,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() +/* An 64bit atomic type */ + +typedef struct { + unsigned long long counter; +} atomic64_t; + +#define ATOMIC64_INIT(val) { (val) } + +/** + * atomic64_read - read atomic64 variable + * @v: pointer of type atomic64_t + * + * Atomically reads the value of @v. + * Doesn't imply a read memory barrier. 
+ */ +#define __atomic64_read(ptr) ((ptr)->counter) + +static inline unsigned long long +cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) +{ + asm volatile( + + LOCK_PREFIX "cmpxchg8b (%[ptr])\n" + + : "=A" (old) + + : [ptr] "D" (ptr), + "A" (old), + "b" (ll_low(new)), + "c" (ll_high(new)) + + : "memory"); + + return old; +} + +static inline unsigned long long +atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, + unsigned long long new_val) +{ + return cmpxchg8b(&ptr->counter, old_val, new_val); +} + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) +{ + unsigned long long old_val; + + do { + old_val = atomic_read(ptr); + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); +} + +/** + * atomic64_read - read atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically reads the value of @ptr and returns it. + */ +static inline unsigned long long atomic64_read(atomic64_t *ptr) +{ + unsigned long long curr_val; + + do { + curr_val = __atomic64_read(ptr); + } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); + + return curr_val; +} + +/** + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +static inline unsigned long long +atomic64_add_return(unsigned long long delta, atomic64_t *ptr) +{ + unsigned long long old_val, new_val; + + do { + old_val = atomic_read(ptr); + new_val = old_val + delta; + + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return new_val; +} + +static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) +{ + return atomic64_add_return(-delta, ptr); +} + +static inline long atomic64_inc_return(atomic64_t *ptr) +{ + return atomic64_add_return(1, ptr); +} + +static inline long atomic64_dec_return(atomic64_t *ptr) +{ + return atomic64_sub_return(1, ptr); +} + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) +{ + atomic64_add_return(delta, ptr); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) +{ + atomic64_add(-delta, ptr); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int +atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) +{ + unsigned long long old_val = atomic64_sub_return(delta, ptr); + + return old_val == 0; +} + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +static inline void atomic64_inc(atomic64_t *ptr) +{ + atomic64_add(1, ptr); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. 
+ */ +static inline void atomic64_dec(atomic64_t *ptr) +{ + atomic64_sub(1, ptr); +} + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic64_dec_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(1, ptr); +} + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_inc_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(-1, ptr); +} + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int +atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) +{ + long long old_val = atomic64_add_return(delta, ptr); + + return old_val < 0; +} + #include #endif /* _ASM_X86_ATOMIC_32_H */ -- cgit v1.2.3-70-g09d2 From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++++++----------------- include/linux/perf_counter.h | 15 ++- kernel/perf_counter.c | 68 +---------- 4 files changed, 137 insertions(+), 178 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2fdc186724..fe94490bab6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -643,7 +643,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) - select HAVE_PERF_COUNTERS + select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b903f8df72b..5afae13d8d5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] = const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static void +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + u64 prev_raw_count, new_raw_count, delta; + + WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); + /* + * Careful: an NMI might modify the previous counter value. 
+ * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. + * + * Careful, not all hw sign-extends above the physical width + * of the count, so we do that by clipping the delta to 32 bits: + */ + delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + WARN_ON_ONCE((int)delta < 0); + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if (!hwc->irq_period) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -(s32)hwc->irq_period; + atomic64_set(&hwc->period_left, hwc->irq_period); /* * Raw event type provide the config in the event structure @@ -118,12 +160,6 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore(u64 ctrl) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); -} -EXPORT_SYMBOL_GPL(hw_perf_restore); - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void) } EXPORT_SYMBOL_GPL(hw_perf_save_disable); +void hw_perf_restore(u64 ctrl) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore); + static inline void -__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { - wrmsr(hwc->config_base + idx, hwc->config, 0); + int err; + + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + WARN_ON_ONCE(err); } -static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); -static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) +/* + * Set the next IRQ period, based on the hwc->period_left value. 
+ * To be called with the counter disabled in hw: + */ +static void +__hw_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { - per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + s32 left = atomic64_read(&hwc->period_left); + s32 period = hwc->irq_period; + + WARN_ON_ONCE(period <= 0); + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + } - wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + WARN_ON_ONCE(left <= 0); + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)(s64)-left); + + wrmsr(hwc->counter_base + idx, -left, 0); } -static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void +__x86_perf_counter_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - __hw_perf_counter_set_period(hwc, idx); - __x86_perf_counter_enable(hwc, idx); -} - -static void __hw_perf_save_counter(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 raw = -1; - s64 delta; - - /* - * Get the raw hw counter value: - */ - rdmsrl(hwc->counter_base + idx, raw); - - /* - * Rebase it to zero (it started counting at -irq_period), - * to see the delta since ->prev_count: - */ - delta = (s64)hwc->irq_period + (s64)(s32)raw; - - atomic64_counter_set(counter, hwc->prev_count + delta); - - /* - * Adjust the ->prev_count offset - if we went beyond - * irq_period of units, then we got an IRQ and the counter - * was set back to -irq_period: - */ - while (delta >= (s64)hwc->irq_period) { - hwc->prev_count += hwc->irq_period; - delta -= (s64)hwc->irq_period; - } - - /* - * Calculate the next raw counter value we'll write into - * the counter at the next sched-in time: - */ - delta -= (s64)hwc->irq_period; - - hwc->next_count = (s32)delta; + __hw_perf_counter_set_period(counter, hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; if (!nr_hw_counters) @@ -241,14 +286,14 @@ void perf_counter_print_debug(void) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); - next_count = per_cpu(prev_next_count[idx], cpu); + prev_left = per_cpu(prev_left[idx], cpu); printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", - cpu, idx, next_count); + printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + cpu, idx, prev_left); } 
local_irq_enable(); } @@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; - __hw_perf_save_counter(counter, hwc, idx); -} -static void x86_perf_counter_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - unsigned long addr = hwc->counter_base + hwc->idx; - s64 offs, val = -1LL; - s32 val32; - - /* Careful: NMI might modify the counter offset */ - do { - offs = hwc->prev_count; - rdmsrl(addr, val); - } while (offs != hwc->prev_count); - - val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; - atomic64_counter_set(counter, hwc->prev_count + val); + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); } static void perf_store_irq_data(struct perf_counter *counter, u64 data) @@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } /* - * NMI-safe enable method: + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: */ static void perf_save_and_restart(struct perf_counter *counter) { @@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - __hw_perf_save_counter(counter, hwc, idx); - __hw_perf_counter_set_period(hwc, idx); + x86_perf_counter_update(counter, hwc, idx); + __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } static void perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { struct perf_counter *counter, *group_leader = sibling->group_leader; - int bit; - - /* - * Store the counter's own timestamp first: - */ - perf_store_irq_data(sibling, sibling->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(sibling)); /* - * Then store sibling timestamps (if any): + * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - /* - * When counter was not in the overflow mask, we have to - * read it from hardware. We read it as well, when it - * has not been read yet and clear the bit in the - * status mask. 
- */ - bit = counter->hw.idx; - if (!test_bit(bit, (unsigned long *) overflown) || - test_bit(bit, (unsigned long *) status)) { - clear_bit(bit, (unsigned long *) status); - perf_save_and_restart(counter); - } - } + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } @@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } +static void x86_perf_counter_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8cb095fa442..72460289c65 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -91,14 +91,16 @@ struct perf_counter_hw_event { * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { +#ifdef CONFIG_PERF_COUNTERS u64 config; unsigned long config_base; unsigned long counter_base; int nmi; unsigned int idx; - u64 prev_count; + atomic64_t prev_count; u64 irq_period; - s32 next_count; + atomic64_t period_left; +#endif }; /* @@ -140,17 +142,15 @@ enum perf_counter_active_state { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { +#ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; -#if BITS_PER_LONG == 64 atomic64_t count; -#else - atomic_t count32[2]; -#endif + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -172,6 +172,7 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; +#endif }; /** @@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); -extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); -extern u64 atomic64_counter_read(struct perf_counter *counter); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 559130b8774..416861ce8b2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter) } u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } -#if BITS_PER_LONG == 64 - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 64 bit version - no complications. - */ -static inline u64 perf_counter_read_safe(struct perf_counter *counter) -{ - return (u64) atomic64_read(&counter->count); -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} - -#else - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 32 bit version. 
- */ -static u64 perf_counter_read_safe(struct perf_counter *counter) -{ - u32 cntl, cnth; - - local_irq_disable(); - do { - cnth = atomic_read(&counter->count32[1]); - cntl = atomic_read(&counter->count32[0]); - } while (cnth != atomic_read(&counter->count32[1])); - - local_irq_enable(); - - return cntl | ((u64) cnth) << 32; -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} - -#endif - static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - counter->hw_ops->hw_perf_counter_enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; + counter->hw_ops->hw_perf_counter_enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_counter_read_safe(counter); + return atomic64_read(&counter->count); } /* @@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); - atomic64_counter_set(counter, cpu_clock(cpu)); + atomic64_set(&counter->count, cpu_clock(cpu)); } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { @@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - atomic64_counter_set(counter, current->se.sum_exec_runtime); + atomic64_set(&counter->count, current->se.sum_exec_runtime); } static const struct hw_perf_counter_ops perf_ops_task_clock = { -- cgit v1.2.3-70-g09d2 From 2b9ff0db19b5e2c77000b7201525f9c3d6e8328d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 18:36:30 +0100 Subject: perfcounters: fix non-intel-perfmon CPUs Do not write MSR_CORE_PERF_GLOBAL_CTRL on CPUs where it does not exist. 
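
A minimal sketch of how callers pair the two global-control helpers that this patch guards; on a CPU without architectural perfmon the guarded versions below degrade to harmless no-ops (save returns 0, restore and enable return early). The wrapper function here is hypothetical and only illustrates the calling pattern:

	static void example_update_counters(void)	/* hypothetical caller */
	{
		u64 ctrl;

		ctrl = hw_perf_save_disable();	/* returns 0 when no PMU is present */
		/* ... manipulate counter state with counting stopped ... */
		hw_perf_restore(ctrl);		/* no-op when no PMU is present */
	}
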
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5afae13d8d5..6d30f603b62 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -157,6 +157,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) void hw_perf_enable_all(void) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } @@ -164,14 +167,21 @@ u64 hw_perf_save_disable(void) { u64 ctrl; + if (unlikely(!perf_counters_initialized)) + return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } EXPORT_SYMBOL_GPL(hw_perf_save_disable); void hw_perf_restore(u64 ctrl) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } EXPORT_SYMBOL_GPL(hw_perf_restore); -- cgit v1.2.3-70-g09d2 From 75f224cf7700ed6006574dc3f2efa29860727570 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 21:58:46 +0100 Subject: perfcounters: fix lapic initialization Fix non-working NMI sampling in certain bootup scenarios. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6d30f603b62..8a154bd7ba9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -557,10 +557,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + perf_counters_initialized = true; + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); - - perf_counters_initialized = true; } static void x86_perf_counter_read(struct perf_counter *counter) -- cgit v1.2.3-70-g09d2 From 94c46572a6d9bb497eda0a14099d9f1360d57d5d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 22:37:58 +0530 Subject: x86: perf_counter.c intel_perfmon_event_map and max_intel_perfmon_events should be static Impact: cleanup, avoid sparse warnings, reduce kernel size a bit Fixes these sparse warnings: arch/x86/kernel/cpu/perf_counter.c:44:11: warning: symbol 'intel_perfmon_event_map' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:54:11: warning: symbol 'max_intel_perfmon_events' was not declared. Should it be static? 
Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8a154bd7ba9..bdbdb56eaa3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,7 +41,7 @@ struct cpu_hw_counters { */ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); -const int intel_perfmon_event_map[] = +static const int intel_perfmon_event_map[] = { [PERF_COUNT_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -51,7 +51,7 @@ const int intel_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Propagate counter elapsed time into the generic counter. -- cgit v1.2.3-70-g09d2 From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ------- include/linux/perf_counter.h | 4 ---- kernel/perf_counter.c | 8 -------- 3 files changed, 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bdbdb56eaa3..89fad5d4fb3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter, { u64 prev_raw_count, new_raw_count, delta; - WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); /* * Careful: an NMI might modify the previous counter value. 
* @@ -89,7 +88,6 @@ again: * of the count, so we do that by clipping the delta to 32 bits: */ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); - WARN_ON_ONCE((int)delta < 0); atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); @@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter, int err; err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); - WARN_ON_ONCE(err); } static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); @@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, s32 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; - WARN_ON_ONCE(period <= 0); - /* * If we are way outside a reasoable range then just skip forward: */ @@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->period_left, left); } - WARN_ON_ONCE(left <= 0); - per_cpu(prev_left[idx], smp_processor_id()) = left; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f30486fc55d..d038450de87 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -218,8 +218,6 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern void -perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); @@ -237,8 +235,6 @@ extern int perf_counter_task_enable(void); #else static inline void -perf_counter_show(struct perf_counter *counter, char *str, int trace) { } -static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5431e790b5d..aab6c123b02 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -861,8 +861,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -906,8 +904,6 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -954,8 +950,6 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -1000,8 +994,6 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } -- cgit v1.2.3-70-g09d2 From 5c167b8585c8d91206b395d57011ead7711e322f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:02:19 +0100 Subject: x86, perfcounters: rename intel_arch_perfmon.h => perf_counter.h Impact: rename include file We'll be providing an asm/perf_counter.h to the generic perfcounter code, so use the already existing x86 file for this purpose and rename it. 
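
A rough sketch of what the rename means for users of the header: the arch/x86 consumers listed in the diffstat below switch their include, and a later patch in this series pulls the arch header into the generic include/linux/perf_counter.h. The exact include lines shown here are illustrative, since the header names are elided in the hunks below:

	/* before: */
	#include <asm/intel_arch_perfmon.h>

	/* after: */
	#include <asm/perf_counter.h>
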
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel_arch_perfmon.h | 41 ------------------------------- arch/x86/include/asm/perf_counter.h | 41 +++++++++++++++++++++++++++++++ arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- arch/x86/oprofile/op_model_ppro.c | 2 +- 7 files changed, 46 insertions(+), 46 deletions(-) delete mode 100644 arch/x86/include/asm/intel_arch_perfmon.h create mode 100644 arch/x86/include/asm/perf_counter.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h deleted file mode 100644 index 71598a9eab6..00000000000 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H -#define _ASM_X86_INTEL_ARCH_PERFMON_H - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) - -#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 - -union cpuid10_eax { - struct { - unsigned int version_id:8; - unsigned int num_counters:8; - unsigned int bit_width:8; - unsigned int mask_length:8; - } split; - unsigned int full; -}; - -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(int nmi); -#else -static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(int nmi) { } -#endif - -#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h new file mode 100644 index 00000000000..9dadce1124e --- /dev/null +++ b/arch/x86/include/asm/perf_counter.h @@ -0,0 +1,41 @@ +#ifndef _ASM_X86_PERF_COUNTER_H +#define _ASM_X86_PERF_COUNTER_H + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +#ifdef CONFIG_PERF_COUNTERS +extern void init_hw_perf_counters(void); +extern void perf_counters_lapic_init(int nmi); +#else +static inline void init_hw_perf_counters(void) { } +static inline void perf_counters_lapic_init(int nmi) { } +#endif + +#endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 0579ec1cd6e..4f859acb156 100644 
--- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4461011db47..ad331b4d623 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 89fad5d4fb3..a4a3a09a654 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include static bool perf_counters_initialized __read_mostly; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9abd48b2267..d6f5b9fbde3 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include #include -#include +#include struct nmi_watchdog_ctlblk { unsigned int cccr_msr; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index e9f80c744cf..07c914555a5 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "op_x86_model.h" #include "op_counter.h" -- cgit v1.2.3-70-g09d2 From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:09:13 +0100 Subject: x86, perfcounters: prepare for fixed-mode PMCs Impact: refactor the x86 code for fixed-mode PMCs Extend the data structures and rename the existing facilities to allow for a 'generic' versus 'fixed' counter distinction. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 11 ++++++++ arch/x86/kernel/cpu/perf_counter.c | 53 ++++++++++++++++++------------------- include/linux/perf_counter.h | 1 + 3 files changed, 38 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 9dadce1124e..dd5a4a559e2 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -1,6 +1,13 @@ #ifndef _ASM_X86_PERF_COUNTER_H #define _ASM_X86_PERF_COUNTER_H +/* + * Performance counter hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -20,6 +27,10 @@ #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ union cpuid10_eax { struct { unsigned int version_id:8; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a4a3a09a654..fc3af868823 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; -/* No support for fixed function counters yet */ - -#define MAX_HW_COUNTERS 8 - struct cpu_hw_counters { - struct perf_counter *counters[MAX_HW_COUNTERS]; - unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + struct perf_counter *generic[X86_PMC_MAX_GENERIC]; + unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; + + struct perf_counter *fixed[X86_PMC_MAX_FIXED]; + unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; }; /* @@ -185,7 +184,7 @@ void 
hw_perf_restore(u64 ctrl) EXPORT_SYMBOL_GPL(hw_perf_restore); static inline void -__x86_perf_counter_disable(struct perf_counter *counter, +__pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; @@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter, err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, } static void -__x86_perf_counter_enable(struct perf_counter *counter, +__pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, @@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter, /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void x86_perf_counter_enable(struct perf_counter *counter) +static void pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); - cpuc->counters[idx] = counter; + cpuc->generic[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } void perf_counter_print_debug(void) @@ -301,16 +300,16 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void x86_perf_counter_disable(struct perf_counter *counter) +static void pmc_generic_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; + cpuc->generic[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } static void @@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -412,7 +411,7 @@ again: } /* * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these counters as + * do a task wakeup - but we mark these generic as * wakeup_pending and initate a wakeup callback: */ if (nmi) { @@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs) cpuc = &per_cpu(cpu_hw_counters, cpu); for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; if (!counter) continue; @@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... 
version: %d\n", eax.split.version_id); printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > MAX_HW_COUNTERS) { - nr_hw_counters = MAX_HW_COUNTERS; + if (nr_hw_counters > X86_PMC_MAX_GENERIC) { + nr_hw_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, MAX_HW_COUNTERS); + nr_hw_counters, X86_PMC_MAX_GENERIC); } perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; @@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_perf_counter_read(struct perf_counter *counter) +static void pmc_generic_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = x86_perf_counter_enable, - .hw_perf_counter_disable = x86_perf_counter_disable, - .hw_perf_counter_read = x86_perf_counter_read, + .hw_perf_counter_enable = pmc_generic_enable, + .hw_perf_counter_disable = pmc_generic_disable, + .hw_perf_counter_read = pmc_generic_read, }; const struct hw_perf_counter_ops * diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d038450de87..984da540224 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 703e937c83bbad79075a7846e062e447c2fee6a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 10:51:15 +0100 Subject: perfcounters: add fixed-mode PMC enumeration Enumerate fixed-mode PMCs based on CPUID, and feed that into the perfcounter code. Does not use fixed-mode PMCs yet. 
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 23 +++++++++++++++++++++++ arch/x86/kernel/cpu/perf_counter.c | 23 +++++++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index dd5a4a559e2..945a315e6d6 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -41,6 +41,29 @@ union cpuid10_eax { unsigned int full; }; +union cpuid10_edx { + struct { + unsigned int num_counters_fixed:4; + unsigned int reserved:28; + } split; + unsigned int full; +}; + + +/* + * Fixed-purpose performance counters: + */ + +/* Instr_Retired.Any: */ +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 + +/* CPU_CLK_Unhalted.Core: */ +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a + +/* CPU_CLK_Unhalted.Ref: */ +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b + + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc3af868823..2fca50c4597 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; +static int nr_hw_counters_fixed __read_mostly; + struct cpu_hw_counters { struct perf_counter *generic[X86_PMC_MAX_GENERIC]; unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; @@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { void __init init_hw_perf_counters(void) { union cpuid10_eax eax; - unsigned int unused; unsigned int ebx; + unsigned int unused; + union cpuid10_edx edx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); + cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; if (nr_hw_counters > X86_PMC_MAX_GENERIC) { nr_hw_counters = X86_PMC_MAX_GENERIC; @@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; - printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + + nr_hw_counters_fixed = edx.split.num_counters_fixed; + if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { + nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", + nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + } + printk(KERN_INFO "... 
fixed counters: %d\n", nr_hw_counters_fixed); perf_counters_initialized = true; -- cgit v1.2.3-70-g09d2 From 862a1a5f346fe7e9181ea51eaae48cf2cd70f746 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 13:09:20 +0100 Subject: x86, perfcounters: refactor code for fixed-function PMCs Impact: clean up Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 14 ++++++- arch/x86/kernel/cpu/perf_counter.c | 73 ++++++++++++++++++++----------------- 2 files changed, 52 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 945a315e6d6..13745deb16c 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -8,6 +8,10 @@ #define X86_PMC_MAX_GENERIC 8 #define X86_PMC_MAX_FIXED 3 +#define X86_PMC_IDX_GENERIC 0 +#define X86_PMC_IDX_FIXED 32 +#define X86_PMC_IDX_MAX 64 + #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -54,6 +58,15 @@ union cpuid10_edx { * Fixed-purpose performance counters: */ +/* + * All 3 fixed-mode PMCs are configured via this single MSR: + */ +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d + +/* + * The counts are available in three separate MSRs: + */ + /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 @@ -63,7 +76,6 @@ union cpuid10_edx { /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b - #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2fca50c4597..358af526640 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly; /* * Number of (generic) HW counters: */ -static int nr_hw_counters __read_mostly; -static u32 perf_counter_mask __read_mostly; +static int nr_counters_generic __read_mostly; +static u64 perf_counter_mask __read_mostly; -static int nr_hw_counters_fixed __read_mostly; +static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { - struct perf_counter *generic[X86_PMC_MAX_GENERIC]; - unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; - - struct perf_counter *fixed[X86_PMC_MAX_FIXED]; - unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; + struct perf_counter *counters[X86_PMC_IDX_MAX]; + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; /* @@ -159,7 +156,7 @@ void hw_perf_enable_all(void) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); } u64 hw_perf_save_disable(void) @@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void) return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } @@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +static int fixed_mode_idx(struct hw_perf_counter *hwc) +{ + return -1; +} + /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ @@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the 
previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); - cpuc->generic[idx] = counter; + cpuc->counters[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -270,7 +272,7 @@ void perf_counter_print_debug(void) u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; - if (!nr_hw_counters) + if (!nr_counters_generic) return; local_irq_disable(); @@ -286,7 +288,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - for (idx = 0; idx < nr_hw_counters; idx++) { + for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); @@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->generic[idx] = NULL; + cpuc->counters[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -424,7 +426,7 @@ again: } } - wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); /* * Repeat if there is more work to be done: @@ -436,7 +438,7 @@ out: /* * Restore - do not reenable when global enable is off: */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); } void smp_perf_counter_interrupt(struct pt_regs *regs) @@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { + struct perf_counter *counter = cpuc->counters[bit]; if (!counter) continue; @@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... 
num counters: %d\n", eax.split.num_counters); - nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > X86_PMC_MAX_GENERIC) { - nr_hw_counters = X86_PMC_MAX_GENERIC; + nr_counters_generic = eax.split.num_counters; + if (nr_counters_generic > X86_PMC_MAX_GENERIC) { + nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, X86_PMC_MAX_GENERIC); + nr_counters_generic, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_hw_counters) - 1; - perf_max_counters = nr_hw_counters; + perf_counter_mask = (1 << nr_counters_generic) - 1; + perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - nr_hw_counters_fixed = edx.split.num_counters_fixed; - if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { - nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + nr_counters_fixed = edx.split.num_counters_fixed; + if (nr_counters_fixed > X86_PMC_MAX_FIXED) { + nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); + printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + + perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); -- cgit v1.2.3-70-g09d2 From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: perfcounters: hw ops rename Impact: rename field names Shorten them. 
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++--- include/linux/perf_counter.h | 6 ++--- kernel/perf_counter.c | 50 +++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 358af526640..b6755712142 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = pmc_generic_enable, - .hw_perf_counter_disable = pmc_generic_disable, - .hw_perf_counter_read = pmc_generic_read, + .enable = pmc_generic_enable, + .disable = pmc_generic_disable, + .read = pmc_generic_read, }; const struct hw_perf_counter_ops * diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 984da540224..48f76d2e54c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,9 +128,9 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*hw_perf_counter_enable) (struct perf_counter *counter); - void (*hw_perf_counter_disable) (struct perf_counter *counter); - void (*hw_perf_counter_read) (struct perf_counter *counter); + void (*enable) (struct perf_counter *counter); + void (*disable) (struct perf_counter *counter); + void (*read) (struct perf_counter *counter); }; /** diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f8a4d9a5d5d..961d651aa57 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -109,7 +109,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; @@ -226,7 +226,7 @@ static void __perf_install_in_context(void *info) counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -297,7 +297,7 @@ counter_sched_out(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; - counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; @@ -327,7 +327,7 @@ group_sched_out(struct perf_counter *group_counter, * * We stop each counter and update the counter value in counter->count. * - * This does not protect us against NMI, but hw_perf_counter_disable() + * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * not restart the counter. @@ -359,7 +359,7 @@ counter_sched_in(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_OFF) return; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -395,7 +395,7 @@ group_sched_in(struct perf_counter *group_counter, * * We restore the counter value and then enable it. 
* - * This does not protect us against NMI, but hw_perf_counter_enable() + * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * keep the counter running. @@ -537,11 +537,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Cross CPU call to read the hardware counter */ -static void __hw_perf_counter_read(void *info) +static void __read(void *info) { struct perf_counter *counter = info; - counter->hw_ops->hw_perf_counter_read(counter); + counter->hw_ops->read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -552,7 +552,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __hw_perf_counter_read, counter, 1); + __read, counter, 1); } return atomic64_read(&counter->count); @@ -855,9 +855,9 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { - .hw_perf_counter_enable = cpu_clock_perf_counter_enable, - .hw_perf_counter_disable = cpu_clock_perf_counter_disable, - .hw_perf_counter_read = cpu_clock_perf_counter_read, + .enable = cpu_clock_perf_counter_enable, + .disable = cpu_clock_perf_counter_disable, + .read = cpu_clock_perf_counter_read, }; static void task_clock_perf_counter_update(struct perf_counter *counter) @@ -891,9 +891,9 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_task_clock = { - .hw_perf_counter_enable = task_clock_perf_counter_enable, - .hw_perf_counter_disable = task_clock_perf_counter_disable, - .hw_perf_counter_read = task_clock_perf_counter_read, + .enable = task_clock_perf_counter_enable, + .disable = task_clock_perf_counter_disable, + .read = task_clock_perf_counter_read, }; static u64 get_page_faults(void) @@ -937,9 +937,9 @@ static void page_faults_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_page_faults = { - .hw_perf_counter_enable = page_faults_perf_counter_enable, - .hw_perf_counter_disable = page_faults_perf_counter_disable, - .hw_perf_counter_read = page_faults_perf_counter_read, + .enable = page_faults_perf_counter_enable, + .disable = page_faults_perf_counter_disable, + .read = page_faults_perf_counter_read, }; static u64 get_context_switches(void) @@ -983,9 +983,9 @@ static void context_switches_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_context_switches = { - .hw_perf_counter_enable = context_switches_perf_counter_enable, - .hw_perf_counter_disable = context_switches_perf_counter_disable, - .hw_perf_counter_read = context_switches_perf_counter_read, + .enable = context_switches_perf_counter_enable, + .disable = context_switches_perf_counter_disable, + .read = context_switches_perf_counter_read, }; static inline u64 get_cpu_migrations(void) @@ -1027,9 +1027,9 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { - .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, - .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, - .hw_perf_counter_read = cpu_migrations_perf_counter_read, + .enable = cpu_migrations_perf_counter_enable, + .disable = cpu_migrations_perf_counter_disable, + .read = 
cpu_migrations_perf_counter_read, }; static const struct hw_perf_counter_ops * @@ -1283,7 +1283,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - child_counter->hw_ops->hw_perf_counter_disable(child_counter); + child_counter->hw_ops->disable(child_counter); child_counter->state = PERF_COUNTER_STATE_INACTIVE; child_counter->oncpu = -1; -- cgit v1.2.3-70-g09d2 From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: perfcounters: enable lowlevel pmc code to schedule counters Allow lowlevel ->enable() op to return an error if a counter can not be added. This can be used to handle counter constraints. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 62 +++++++++++++++++++++++++++----------- 3 files changed, 51 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b6755712142..74090a393a7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void pmc_generic_enable(struct perf_counter *counter) +static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); + + return 0; } void perf_counter_print_debug(void) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48f76d2e54c..53af11d3767 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,7 +128,7 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*enable) (struct perf_counter *counter); + int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f1110ac1267..2e73929a695 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -355,21 +355,25 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) cpuctx->task_ctx = NULL; } -static void +static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { if (counter->state == PERF_COUNTER_STATE_OFF) - return; + return 0; + + if (counter->hw_ops->enable(counter)) + return -EAGAIN; - counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; ctx->nr_active++; + + return 0; } static int @@ -378,20 +382,38 @@ group_sched_in(struct perf_counter *group_counter, struct perf_counter_context *ctx, int cpu) { - struct perf_counter *counter; - int was_group = 0; + 
struct perf_counter *counter, *partial_group; + int ret = 0; - counter_sched_in(group_counter, cpuctx, ctx, cpu); + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; /* * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter_sched_in(counter, cpuctx, ctx, cpu); - was_group = 1; + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + ret = -EAGAIN; } - return was_group; + return ret; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if (counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; } /* @@ -416,9 +438,6 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (ctx->nr_active == cpuctx->max_pertask) - break; - /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -856,8 +875,9 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; -static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { + return 0; } static void cpu_clock_perf_counter_disable(struct perf_counter *counter) @@ -913,11 +933,13 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, now); } -static void task_clock_perf_counter_enable(struct perf_counter *counter) +static int task_clock_perf_counter_enable(struct perf_counter *counter) { u64 now = task_clock_perf_counter_val(counter, 0); atomic64_set(&counter->hw.prev_count, now); + + return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) @@ -960,12 +982,14 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) page_faults_perf_counter_update(counter); } -static void page_faults_perf_counter_enable(struct perf_counter *counter) +static int page_faults_perf_counter_enable(struct perf_counter *counter) { /* * page-faults is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void page_faults_perf_counter_disable(struct perf_counter *counter) @@ -1006,12 +1030,14 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) context_switches_perf_counter_update(counter); } -static void context_switches_perf_counter_enable(struct perf_counter *counter) +static int context_switches_perf_counter_enable(struct perf_counter *counter) { /* * ->nvcsw + curr->nivcsw is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void context_switches_perf_counter_disable(struct perf_counter *counter) @@ -1050,12 +1076,14 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { /* * se.nr_migrations is a per-task value already, * so we dont have to clear it on switch-in. 
*/ + + return 0; } static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -- cgit v1.2.3-70-g09d2 From 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:28:12 +0100 Subject: x86, perfcounters: print out the ->used bitmask Impact: extend debug printouts Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 74090a393a7..f3359c2b391 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter) idx = find_first_zero_bit(cpuc->used, nr_counters_generic); if (idx == nr_counters_generic) return -EAGAIN; + set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + struct cpu_hw_counters *cpuc; int cpu, idx; if (!nr_counters_generic) @@ -282,6 +284,7 @@ void perf_counter_print_debug(void) local_irq_disable(); cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -291,6 +294,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); -- cgit v1.2.3-70-g09d2 From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:17:29 +0100 Subject: perfcounters: add PERF_COUNT_BUS_CYCLES Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref on x86. 
(which is a good enough approximation) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- include/linux/perf_counter.h | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f3359c2b391..86b2fdd344a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); static const int intel_perfmon_event_map[] = { - [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, [PERF_COUNT_CACHE_MISSES] = 0x412e, [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_BUS_CYCLES] = 0x013c, }; static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1ea08e9f31c..ec77d1643d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -36,14 +36,15 @@ enum hw_event_types { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CYCLES = 0, + PERF_COUNT_CPU_CYCLES = 0, PERF_COUNT_INSTRUCTIONS = 1, PERF_COUNT_CACHE_REFERENCES = 2, PERF_COUNT_CACHE_MISSES = 3, PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 6, + PERF_HW_EVENTS_MAX = 7, /* * Special "software" counters provided by the kernel, even if -- cgit v1.2.3-70-g09d2 From 2f18d1e8d07ae67dd0afce875287756d4bd31a46 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Dec 2008 11:10:42 +0100 Subject: x86, perfcounters: add support for fixed-function pmcs Impact: extend performance counter support on x86 Intel CPUs Modern Intel CPUs have 3 "fixed-function" performance counters, which count these hardware events: Instr_Retired.Any CPU_CLK_Unhalted.Core CPU_CLK_Unhalted.Ref Add support for them to the performance counters subsystem. Their use is transparent to user-space: the counter scheduler is extended to automatically recognize the cases where a fixed-function PMC can be utilized instead of a generic PMC. In such cases the generic PMC is kept available for more counters. The above fixed-function events map to these generic counter hw events: PERF_COUNT_INSTRUCTIONS PERF_COUNT_CPU_CYCLES PERF_COUNT_BUS_CYCLES (The 'bus' cycles are in reality often CPU-ish cycles, just with a fixed frequency.) 
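Roughly, the scheduling decision added below reduces to a lookup on the architectural event code. A condensed, standalone C sketch of that idea (illustration only, not the patch code: sketch_fixed_idx() and the IDX_FIXED_* names are made up here; the event codes are the ones from intel_perfmon_event_map):

/*
 * Only three architectural events can be steered to a fixed-function
 * counter; NMI-sampling counters and all other events fall back to a
 * generic PMC.
 */
enum { IDX_FIXED_INSTRUCTIONS, IDX_FIXED_CPU_CYCLES, IDX_FIXED_BUS_CYCLES };

static int sketch_fixed_idx(unsigned int event_code, int want_nmi)
{
	if (want_nmi)		/* fixed PMCs are not used for NMI sampling */
		return -1;

	switch (event_code) {
	case 0x00c0: return IDX_FIXED_INSTRUCTIONS;	/* Instr_Retired.Any     */
	case 0x003c: return IDX_FIXED_CPU_CYCLES;	/* CPU_CLK_Unhalted.Core */
	case 0x013c: return IDX_FIXED_BUS_CYCLES;	/* CPU_CLK_Unhalted.Ref  */
	}
	return -1;					/* use a generic PMC     */
}

If the chosen fixed slot is already occupied, the counter scheduler below simply falls back to the generic-PMC allocation path.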
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 8 ++ arch/x86/kernel/cpu/perf_counter.c | 149 ++++++++++++++++++++++++++++++------ 2 files changed, 133 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 13745deb16c..2e08ed73664 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -23,6 +23,11 @@ #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) #define ARCH_PERFMON_EVENTSEL_USR (1 << 16) +/* + * Includes eventsel and unit mask as well: + */ +#define ARCH_PERFMON_EVENT_MASK 0xffff + #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 @@ -69,12 +74,15 @@ union cpuid10_edx { /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 86b2fdd344a..da46eca1254 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -26,6 +26,7 @@ static bool perf_counters_initialized __read_mostly; */ static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; +static u64 counter_value_mask __read_mostly; static int nr_counters_fixed __read_mostly; @@ -120,9 +121,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, @@ -183,16 +181,34 @@ void hw_perf_restore(u64 ctrl) } EXPORT_SYMBOL_GPL(hw_perf_restore); +static inline void +__pmc_fixed_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + int err; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_disable(counter, hwc, idx); + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); /* * Set the next IRQ period, based on the hwc->period_left value. 
@@ -202,8 +218,9 @@ static void __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - s32 left = atomic64_read(&hwc->period_left); + s64 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; + int err; /* * If we are way outside a reasoable range then just skip forward: @@ -224,21 +241,64 @@ __hw_perf_counter_set_period(struct perf_counter *counter, * The hw counter starts counting from this counter offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)(s64)-left); + atomic64_set(&hwc->prev_count, (u64)-left); - wrmsr(hwc->counter_base + idx, -left, 0); + err = checking_wrmsrl(hwc->counter_base + idx, + (u64)(-left) & counter_value_mask); +} + +static inline void +__pmc_fixed_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8) and ring-3 counting (0x2), + * and enable ring-0 counting if allowed: + */ + bits = 0x8ULL | 0x2ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); } static void __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_enable(counter, hwc, idx); + wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -static int fixed_mode_idx(struct hw_perf_counter *hwc) +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { + unsigned int event; + + if (unlikely(hwc->nmi)) + return -1; + + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + return -1; } @@ -249,16 +309,39 @@ static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; + int idx; - /* Try to get the previous counter again */ - if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) - return -EAGAIN; + idx = fixed_mode_idx(counter, hwc); + if (idx >= 0) { + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used)) + goto try_generic; - set_bit(idx, cpuc->used); + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... 
CTR2: + */ + hwc->counter_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic counter again */ + if (test_and_set_bit(idx, cpuc->used)) { +try_generic: + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; + + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; } perf_counters_lapic_init(hwc->nmi); @@ -266,6 +349,10 @@ static int pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); cpuc->counters[idx] = counter; + /* + * Make it visible before enabling the hw: + */ + smp_wmb(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -275,7 +362,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; int cpu, idx; @@ -290,11 +377,13 @@ void perf_counter_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -303,13 +392,19 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } + for (idx = 0; idx < nr_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } local_irq_enable(); } @@ -323,6 +418,11 @@ static void pmc_generic_disable(struct perf_counter *counter) clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the counter: + */ + smp_wmb(); /* * Drain the remaining delta count out of a counter @@ -353,14 +453,11 @@ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; - u64 pmc_ctrl; - - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); x86_perf_counter_update(counter, hwc, idx); __hw_perf_counter_set_period(counter, hwc, idx); - if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (counter->state == PERF_COUNTER_STATE_ACTIVE) __pmc_generic_enable(counter, hwc, idx); } @@ -373,6 +470,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); 
perf_store_irq_data(sibling, counter->hw_event.type); perf_store_irq_data(sibling, atomic64_read(&counter->count)); @@ -403,7 +501,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); @@ -561,6 +659,9 @@ void __init init_hw_perf_counters(void) perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); nr_counters_fixed = edx.split.num_counters_fixed; -- cgit v1.2.3-70-g09d2 From 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 27 Dec 2008 19:15:43 +0530 Subject: x86: perf_counter remove unwanted hw_perf_enable_all Impact: clean, reduce kernel size a bit, avoid sparse warnings Fixes sparse warnings: arch/x86/kernel/cpu/perf_counter.c:153:6: warning: symbol 'hw_perf_enable_all' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:279:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da46eca1254..9376771f757 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -150,14 +150,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -void hw_perf_enable_all(void) -{ - if (unlikely(!perf_counters_initialized)) - return; - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); -} - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -200,12 +192,10 @@ static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { - int err; - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_disable(counter, hwc, idx); - - err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + __pmc_fixed_disable(counter, hwc, idx); + else + wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -276,10 +266,10 @@ __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_enable(counter, hwc, idx); - - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __pmc_fixed_enable(counter, hwc, idx); + else + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } static int -- cgit v1.2.3-70-g09d2 From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 10:13:01 +0100 Subject: perfcounters: throttle on too high IRQ rates Starting kerneltop with only -c 100 seems to be a bad idea, it can easily lock the system due to perfcounter IRQ overload. 
So add throttling: if a new IRQ arrives in a shorter than PERFMON_MIN_PERIOD_NS time, turn off perfcounters and untrottle them from the next timer tick. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++++++++------ include/linux/perf_counter.h | 4 ++++ 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 7b434e5b14c..849c23009bf 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); + + perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9376771f757..1a040b179b5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 last_interrupt; + u64 global_enable; + int throttled; }; /* @@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, saved_global; - struct cpu_hw_counters *cpuc; + u64 ack, status, now; + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); /* Disable counters globally */ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + now = sched_clock(); + if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) + cpuc->throttled = 1; + cpuc->last_interrupt = now; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) @@ -533,9 +539,29 @@ again: goto again; out: /* - * Restore - do not reenable when global enable is off: + * Restore - do not reenable when global enable is off or throttled: */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + if (!cpuc->throttled) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); +} + +void perf_counter_unthrottle(void) +{ + struct cpu_hw_counters *cpuc; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + if (unlikely(!perf_counters_initialized)) + return; + + cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + if (cpuc->throttled) { + if (printk_ratelimit()) + printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->throttled = 0; + } } void smp_perf_counter_interrupt(struct pt_regs *regs) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 33ba9fe0a78..91f1ca4c01c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -254,6 +254,7 @@ extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); @@ -270,6 +271,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && 
counter->hw_event.type < 0; } +#define PERFMON_MIN_PERIOD_NS 10000 + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -281,6 +284,7 @@ static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:36:16 +0100 Subject: perfcounters: ratelimit performance counter interrupts Ratelimit performance counter interrupts to 100KHz per CPU. This replaces the irq-delta-time based method. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------ include/linux/perf_counter.h | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1a040b179b5..a56d4cf92f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 last_interrupt; + unsigned long interrupts; u64 global_enable; - int throttled; }; /* @@ -470,6 +469,11 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) } } +/* + * Maximum interrupt frequency of 100KHz per CPU + */ +#define PERFMON_MAX_INTERRUPTS 100000/HZ + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -477,7 +481,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, now; + u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); @@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - now = sched_clock(); - if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) - cpuc->throttled = 1; - cpuc->last_interrupt = now; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) goto out; @@ -541,13 +540,14 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (!cpuc->throttled) + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; + u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -556,12 +556,15 @@ void perf_counter_unthrottle(void) return; cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); - if (cpuc->throttled) { + if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - cpuc->throttled = 0; 
} + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); + if (unlikely(cpuc->global_enable && !global_enable)) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 91f1ca4c01c..f55381fbcac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -271,8 +271,6 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } -#define PERFMON_MIN_PERIOD_NS 10000 - #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -- cgit v1.2.3-70-g09d2 From 3415dd9146c574bffe8f012c096bfc2bc62b9508 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:16:53 +0100 Subject: perfcounters fix section mismatch warning in perf_counter.c::perf_counters_lapic_init() Fix: WARNING: arch/x86/kernel/built-in.o(.text+0xdd0f): Section mismatch in reference from the function pmc_generic_enable() to the function .cpuinit.text:perf_counters_lapic_init() The function pmc_generic_enable() references the function __cpuinit perf_counters_lapic_init(). This is often because pmc_generic_enable lacks a __cpuinit annotation or the annotation of perf_counters_lapic_init is wrong. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a56d4cf92f3..46c436cdd73 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -605,7 +605,7 @@ void perf_counter_notify(struct pt_regs *regs) local_irq_restore(flags); } -void __cpuinit perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(int nmi) { u32 apic_val; -- cgit v1.2.3-70-g09d2 From bb3f0b59ad005d2d2ecbbe9bd048eab6d1ecbd31 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 25 Jan 2009 02:38:09 -0800 Subject: x86: make irqinit_32.c more like irqinit_64.c, v2 Impact: cleanup 1. add smp_intr_init and apic_intr_init for 32bit, the same as 64bit 2. move the apic_intr_init() call before set gate with interrupt[i] 3. for 64bit, if ia32_emulation is not used, will make per_cpu to use 0x80 vector. [ v2: should use !test_bit() instead of test_bit() with 32bit ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 56 ++++++++++++++++++++++++++------------------ arch/x86/kernel/irqinit_64.c | 7 +++--- arch/x86/kernel/traps.c | 15 +++++------- 3 files changed, 43 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index c56496f8c6f..ddf3eb72f86 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -120,28 +120,8 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init smp_intr_init(void) { - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. 
*/ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); - } - - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -170,8 +150,13 @@ void __init native_init_IRQ(void) set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +} +static void __init apic_intr_init(void) +{ #ifdef CONFIG_X86_LOCAL_APIC + smp_intr_init(); + /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -181,12 +166,37 @@ void __init native_init_IRQ(void) # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); # endif -#endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) +# ifdef CONFIG_X86_MCE_P4THERMAL /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +# endif #endif +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 6a71bfc51e5..16e1fc68750 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -162,6 +162,9 @@ void __init native_init_IRQ(void) int i; init_ISA_irqs(); + + apic_intr_init(); + /* * Cover the whole vector space, no vector can escape * us. 
(some of these will be overridden and become @@ -169,12 +172,10 @@ void __init native_init_IRQ(void) */ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) + if (!test_bit(vector, used_vectors)) set_intr_gate(vector, interrupt[i]); } - apic_intr_init(); - if (!acpi_ioapic) setup_irq(2, &irq2); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ed5aee5f3fc..d36a502d87a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -979,8 +979,13 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 @@ -997,17 +1002,9 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); -#endif - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_X86_64 - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else set_bit(SYSCALL_VECTOR, used_vectors); #endif + /* * Should be a barrier for any external CPU state: */ -- cgit v1.2.3-70-g09d2 From 15081c61362618a0c81cc8d04e45e7427bc1ed71 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 1 Feb 2009 22:07:39 +0530 Subject: x86: irqinit_32.c fix compilation warning Fix: arch/x86/kernel/irqinit_32.c:124: warning: 'smp_intr_init' defined but not used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ddf3eb72f86..520e6c1c5d2 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -154,9 +154,9 @@ static void __init smp_intr_init(void) static void __init apic_intr_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC smp_intr_init(); +#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); -- cgit v1.2.3-70-g09d2 From 5b75af0a02fcf3b8899f38ff6f22164c5d8e2fdd Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Feb 2009 17:11:34 +0100 Subject: perfcounters: fix "perf counters kill oprofile" bug With oprofile as a module, and unloaded by profiling script, both oprofile and kerneltop work fine.. unless you leave kerneltop running when you start profiling, then you may see badness. 
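The fix below works by adjusting die-notifier priorities: handlers on the NMI notifier chain are called in descending .priority order, and a handler can claim the NMI by returning NOTIFY_STOP so that lower-priority handlers never see it. A minimal kernel-style fragment of that pattern (illustration only; borrow_nmi() and borrow_nmi_nb are hypothetical names, and registration would typically go through register_die_notifier()):

#include <linux/notifier.h>
#include <linux/kdebug.h>

static int borrow_nmi(struct notifier_block *nb, unsigned long cmd, void *data)
{
	if (cmd != DIE_NMI)
		return NOTIFY_DONE;

	/* ... handle the event that raised this NMI ... */

	return NOTIFY_STOP;	/* NMI consumed: the chain stops here */
}

static struct notifier_block borrow_nmi_nb = {
	.notifier_call	= borrow_nmi,
	.next		= NULL,
	.priority	= 2,	/* called ahead of a .priority = 1 handler */
};

With oprofile's notifier at priority 2 and the perf counter one at priority 1, oprofile gets first shot at the NMI for as long as its handler is registered.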
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 +++- arch/x86/oprofile/nmi_int.c | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46c436cdd73..8bb213323fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -643,7 +643,9 @@ perf_counter_nmi_handler(struct notifier_block *self, } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler + .notifier_call = perf_counter_nmi_handler, + .next = NULL, + .priority = 1 }; void __init init_hw_perf_counters(void) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 202864ad49a..c638685136e 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self, switch (val) { case DIE_NMI: - if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) - ret = NOTIFY_STOP; + case DIE_NMI_IPI: + model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); + ret = NOTIFY_STOP; break; default: break; @@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy) static struct notifier_block profile_exceptions_nb = { .notifier_call = profile_exceptions_notify, .next = NULL, - .priority = 0 + .priority = 2 }; static int nmi_setup(void) -- cgit v1.2.3-70-g09d2 From 82aa9a1829199233f9bdaf26e2ee271114f4701e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Feb 2009 15:23:08 +0100 Subject: perfcounters: fix "perf counters kills oprofile" bug, v2 Impact: fix kernel crash Both oprofile and perfcounters register an NMI die handler, but only one can handle the NMI. Conveniently, oprofile unregisters it's notifier when not actively in use, so setting it's notifier priority higher than perfcounter's allows oprofile to borrow the NMI for the duration of it's run. Tested/works both as module and built-in. While testing, I found that if kerneltop was generating NMIs at very high frequency, the kernel may panic when oprofile registered it's handler. This turned out to be because oprofile registers it's handler before reset_value has been allocated, so if an NMI comes in while it's still setting up, kabOom. Rather than try more invasive changes, I followed the lead of other places in op_model_ppro.c, and simply returned in that highly unlikely event. (debug warnings attached) Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_ppro.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 07c914555a5..85eb6268374 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -126,6 +126,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs, u64 val; int i; + /* + * This can happen if perf counters are in use when + * we steal the die notifier NMI. 
+ */ + if (unlikely(!reset_value)) + goto out; + for (i = 0 ; i < num_counters; ++i) { if (!reset_value[i]) continue; @@ -136,6 +143,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, } } +out: /* Only P6 based Pentium M need to re-unmask the apic vector but it * doesn't hurt other P6 variant */ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); -- cgit v1.2.3-70-g09d2 From d278c48435625cb6b7edcf6a547620768b175709 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 9 Feb 2009 07:38:50 +0100 Subject: perf_counters: account NMI interrupts I noticed that kerneltop interrupts were accounted as NMI, but not their perf counter origin. Account NMI performance counter interrupts. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8bb213323fe..9901e46998d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -495,6 +495,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) goto out; again: + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; @@ -570,7 +571,6 @@ void perf_counter_unthrottle(void) void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); - inc_irq_stat(apic_perf_irqs); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); __smp_perf_counter_interrupt(regs, 0); -- cgit v1.2.3-70-g09d2 From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. 
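On x86 the new bits map directly onto the event-select MSR, which has separate ring-3 and ring-0 enable bits. A condensed standalone sketch of that mapping (illustration only: sketch_eventsel() and the EVTSEL_* macros are local stand-ins for the ARCH_PERFMON_EVENTSEL_* definitions):

#include <stdint.h>

#define EVTSEL_INT (1UL << 20)	/* generate a PMC interrupt on overflow */
#define EVTSEL_OS  (1UL << 17)	/* count in ring 0 (kernel)             */
#define EVTSEL_USR (1UL << 16)	/* count in ring 3 (user)               */

/*
 * exclude_user / exclude_kernel decide whether the USR / OS enable
 * bits are set; exclude_hv is ignored since there is no equivalent
 * control bit on x86.
 */
static uint64_t sketch_eventsel(int exclude_user, int exclude_kernel)
{
	uint64_t config = EVTSEL_INT;

	if (!exclude_user)
		config |= EVTSEL_USR;
	if (!exclude_kernel)
		config |= EVTSEL_OS;

	return config;
}

The fixed-function counters get the analogous treatment in __pmc_fixed_enable(), where the per-counter 0x2 (ring 3) and 0x1 (ring 0) control bits are derived from the same two flags.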
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 68 ++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++------- include/linux/perf_counter.h | 19 ++++++----- kernel/perf_counter.c | 26 ++++++++++++--- 4 files changed, 117 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5b0211348c7..bd6ba85beb5 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -16,6 +16,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev) return 0; } +/* + * Check if newly-added counters have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added counters. + */ +static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) +{ + int eu, ek, eh; + int i, n; + struct perf_counter *counter; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + eu = ctrs[0]->hw_event.exclude_user; + ek = ctrs[0]->hw_event.exclude_kernel; + eh = ctrs[0]->hw_event.exclude_hv; + if (n_prev == 0) + n_prev = 1; + for (i = n_prev; i < n; ++i) { + counter = ctrs[i]; + if (counter->hw_event.exclude_user != eu || + counter->hw_event.exclude_kernel != ek || + counter->hw_event.exclude_hv != eh) + return -EAGAIN; + } + return 0; +} + static void power_perf_read(struct perf_counter *counter) { long val, delta, prev; @@ -323,6 +354,20 @@ void hw_perf_restore(u64 disable) goto out; } + /* + * Add in MMCR0 freeze bits corresponding to the + * hw_event.exclude_* bits for the first counter. + * We have already checked that all counters have the + * same values for these bits as the first counter. + */ + counter = cpuhw->counter[0]; + if (counter->hw_event.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (counter->hw_event.exclude_kernel) + cpuhw->mmcr[0] |= MMCR0_FCS; + if (counter->hw_event.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + /* * Write the new configuration to MMCR* with the freeze * bit set and set the hardware counters to their initial values. @@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, &cpuhw->counter[n0], &cpuhw->events[n0]); if (n < 0) return -EAGAIN; + if (check_excludes(cpuhw->counter, n0, n)) + return -EAGAIN; if (power_check_constraints(cpuhw->events, n + n0)) return -EAGAIN; cpuhw->n_counters = n0 + n; @@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter) goto out; cpuhw->counter[n0] = counter; cpuhw->events[n0] = counter->hw.config; + if (check_excludes(cpuhw->counter, n0, 1)) + goto out; if (power_check_constraints(cpuhw->events, n0 + 1)) goto out; @@ -554,6 +603,17 @@ hw_perf_counter_init(struct perf_counter *counter) counter->hw.config_base = ev; counter->hw.idx = 0; + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. This also means that we don't + * set the MMCR0_FCHV bit, which unconditionally freezes + * the counters on the PPC970 variants used in Apple G5 + * machines (since MSR.HV is always 1 on those machines). + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + counter->hw_event.exclude_hv = 0; + /* * If this is in a group, check if it can go on with all the * other hardware counters in the group. 
We assume the counter @@ -566,11 +626,13 @@ hw_perf_counter_init(struct perf_counter *counter) if (n < 0) return NULL; } - events[n++] = ev; - if (power_check_constraints(events, n)) + events[n] = ev; + if (check_excludes(ctrs, n, 1)) + return NULL; + if (power_check_constraints(events, n + 1)) return NULL; - counter->hw.config = events[n - 1]; + counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); return &power_perf_ops; } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9901e46998d..383d4c6423a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EINVAL; /* - * Count user events, and generate PMC IRQs: + * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ - hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + hwc->config = ARCH_PERFMON_EVENTSEL_INT; /* - * If privileged enough, count OS events too, and allow - * NMI events as well: + * Count user and OS events unless requested not to. */ - hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN)) { + if (!hw_event->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event->nmi) - hwc->nmi = 1; - } + + /* + * If privileged enough, allow NMI events: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; /* @@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter, int err; /* - * Enable IRQ generation (0x8) and ring-3 counting (0x2), - * and enable ring-0 counting if allowed: + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: */ - bits = 0x8ULL | 0x2ULL; + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; bits <<= (idx * 4); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f55381fbcac..c83f51d6e35 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,14 +83,17 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only counter on PMU */ - - __reserved_1 : 26; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + + __reserved_1 : 23; u64 __reserved_2; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 544193cbc47..89d5e3fe970 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1567,11 +1567,25 @@ sw_perf_counter_init(struct perf_counter *counter) { const struct hw_perf_counter_ops *hw_ops = NULL; + /* + * Software counters (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. 
+ */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv)) + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: + if (counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv) + break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -1582,13 +1596,17 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - hw_ops = &perf_ops_page_faults; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel)) + hw_ops = &perf_ops_page_faults; break; case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_context_switches; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_context_switches; break; case PERF_COUNT_CPU_MIGRATIONS: - hw_ops = &perf_ops_cpu_migrations; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_cpu_migrations; break; default: break; -- cgit v1.2.3-70-g09d2 From 73ca2f8380311115723c7afe811f3ed1f0ba945e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Feb 2009 01:08:17 +0100 Subject: perfcounters: remove duplicate definition of LOCAL_PERF_VECTOR Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index b66b518ff00..b07278c55e9 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -111,11 +111,6 @@ */ #define LOCAL_PERF_VECTOR 0xee -/* - * Performance monitoring interrupt vector: - */ -#define LOCAL_PERF_VECTOR 0xee - /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority -- cgit v1.2.3-70-g09d2 From b56a3802dc6df29aa27d2c12edf420258091ad66 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 18:09:09 +0530 Subject: x86: prepare perf_counter to add more cpus Introduced struct pmc_x86_ops to add more cpus. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 106 +++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 383d4c6423a..a3c88529bb7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -3,6 +3,7 @@ * * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * Copyright(C) 2009 Jaswinder Singh Rajput * * For licencing details see kernel-base/COPYING */ @@ -38,10 +39,24 @@ struct cpu_hw_counters { }; /* - * Intel PerfMon v3. Used on Core2 and later. + * struct pmc_x86_ops - performance counter x86 ops */ +struct pmc_x86_ops { + u64 (*save_disable_all) (void); + void (*restore_all) (u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map) (int event); + int max_events; +}; + +static struct pmc_x86_ops *pmc_ops; + static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +/* + * Intel PerfMon v3. Used on Core2 and later. 
+ */ static const int intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, @@ -53,7 +68,10 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static int pmc_intel_event_map(int event) +{ + return intel_perfmon_event_map[event]; +} /* * Propagate counter elapsed time into the generic counter. @@ -144,38 +162,48 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (hw_event->raw) { hwc->config |= hw_event->type; } else { - if (hw_event->type >= max_intel_perfmon_events) + if (hw_event->type >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event->type]; + hwc->config |= pmc_ops->event_map(hw_event->type); } counter->wakeup_pending = 0; return 0; } -u64 hw_perf_save_disable(void) +static u64 pmc_intel_save_disable_all(void) { u64 ctrl; - if (unlikely(!perf_counters_initialized)) - return 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } + +u64 hw_perf_save_disable(void) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->save_disable_all(); +} EXPORT_SYMBOL_GPL(hw_perf_save_disable); +static void pmc_intel_restore_all(u64 ctrl) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) return; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + pmc_ops->restore_all(ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -291,11 +319,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -339,8 +367,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = pmc_ops->eventsel; + hwc->counter_base = pmc_ops->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -386,8 +414,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); + rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); + rdmsrl(pmc_ops->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -655,29 +683,56 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -void __init init_hw_perf_counters(void) +static struct pmc_x86_ops pmc_intel_ops = { + .save_disable_all = pmc_intel_save_disable_all, + .restore_all = pmc_intel_restore_all, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = pmc_intel_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), +}; + +static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; unsigned int ebx; unsigned int unused; union 
cpuid10_edx edx; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return; + return NULL; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + nr_counters_generic = eax.split.num_counters; + nr_counters_fixed = edx.split.num_counters_fixed; + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + + return &pmc_intel_ops; +} + +void __init init_hw_perf_counters(void) +{ + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + pmc_ops = pmc_intel_init(); + break; + } + if (!pmc_ops) + return; + + printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -686,13 +741,8 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - counter_value_mask = (1ULL << eax.split.bit_width) - 1; printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - - nr_counters_fixed = edx.split.num_counters_fixed; if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", -- cgit v1.2.3-70-g09d2 From f87ad35d37fa543925210550f7db20a54c83ed70 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 20:15:14 +0530 Subject: x86: AMD Support for perf_counter Supported basic performance counter for AMD K7 and later: $ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null Performance counter stats for 'ls': 12.298610 task clock ticks (msecs) 3298477 CPU cycles (events) 1406354 instructions (events) 749035 cache references (events) 16939 cache misses (events) 100589 branches (events) 11159 branch misses (events) 7.627540 cpu clock ticks (msecs) 12.298610 task clock ticks (msecs) 500 pagefaults (events) 6 context switches (events) 3 CPU migrations (events) Wall-clock time elapsed: 8.672290 msecs Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ++ arch/x86/kernel/cpu/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5b80e..edcde52bd17 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -368,6 +368,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + /* Enable Performance counter for K7 and later */ + if (c->x86 > 6 && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a3c88529bb7..266618aa1a0 100644 --- 
a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -73,6 +73,24 @@ static int pmc_intel_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * AMD Performance Monitor K7 and later. + */ +static const int amd_perfmon_event_map[] = +{ + [PERF_COUNT_CPU_CYCLES] = 0x0076, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_CACHE_MISSES] = 0x0081, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +static int pmc_amd_event_map(int event) +{ + return amd_perfmon_event_map[event]; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. @@ -151,8 +169,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) + hwc->irq_period = 0x7FFFFFFF; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -184,6 +203,22 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } +static u64 pmc_amd_save_disable_all(void) +{ + int idx; + u64 val, ctrl = 0; + + for (idx = 0; idx < nr_counters_generic; idx++) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + ctrl |= (1 << idx); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + + return ctrl; +} + u64 hw_perf_save_disable(void) { if (unlikely(!perf_counters_initialized)) @@ -198,6 +233,20 @@ static void pmc_intel_restore_all(u64 ctrl) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } +static void pmc_amd_restore_all(u64 ctrl) +{ + u64 val; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + if (ctrl & (1 << idx)) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + } +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) @@ -314,6 +363,9 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -1; + if (unlikely(hwc->nmi)) return -1; @@ -401,6 +453,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -411,6 +464,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + } printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -588,6 +642,9 @@ void perf_counter_unthrottle(void) if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + if (unlikely(!perf_counters_initialized)) return; @@ -692,6 +749,15 @@ static struct pmc_x86_ops pmc_intel_ops = { .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; +static struct pmc_x86_ops pmc_amd_ops = { + .save_disable_all = pmc_amd_save_disable_all, + .restore_all = pmc_amd_restore_all, + .eventsel = 
MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = pmc_amd_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), +}; + static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; @@ -719,6 +785,16 @@ static struct pmc_x86_ops *pmc_intel_init(void) return &pmc_intel_ops; } +static struct pmc_x86_ops *pmc_amd_init(void) +{ + nr_counters_generic = 4; + nr_counters_fixed = 0; + + printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + + return &pmc_amd_ops; +} + void __init init_hw_perf_counters(void) { if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -728,6 +804,9 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); break; + case X86_VENDOR_AMD: + pmc_ops = pmc_amd_init(); + break; } if (!pmc_ops) return; -- cgit v1.2.3-70-g09d2 From 169e41eb7f5464c077a7e0e129f025759d04cc54 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:37:49 +0530 Subject: x86: decent declarations in perf_counter.c Impact: cleanup making decent declrations for struct pmc_x86_ops and fix checkpatch error: ERROR: Macros with complex values should be enclosed in parenthesis Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 266618aa1a0..a1f3646a3e8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -42,12 +42,12 @@ struct cpu_hw_counters { * struct pmc_x86_ops - performance counter x86 ops */ struct pmc_x86_ops { - u64 (*save_disable_all) (void); - void (*restore_all) (u64 ctrl); - unsigned eventsel; - unsigned perfctr; - int (*event_map) (int event); - int max_events; + u64 (*save_disable_all)(void); + void (*restore_all)(u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map)(int event); + int max_events; }; static struct pmc_x86_ops *pmc_ops; @@ -561,7 +561,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) /* * Maximum interrupt frequency of 100KHz per CPU */ -#define PERFMON_MAX_INTERRUPTS 100000/HZ +#define PERFMON_MAX_INTERRUPTS (100000/HZ) /* * This handler is triggered by the local APIC, so the APIC IRQ handling -- cgit v1.2.3-70-g09d2 From a1ef58f442542d8b3e3b963339fbc522c36e827c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:45:39 +0530 Subject: x86: use pr_info in perf_counter.c Impact: cleanup using pr_info in perf_counter.c fixes various 80 characters warnings and also indenting for conditional statement Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a1f3646a3e8..3b65f19a668 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -454,18 +454,18 @@ void perf_counter_print_debug(void) cpuc = &per_cpu(cpu_hw_counters, cpu); if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - printk(KERN_INFO "\n"); - printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, 
ctrl); - printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); - printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); @@ -473,17 +473,17 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for (idx = 0; idx < nr_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } local_irq_enable(); @@ -773,10 +773,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; - printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + pr_info("Intel Performance Monitoring support detected.\n"); + pr_info("... version: %d\n", eax.split.version_id); + pr_info("... bit width: %d\n", eax.split.bit_width); + pr_info("... mask length: %d\n", eax.split.mask_length); nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -790,7 +790,7 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + pr_info("AMD Performance Monitoring support detected.\n"); return &pmc_amd_ops; } @@ -811,7 +811,7 @@ void __init init_hw_perf_counters(void) if (!pmc_ops) return; - printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); + pr_info("... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -820,18 +820,18 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", counter_value_mask); if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + pr_info("... 
fixed counters: %d\n", nr_counters_fixed); perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; - printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); -- cgit v1.2.3-70-g09d2 From b0f3f28e0f14eb335f67bfaae33ce8b8d74fd58b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 18:08:27 +0100 Subject: perfcounters: IRQ and NMI support on AMD CPUs The below completes the K7+ performance counter support: - IRQ support - NMI support KernelTop output works now as well. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput Cc: Paul Mackerras LKML-Reference: <1236273633.5187.286.camel@laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 272 +++++++++++++++++++++++++++++++------ 1 file changed, 228 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3b65f19a668..6ebe9abf6ae 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,6 +28,7 @@ static bool perf_counters_initialized __read_mostly; static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; static u64 counter_value_mask __read_mostly; +static int counter_value_bits __read_mostly; static int nr_counters_fixed __read_mostly; @@ -35,7 +36,9 @@ struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 global_enable; + u64 throttle_ctrl; + u64 active_mask; + int enabled; }; /* @@ -43,21 +46,28 @@ struct cpu_hw_counters { */ struct pmc_x86_ops { u64 (*save_disable_all)(void); - void (*restore_all)(u64 ctrl); + void (*restore_all)(u64); + u64 (*get_status)(u64); + void (*ack_status)(u64); + void (*enable)(int, u64); + void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; - int (*event_map)(int event); + u64 (*event_map)(int); + u64 (*raw_event)(u64); int max_events; }; static struct pmc_x86_ops *pmc_ops; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { + .enabled = 1, +}; /* * Intel PerfMon v3. Used on Core2 and later. */ -static const int intel_perfmon_event_map[] = +static const u64 intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -68,15 +78,29 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static int pmc_intel_event_map(int event) +static u64 pmc_intel_event_map(int event) { return intel_perfmon_event_map[event]; } +static u64 pmc_intel_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FF +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_COUNTER_MASK) + + return event & CORE_EVNTSEL_MASK; +} + /* * AMD Performance Monitor K7 and later. 
*/ -static const int amd_perfmon_event_map[] = +static const u64 amd_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x0076, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -86,11 +110,25 @@ static const int amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static int pmc_amd_event_map(int event) +static u64 pmc_amd_event_map(int event) { return amd_perfmon_event_map[event]; } +static u64 pmc_amd_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FF +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_COUNTER_MASK) + + return event & K7_EVNTSEL_MASK; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. @@ -179,7 +217,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (hw_event->raw) { - hwc->config |= hw_event->type; + hwc->config |= pmc_ops->raw_event(hw_event->type); } else { if (hw_event->type >= pmc_ops->max_events) return -EINVAL; @@ -205,18 +243,24 @@ static u64 pmc_intel_save_disable_all(void) static u64 pmc_amd_save_disable_all(void) { - int idx; - u64 val, ctrl = 0; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int enabled, idx; + + enabled = cpuc->enabled; + cpuc->enabled = 0; + barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - ctrl |= (1 << idx); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } } - return ctrl; + return enabled; } u64 hw_perf_save_disable(void) @@ -226,6 +270,9 @@ u64 hw_perf_save_disable(void) return pmc_ops->save_disable_all(); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); static void pmc_intel_restore_all(u64 ctrl) @@ -235,11 +282,18 @@ static void pmc_intel_restore_all(u64 ctrl) static void pmc_amd_restore_all(u64 ctrl) { - u64 val; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; + cpuc->enabled = ctrl; + barrier(); + if (!ctrl) + return; + for (idx = 0; idx < nr_counters_generic; idx++) { - if (ctrl & (1 << idx)) { + if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -254,8 +308,112 @@ void hw_perf_restore(u64 ctrl) pmc_ops->restore_all(ctrl); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_restore); +static u64 pmc_intel_get_status(u64 mask) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static u64 pmc_amd_get_status(u64 mask) +{ + u64 status = 0; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + s64 val; + + if (!(mask & (1 << idx))) + continue; + + rdmsrl(MSR_K7_PERFCTR0 + idx, val); + val <<= (64 - counter_value_bits); + if (val >= 0) + status |= (1 << idx); + } + + return status; +} + +static u64 hw_perf_get_status(u64 mask) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->get_status(mask); +} + +static void pmc_intel_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static void pmc_amd_ack_status(u64 ack) +{ +} + +static void 
hw_perf_ack_status(u64 ack) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->ack_status(ack); +} + +static void pmc_intel_enable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, + config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static void pmc_amd_enable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + set_bit(idx, (unsigned long *)&cpuc->active_mask); + if (cpuc->enabled) + config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); +} + +static void hw_perf_enable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->enable(idx, config); +} + +static void pmc_intel_disable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); +} + +static void pmc_amd_disable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + clear_bit(idx, (unsigned long *)&cpuc->active_mask); + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); + +} + +static void hw_perf_disable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->disable(idx, config); +} + static inline void __pmc_fixed_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int __idx) @@ -278,7 +436,7 @@ __pmc_generic_disable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); else - wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + hw_perf_disable(idx, hwc->config); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -354,8 +512,7 @@ __pmc_generic_enable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); else - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + hw_perf_enable(idx, hwc->config); } static int @@ -567,22 +724,20 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + int ret = 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - - /* Disable counters globally */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - ack_APIC_irq(); + cpuc->throttle_ctrl = hw_perf_save_disable(); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (!status) goto out; + ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -618,12 +773,12 @@ again: } } - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + hw_perf_ack_status(ack); /* * Repeat if there is more work to be done: */ - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -631,32 +786,27 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); + + return ret; } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - 
if (unlikely(!perf_counters_initialized)) return; - cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); } - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); - if (unlikely(cpuc->global_enable && !global_enable)) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); cpuc->interrupts = 0; } @@ -664,8 +814,8 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + ack_APIC_irq(); __smp_perf_counter_interrupt(regs, 0); - irq_exit(); } @@ -722,16 +872,23 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; + int ret; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; - if (likely(cmd != DIE_NMI_IPI)) + default: return NOTIFY_DONE; + } regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - __smp_perf_counter_interrupt(regs, 1); + ret = __smp_perf_counter_interrupt(regs, 1); - return NOTIFY_STOP; + return ret ? NOTIFY_STOP : NOTIFY_OK; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { @@ -743,18 +900,28 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct pmc_x86_ops pmc_intel_ops = { .save_disable_all = pmc_intel_save_disable_all, .restore_all = pmc_intel_restore_all, + .get_status = pmc_intel_get_status, + .ack_status = pmc_intel_ack_status, + .enable = pmc_intel_enable, + .disable = pmc_intel_disable, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = pmc_intel_event_map, + .raw_event = pmc_intel_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; static struct pmc_x86_ops pmc_amd_ops = { .save_disable_all = pmc_amd_save_disable_all, .restore_all = pmc_amd_restore_all, + .get_status = pmc_amd_get_status, + .ack_status = pmc_amd_ack_status, + .enable = pmc_amd_enable, + .disable = pmc_amd_disable, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = pmc_amd_event_map, + .raw_event = pmc_amd_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; @@ -787,8 +954,25 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { + u64 old; + int bits; + nr_counters_generic = 4; nr_counters_fixed = 0; + counter_value_mask = ~0ULL; + + rdmsrl(MSR_K7_PERFCTR0, old); + wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); + /* + * read the truncated mask + */ + rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); + wrmsrl(MSR_K7_PERFCTR0, old); + + bits = 32 + fls(counter_value_mask >> 32); + if (bits == 32) + bits = fls((u32)counter_value_mask); + counter_value_bits = bits; pr_info("AMD Performance Monitoring support detected.\n"); -- cgit v1.2.3-70-g09d2 From b5e8acf66ff5db707c7e08df49fdf6b415878442 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 20:34:21 +0100 Subject: perfcounters: IRQ and NMI support on AMD CPUs, fix The BKGD suggests that counter width on AMD CPUs is 48 for all existing models (it certainly is for mine). 
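Fixing the width at 48 bits also keeps the earlier pmc_amd_get_status() overflow test well defined: the counters are armed with a negative initial value, so a counter that has wrapped reads back with bit 47 clear. A minimal stand-alone sketch of that test (illustrative only, not the kernel code; the macro and function names are made up):

    #include <stdbool.h>
    #include <stdint.h>

    #define AMD_COUNTER_BITS 48

    /* Shift the counter's top bit (bit 47) up into the machine sign bit,
     * then test the sign: armed counters read back negative, wrapped
     * (overflowed) ones do not. */
    static bool counter_overflowed(uint64_t raw)
    {
            int64_t val = (int64_t)(raw << (64 - AMD_COUNTER_BITS));

            return val >= 0;
    }
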
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6ebe9abf6ae..f5853718d4d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -959,20 +959,8 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - counter_value_mask = ~0ULL; - - rdmsrl(MSR_K7_PERFCTR0, old); - wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); - /* - * read the truncated mask - */ - rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); - wrmsrl(MSR_K7_PERFCTR0, old); - - bits = 32 + fls(counter_value_mask >> 32); - if (bits == 32) - bits = fls((u32)counter_value_mask); - counter_value_bits = bits; + counter_value_mask = 0x0000FFFFFFFFFFFFULL; + counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); -- cgit v1.2.3-70-g09d2 From 184fe4ab1f2e4dfa45584889bb3820031648386b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 8 Mar 2009 11:34:19 +0100 Subject: x86: perf_counter cleanup Use and actual unsigned long bitmap instead of casting our way around. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput LKML-Reference: <1236508459.22914.3645.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f5853718d4d..1df421042b2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -37,7 +37,7 @@ struct cpu_hw_counters { unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - u64 active_mask; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -291,7 +291,7 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + if (test_bit(idx, cpuc->active_mask)) { u64 val; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -377,7 +377,7 @@ static void pmc_amd_enable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, (unsigned long *)&cpuc->active_mask); + set_bit(idx, cpuc->active_mask); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -401,7 +401,7 @@ static void pmc_amd_disable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, (unsigned long *)&cpuc->active_mask); + clear_bit(idx, cpuc->active_mask); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } -- cgit v1.2.3-70-g09d2 From e255357764f92afcafafbd4879b222b8c752065a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 8 Mar 2009 17:09:49 +0530 Subject: x86: perf_counter cleanup Remove unused variables and duplicate header file. 
Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1df421042b2..155bc3c239b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -17,7 +17,6 @@ #include #include -#include #include static bool perf_counters_initialized __read_mostly; @@ -954,9 +953,6 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { - u64 old; - int bits; - nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; -- cgit v1.2.3-70-g09d2 From 595258aaeac4cc6e187b98b1bf29bb176febe763 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:28 +0100 Subject: perf_counter: x86: fix 32-bit irq_period assumption No need to assume the irq_period is 32bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 155bc3c239b..1cedc3468ce 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s32 period = hwc->irq_period; + s64 period = hwc->irq_period; int err; /* -- cgit v1.2.3-70-g09d2 From 60b3df9c1e24a18aabb412da9905208c5f04ebea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:30 +0100 Subject: perf_counter: add comment to barrier We need to ensure the enabled=0 write happens before we start disabling the actual counters, so that a pcm_amd_enable() will not enable one underneath us. I think the race is impossible anyway, we always balance the ops within any one context and perform enable() with IRQs disabled. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1cedc3468ce..a2e3b76bfdc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,6 +247,10 @@ static u64 pmc_amd_save_disable_all(void) enabled = cpuc->enabled; cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * counters proper, so that pcm_amd_enable() does the right thing. + */ barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { -- cgit v1.2.3-70-g09d2 From 82bae4f8c2fd64a2bb1e2e72c508853ed2b4a299 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:31 +0100 Subject: perf_counter: x86: use ULL postfix for 64bit constants Fix a build warning on 32bit machines by explicitly marking the constants as 64-bit. 
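For the curious: on a 32-bit build "unsigned long" is only 32 bits wide, and a constant such as 0x7000000FF needs 35 bits, which is what provokes the warning this patch silences; the ULL suffix makes the intended 64-bit type explicit. A hedged illustration (the _OLD/_NEW names are invented for contrast):

    #define K7_EVNTSEL_EVENT_MASK_OLD   0x7000000FF     /* can warn on 32-bit builds */
    #define K7_EVNTSEL_EVENT_MASK_NEW   0x7000000FFULL  /* unambiguously 64-bit */

    /* Typical use: keep only defined bits of a caller-supplied raw config
     * (pmc_amd_raw_event() does the same with the combined event/unit/
     * counter mask). */
    static unsigned long long sanitize_raw_event(unsigned long long ev)
    {
            return ev & K7_EVNTSEL_EVENT_MASK_NEW;
    }
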
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a2e3b76bfdc..22dab06c08a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -84,9 +84,9 @@ static u64 pmc_intel_event_map(int event) static u64 pmc_intel_raw_event(u64 event) { -#define CORE_EVNTSEL_EVENT_MASK 0x000000FF -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -116,9 +116,9 @@ static u64 pmc_amd_event_map(int event) static u64 pmc_amd_raw_event(u64 event) { -#define K7_EVNTSEL_EVENT_MASK 0x7000000FF -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ -- cgit v1.2.3-70-g09d2 From 7dd1fcc258b65da718f01e4684a7b9244501a9fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:33 +0100 Subject: perf_counter: provide pagefault software events We use the generic software counter infrastructure to provide page fault events. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 3 +++ arch/x86/mm/fault.c | 3 +++ kernel/perf_counter.c | 53 +++---------------------------------------------- 3 files changed, 9 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 76993941cac..eda5b0ca4af 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. 
Unfortunately, in the case of an diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa..c8725752b6c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + /* * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index eeb1b46cf70..1773c5d7427 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1607,57 +1607,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { * Software counter: page faults */ -#ifdef CONFIG_VM_EVENT_COUNTERS -#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] -#else -#define cpu_page_faults() 0 -#endif - -static u64 get_page_faults(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->maj_flt + curr->min_flt; - return cpu_page_faults(); -} - -static void page_faults_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_page_faults(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void page_faults_perf_counter_read(struct perf_counter *counter) -{ - page_faults_perf_counter_update(counter); -} - -static int page_faults_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); - return 0; -} - -static void page_faults_perf_counter_disable(struct perf_counter *counter) -{ - page_faults_perf_counter_update(counter); -} - static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = page_faults_perf_counter_enable, - .disable = page_faults_perf_counter_disable, - .read = page_faults_perf_counter_read, + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, }; /* -- cgit v1.2.3-70-g09d2 From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:34 +0100 Subject: perf_counter: provide major/minor page fault software events Provide separate sw counters for major and minor page faults. 
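The call pattern the fault-handler hunks below adopt, folded into one hypothetical helper for readability (the real change open-codes it in do_page_fault(); perf_swcounter_event() takes the event id, the amount to add, an in-NMI flag and the interrupted registers):

    static void account_page_fault(struct task_struct *tsk,
                                   struct pt_regs *regs, int major)
    {
            if (major) {
                    tsk->maj_flt++;
                    perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
            } else {
                    tsk->min_flt++;
                    perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
            }
    }
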
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 5 ++++- arch/x86/mm/fault.c | 7 +++++-- include/linux/perf_counter.h | 4 +++- kernel/perf_counter.c | 22 +++++++++------------- 4 files changed, 21 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index eda5b0ca4af..17bbf6f91fb 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -312,6 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -319,8 +320,10 @@ good_area: preempt_enable(); } #endif - } else + } else { current->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } up_read(&mm->mmap_sem); return 0; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c8725752b6c..f2d3324d921 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1140,10 +1140,13 @@ good_area: return; } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + } else { tsk->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3fefc3b8150..4b14a8e9dbf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -49,8 +49,10 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, PERF_COUNT_CPU_MIGRATIONS = -5, + PERF_COUNT_PAGE_FAULTS_MIN = -6, + PERF_COUNT_PAGE_FAULTS_MAJ = -7, - PERF_SW_EVENTS_MIN = -6, + PERF_SW_EVENTS_MIN = -8, }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1773c5d7427..68950a3a52b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1503,6 +1503,12 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } +static const struct hw_perf_counter_ops perf_ops_generic = { + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, +}; + /* * Software counter: cpu wall time clock */ @@ -1603,16 +1609,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: page faults - */ - -static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, -}; - /* * Software counter: context switches */ @@ -1753,9 +1749,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel)) - hw_ops = &perf_ops_page_faults; + case PERF_COUNT_PAGE_FAULTS_MIN: + case PERF_COUNT_PAGE_FAULTS_MAJ: + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CONTEXT_SWITCHES: if (!counter->hw_event.exclude_kernel) -- cgit v1.2.3-70-g09d2 From 7bb497bd885eedd0f56dfe3cc1b5ff20710d33b9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Mar 2009 08:59:21 +0100 Subject: perf_counter: fix crash on perfmon v1 systems Impact: fix boot crash on Intel Perfmon Version 1 systems Intel Perfmon v1 does not support the global MSRs, nor does it offer the generalized MSR ranges. So support v2 and later CPUs only. 
Also mark pmc_ops as read-mostly - to avoid false cacheline sharing. Cc: Paul Mackerras Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 22dab06c08a..6cba9d47b71 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -57,12 +57,14 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops; +static struct pmc_x86_ops *pmc_ops __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; +static __read_mostly int intel_perfmon_version; + /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -613,7 +615,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (intel_perfmon_version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -930,10 +932,10 @@ static struct pmc_x86_ops pmc_amd_ops = { static struct pmc_x86_ops *pmc_intel_init(void) { + union cpuid10_edx edx; union cpuid10_eax eax; - unsigned int ebx; unsigned int unused; - union cpuid10_edx edx; + unsigned int ebx; /* * Check whether the Architectural PerfMon supports @@ -943,8 +945,12 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; + intel_perfmon_version = eax.split.version_id; + if (intel_perfmon_version < 2) + return NULL; + pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", eax.split.version_id); + pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); -- cgit v1.2.3-70-g09d2 From b6c5a71da1477d261bc36254fe1f20d32b57598d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 16 Mar 2009 21:00:00 +1100 Subject: perf_counter: abstract wakeup flag setting in core to fix powerpc build Impact: build fix for powerpc Commit bd753921015e7905 ("perf_counter: software counter event infrastructure") introduced a use of TIF_PERF_COUNTERS into the core perfcounter code. This breaks the build on powerpc because we use a flag in a per-cpu area to signal wakeups on powerpc rather than a thread_info flag, because the thread_info flags have to be manipulated with atomic operations and are thus slower than per-cpu flags. This fixes the by changing the core to use an abstracted set_perf_counter_pending() function, which is defined on x86 to set the TIF_PERF_COUNTERS flag and on powerpc to set the per-cpu flag (paca->perf_counter_pending). It changes the previous powerpc definition of set_perf_counter_pending to not take an argument and adds a clear_perf_counter_pending, so as to simplify the definition on x86. On x86, set_perf_counter_pending() is defined as a macro. Defining it as a static inline in arch/x86/include/asm/perf_counters.h causes compile failures because gets included early in , and the definitions of set_tsk_thread_flag etc. are therefore not available in . (On powerpc this problem is avoided by defining set_perf_counter_pending etc. in .) 
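The shape of the abstraction, as a self-contained toy model (the set_perf_counter_pending() name matches the kernel hook, everything else here is illustrative; the real x86 variant sets TIF_PERF_COUNTERS and the powerpc variant a per-paca byte, as the hunks below show):

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic bool counter_pending;    /* per-CPU in the real code */

    /* Safe from NMI context: only set a flag, never wake anything. */
    static void set_perf_counter_pending(void)
    {
            atomic_store(&counter_pending, true);
    }

    /* Run from the arch's interrupt-enable path, in normal context. */
    static void drain_perf_counter_pending(void (*do_pending)(void))
    {
            if (atomic_exchange(&counter_pending, false))
                    do_pending();
    }
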
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/hw_irq.h | 14 +++++++++++--- arch/powerpc/kernel/irq.c | 11 +++-------- arch/powerpc/kernel/perf_counter.c | 3 +-- arch/x86/include/asm/perf_counter.h | 3 +++ kernel/perf_counter.c | 2 +- 5 files changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index b43076ff92c..cb32d571c9c 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -142,10 +142,17 @@ static inline unsigned long get_perf_counter_pending(void) return x; } -static inline void set_perf_counter_pending(int x) +static inline void set_perf_counter_pending(void) { asm volatile("stb %0,%1(13)" : : - "r" (x), + "r" (1), + "i" (offsetof(struct paca_struct, perf_counter_pending))); +} + +static inline void clear_perf_counter_pending(void) +{ + asm volatile("stb %0,%1(13)" : : + "r" (0), "i" (offsetof(struct paca_struct, perf_counter_pending))); } @@ -158,7 +165,8 @@ static inline unsigned long get_perf_counter_pending(void) return 0; } -static inline void set_perf_counter_pending(int x) {} +static inline void set_perf_counter_pending(void) {} +static inline void clear_perf_counter_pending(void) {} static inline void perf_counter_do_pending(void) {} #endif /* CONFIG_PERF_COUNTERS */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 0d2e37c5773..469e9635ff0 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,13 +104,6 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -#ifdef CONFIG_PERF_COUNTERS -notrace void __weak perf_counter_do_pending(void) -{ - set_perf_counter_pending(0); -} -#endif - notrace void raw_local_irq_restore(unsigned long en) { /* @@ -142,8 +135,10 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } - if (get_perf_counter_pending()) + if (get_perf_counter_pending()) { + clear_perf_counter_pending(); perf_counter_do_pending(); + } /* * if (get_paca()->hard_enabled) return; diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0e33d27cd46..5008762e8bf 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -653,7 +653,6 @@ void perf_counter_do_pending(void) struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); struct perf_counter *counter; - set_perf_counter_pending(0); for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter && counter->wakeup_pending) { @@ -811,7 +810,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) perf_counter_do_pending(); irq_exit(); } else { - set_perf_counter_pending(1); + set_perf_counter_pending(); } } } diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 2e08ed73664..1662043b340 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,6 +84,9 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +#define set_perf_counter_pending() \ + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0018c5e8124..b39456ad74a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1433,7 
+1433,7 @@ static void perf_swcounter_interrupt(struct perf_counter *counter, if (nmi) { counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + set_perf_counter_pending(); } else wake_up(&counter->waitq); } -- cgit v1.2.3-70-g09d2 From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 4 +- arch/x86/kernel/cpu/perf_counter.c | 10 ++-- include/linux/perf_counter.h | 95 ++++++++++++++++++++++++-------------- kernel/perf_counter.c | 83 ++++++++++++++++++++------------- 4 files changed, 117 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5008762e8bf..26f69dc7130 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,7 +602,7 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.type; + ev = counter->hw_event.event_id; if (!counter->hw_event.raw) { if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -692,7 +692,7 @@ static void perf_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.type); + perf_store_irq_data(counter, sub->hw_event.event_config); perf_store_irq_data(counter, atomic64_read(&sub->count)); } } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cba9d47b71..d844ae41d5a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw) { - hwc->config |= pmc_ops->raw_event(hw_event->type); + if (hw_event->raw_type) { + hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); } else { - if (hw_event->type >= pmc_ops->max_events) + if (hw_event->event_id >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->type); + hwc->config |= pmc_ops->event_map(hw_event->event_id); } counter->wakeup_pending = 0; @@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, counter->hw_event.event_config); perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 065984c1ff5..8f939490550 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -21,56 +21,81 @@ 
*/ /* - * Generalized performance counter event types, used by the hw_event.type - * parameter of the sys_perf_counter_open() syscall: + * hw_event.type */ -enum hw_event_types { +enum perf_event_types { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + /* - * Common hardware events, generalized by the kernel: + * available TYPE space, raw is the max value. */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 7, + PERF_TYPE_RAW = 128, +}; +/* + * Generalized performance counter event types, used by the hw_event.event_id + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_ids { /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, - PERF_COUNT_CPU_MIGRATIONS = -5, - PERF_COUNT_PAGE_FAULTS_MIN = -6, - PERF_COUNT_PAGE_FAULTS_MAJ = -7, - - PERF_SW_EVENTS_MIN = -8, + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, + + PERF_HW_EVENTS_MAX = 7, +}; - PERF_TP_EVENTS_MIN = -65536 +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. 
These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum sw_event_ids { + PERF_COUNT_CPU_CLOCK = 0, + PERF_COUNT_TASK_CLOCK = 1, + PERF_COUNT_PAGE_FAULTS = 2, + PERF_COUNT_CONTEXT_SWITCHES = 3, + PERF_COUNT_CPU_MIGRATIONS = 4, + PERF_COUNT_PAGE_FAULTS_MIN = 5, + PERF_COUNT_PAGE_FAULTS_MAJ = 6, + + PERF_SW_EVENTS_MAX = 7, }; /* * IRQ-notification data record type: */ enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - __s64 type; + union { + struct { + __u64 event_id : 56, + type : 8; + }; + struct { + __u64 raw_event_id : 63, + raw_type : 1; + }; + __u64 event_config; + }; __u64 irq_period; __u64 record_type; @@ -78,7 +103,6 @@ struct perf_counter_hw_event { __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -87,7 +111,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 54; + __reserved_1 : 55; __u32 extra_config_len; __u32 __reserved_4; @@ -298,10 +322,11 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw && counter->hw_event.type < 0; + return !counter->hw_event.raw_type && + counter->hw_event.type != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); #else static inline void @@ -320,7 +345,7 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, +static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0bbe3e45ba0..68a56a68bc7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,12 +1395,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_save_and_restart(struct perf_counter *counter) -{ - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); -} - static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) { struct perf_data *irqdata = counter->irqdata; @@ -1421,7 +1415,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, counter->hw_event.event_config); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } } @@ -1477,21 +1471,25 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); + perf_swcounter_update(counter); + 
perf_swcounter_set_period(counter); perf_swcounter_interrupt(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, - enum hw_event_types event, - struct pt_regs *regs) + enum perf_event_types type, + u32 event, struct pt_regs *regs) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw) + if (counter->hw_event.raw_type) + return 0; + + if (counter->hw_event.type != type) return 0; - if (counter->hw_event.type != event) + if (counter->hw_event.event_id != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1512,8 +1510,8 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) + enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; @@ -1522,24 +1520,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, event, regs)) + if (perf_swcounter_match(counter, type, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } rcu_read_unlock(); } -void perf_swcounter_event(enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) +static void __perf_swcounter_event(enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); - if (cpuctx->task_ctx) - perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + if (cpuctx->task_ctx) { + perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, + nr, nmi, regs); + } put_cpu_var(perf_cpu_context); } +void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +{ + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); +} + static void perf_swcounter_read(struct perf_counter *counter) { perf_swcounter_update(counter); @@ -1733,8 +1738,12 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { - perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, - task_pt_regs(current)); + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); } extern int ftrace_profile_enable(int); @@ -1742,15 +1751,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; - - ftrace_profile_disable(event_id); + ftrace_profile_disable(counter->hw_event.event_id); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int event_id = counter->hw_event.event_id; int ret; ret = ftrace_profile_enable(event_id); @@ -1758,6 +1765,7 @@ tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; + counter->hw.irq_period = counter->hw_event.irq_period; return &perf_ops_generic; } @@ -1783,7 +1791,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. 
*/ - switch (counter->hw_event.type) { + switch (counter->hw_event.event_id) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1813,9 +1821,6 @@ sw_perf_counter_init(struct perf_counter *counter) if (!counter->hw_event.exclude_kernel) hw_ops = &perf_ops_cpu_migrations; break; - default: - hw_ops = tp_perf_counter_init(counter); - break; } if (hw_ops) @@ -1870,10 +1875,22 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->state = PERF_COUNTER_STATE_OFF; hw_ops = NULL; - if (!hw_event->raw && hw_event->type < 0) - hw_ops = sw_perf_counter_init(counter); - else + + if (hw_event->raw_type) + hw_ops = hw_perf_counter_init(counter); + else switch (hw_event->type) { + case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); + break; + + case PERF_TYPE_SOFTWARE: + hw_ops = sw_perf_counter_init(counter); + break; + + case PERF_TYPE_TRACEPOINT: + hw_ops = tp_perf_counter_init(counter); + break; + } if (!hw_ops) { kfree(counter); -- cgit v1.2.3-70-g09d2 From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 51 +----------------- arch/x86/kernel/cpu/perf_counter.c | 53 +------------------ include/linux/perf_counter.h | 2 + kernel/perf_counter.c | 106 ++++++++++++++++++++----------------- 4 files changed, 61 insertions(+), 151 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 26f69dc7130..88b72eb4af1 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -662,41 +662,6 @@ void perf_counter_do_pending(void) } } -/* - * Record data for an irq counter. - * This function was lifted from the x86 code; maybe it should - * go in the core? - */ -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -/* - * Record all the values of the counters in a group - */ -static void perf_handle_group(struct perf_counter *counter) -{ - struct perf_counter *leader, *sub; - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.event_config); - perf_store_irq_data(counter, atomic64_read(&sub->count)); - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -736,20 +701,8 @@ static void record_and_restart(struct perf_counter *counter, long val, /* * Finally record data if requested. 
*/ - if (record) { - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - counter->wakeup_pending = 1; - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter); - counter->wakeup_pending = 1; - break; - } - } + if (record) + perf_counter_output(counter, 1, regs); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d844ae41d5a..902282d68b0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); } -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - /* * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: @@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter) __pmc_generic_enable(counter, hwc, idx); } -static void -perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - /* - * Store sibling timestamps (if any): - */ - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.event_config); - perf_store_irq_data(sibling, atomic64_read(&counter->count)); - } -} - /* * Maximum interrupt frequency of 100KHz per CPU */ @@ -754,28 +724,7 @@ again: continue; perf_save_and_restart(counter); - - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - continue; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter, &status, &ack); - break; - } - /* - * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these generic as - * wakeup_pending and initate a wakeup callback: - */ - if (nmi) { - counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); - } else { - wake_up(&counter->waitq); - } + perf_counter_output(counter, nmi, regs); } hw_perf_ack_status(ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8f939490550..a4b76c0175f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -317,6 +317,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68a56a68bc7..f054b8c9bf9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1353,6 +1353,60 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Output + */ + +static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } 
else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_counter_handle_group(struct perf_counter *counter) +{ + struct perf_counter *leader, *sub; + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); + perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, atomic64_read(&sub->count)); + } +} + +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + return; + + case PERF_RECORD_IRQ: + perf_counter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_counter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); +} + /* * Generic software counter infrastructure */ @@ -1395,54 +1449,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -static void perf_swcounter_handle_group(struct perf_counter *sibling) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.event_config); - perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); - } -} - -static void perf_swcounter_interrupt(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - - case PERF_RECORD_IRQ: - perf_swcounter_store_irq(counter, instruction_pointer(regs)); - break; - - case PERF_RECORD_GROUP: - perf_swcounter_handle_group(counter); - break; - } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); -} - static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { struct perf_counter *counter; @@ -1461,7 +1467,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) - perf_swcounter_interrupt(counter, 0, regs); + perf_counter_output(counter, 0, regs); hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); @@ -1473,7 +1479,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_swcounter_interrupt(counter, nmi, regs); + perf_counter_output(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3-70-g09d2 From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. 
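For illustration only, a minimal user-space sketch of how a config word is packed and decoded under the mask layout this patch introduces; the mask values mirror the patch, while pack_config() and the event numbers used in main() are hypothetical, not part of the kernel API:

	#include <stdint.h>
	#include <stdio.h>

	#define PERF_COUNTER_RAW_MASK	(1ULL << 63)
	#define PERF_COUNTER_TYPE_SHIFT	56
	#define PERF_COUNTER_TYPE_MASK	(0x7fULL << PERF_COUNTER_TYPE_SHIFT)
	#define PERF_COUNTER_EVENT_MASK	((1ULL << 56) - 1)

	/* hypothetical helper: build a non-raw config word from type + event id */
	static uint64_t pack_config(uint64_t type, uint64_t event_id)
	{
		return ((type << PERF_COUNTER_TYPE_SHIFT) & PERF_COUNTER_TYPE_MASK) |
		       (event_id & PERF_COUNTER_EVENT_MASK);
	}

	int main(void)
	{
		/* e.g. a software counter (type 1), event id 3 */
		uint64_t config = pack_config(1, 3);

		printf("raw=%llu type=%llu id=%llu\n",
		       (unsigned long long)!!(config & PERF_COUNTER_RAW_MASK),
		       (unsigned long long)((config & PERF_COUNTER_TYPE_MASK) >>
					    PERF_COUNTER_TYPE_SHIFT),
		       (unsigned long long)(config & PERF_COUNTER_EVENT_MASK));
		return 0;
	}

With bit 63 set the remaining 63 bits are taken as raw, cpu-specific configuration; otherwise bits 56-62 select the type and bits 0-55 the generic event id, which is exactly what the perf_event_raw()/perf_event_type()/perf_event_id() helpers in the hunk below extract.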
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 ++-- arch/x86/kernel/cpu/perf_counter.c | 8 ++--- include/linux/perf_counter.h | 74 +++++++++++++++++++++++++------------- kernel/perf_counter.c | 22 +++++++----- 4 files changed, 70 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6413d9c0313..d05651584d4 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - if (!counter->hw_event.raw_type) { - ev = counter->hw_event.event_id; + if (!perf_event_raw(&counter->hw_event)) { + ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; } else { - ev = counter->hw_event.raw_event_id; + ev = perf_event_config(&counter->hw_event); } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 902282d68b0..3f95b0cdc55 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw_type) { - hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); + if (perf_event_raw(hw_event)) { + hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); } else { - if (hw_event->event_id >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->event_id); + hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } counter->wakeup_pending = 0; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98f5990be1e..56099e52970 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -82,32 +82,37 @@ enum perf_counter_record_type { PERF_RECORD_GROUP = 2, }; +#define __PERF_COUNTER_MASK(name) \ + (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ + PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW_BITS 1 +#define PERF_COUNTER_RAW_SHIFT 63 +#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) + +#define PERF_COUNTER_CONFIG_BITS 63 +#define PERF_COUNTER_CONFIG_SHIFT 0 +#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) + +#define PERF_COUNTER_TYPE_BITS 7 +#define PERF_COUNTER_TYPE_SHIFT 56 +#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) + +#define PERF_COUNTER_EVENT_BITS 56 +#define PERF_COUNTER_EVENT_SHIFT 0 +#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) + /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - union { -#ifndef __BIG_ENDIAN_BITFIELD - struct { - __u64 event_id : 56, - type : 8; - }; - struct { - __u64 raw_event_id : 63, - raw_type : 1; - }; -#else - struct { - __u64 type : 8, - event_id : 56; - }; - struct { - __u64 raw_type : 1, - raw_event_id : 63; - }; -#endif /* __BIT_ENDIAN_BITFIELD */ - __u64 event_config; - }; + /* + * The MSB of the config word signifies if the rest contains cpu + * specific (raw) counter configuration data, if unset, the next + * 7 bits are an event type and the rest of the bits 
are the event + * identifier. + */ + __u64 config; __u64 irq_period; __u64 record_type; @@ -157,6 +162,27 @@ struct perf_counter_hw_event { struct task_struct; +static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_RAW_MASK; +} + +static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_CONFIG_MASK; +} + +static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +{ + return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + PERF_COUNTER_TYPE_SHIFT; +} + +static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_EVENT_MASK; +} + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -336,8 +362,8 @@ extern void perf_counter_output(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw_type && - counter->hw_event.type != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->hw_event) && + perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f054b8c9bf9..ca14fc41ccd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, sub->hw_event.config); perf_counter_store_irq(counter, atomic64_read(&sub->count)); } } @@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw_type) + if (perf_event_raw(&counter->hw_event)) return 0; - if (counter->hw_event.type != type) + if (perf_event_type(&counter->hw_event) != type) return 0; - if (counter->hw_event.event_id != event) + if (perf_event_id(&counter->hw_event) != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(counter->hw_event.event_id); + ftrace_profile_disable(perf_event_id(&counter->hw_event)); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.event_id; + int event_id = perf_event_id(&counter->hw_event); int ret; ret = ftrace_profile_enable(event_id); @@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. 
*/ - switch (counter->hw_event.event_id) { + switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; - if (hw_event->raw_type) + if (perf_event_raw(hw_event)) { hw_ops = hw_perf_counter_init(counter); - else switch (hw_event->type) { + goto done; + } + + switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); break; @@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, kfree(counter); return NULL; } +done: counter->hw_ops = hw_ops; return counter; -- cgit v1.2.3-70-g09d2 From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that its quite possible to end up setting the bit on the idle task, loosing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/hw_irq.h | 4 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/perf_counter.c | 22 +------ arch/x86/include/asm/perf_counter.h | 5 +- arch/x86/include/asm/thread_info.h | 4 +- arch/x86/kernel/cpu/perf_counter.c | 29 -------- arch/x86/kernel/signal.c | 6 -- include/linux/perf_counter.h | 15 +++-- kernel/perf_counter.c | 128 +++++++++++++++++++++++++++++++++--- kernel/timer.c | 3 + 10 files changed, 142 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index cb32d571c9c..20a44d0c9fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags) struct irq_chip; #ifdef CONFIG_PERF_COUNTERS -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { unsigned long x; @@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void); #else -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { return 0; } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 469e9635ff0..2cd471f92fe 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } - if (get_perf_counter_pending()) { + if (test_perf_counter_pending()) { clear_perf_counter_pending(); perf_counter_do_pending(); } diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index df007fe0cc0..cde720fc495 100644 --- 
a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -649,24 +649,6 @@ hw_perf_counter_init(struct perf_counter *counter) return &power_perf_ops; } -/* - * Handle wakeups. - */ -void perf_counter_do_pending(void) -{ - int i; - struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); - struct perf_counter *counter; - - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter && counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); struct perf_counter *counter; long val; - int need_wakeup = 0, found = 0; + int found = 0; for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; @@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) * immediately; otherwise we'll have do the wakeup when interrupts * get soft-enabled. */ - if (get_perf_counter_pending() && regs->softe) { + if (test_perf_counter_pending() && regs->softe) { irq_enter(); clear_perf_counter_pending(); perf_counter_do_pending(); diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 1662043b340..e2b0e66b235 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,8 +84,9 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -#define set_perf_counter_pending() \ - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); +#define set_perf_counter_pending() do { } while (0) +#define clear_perf_counter_pending() do { } while (0) +#define test_perf_counter_pending() (0) #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 3ffd5d2a367..8820a73ae09 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,7 +83,6 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ -#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -107,7 +106,6 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) -#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -141,7 +139,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f95b0cdc55..7aab177fb56 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) */ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } 
- counter->wakeup_pending = 0; return 0; } @@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } -/* - * This handler is triggered by NMI contexts: - */ -void perf_counter_notify(struct pt_regs *regs) -{ - struct cpu_hw_counters *cpuc; - unsigned long flags; - int bit, cpu; - - local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - if (!counter) - continue; - - if (counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } - - local_irq_restore(flags); -} - void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 611615a92c9..0a813b17b17 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ -#include #include #include #include @@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } - if (thread_info_flags & _TIF_PERF_COUNTERS) { - clear_thread_flag(TIF_PERF_COUNTERS); - perf_counter_notify(regs); - } - #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6bf67ce1762..0d833228eee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -275,6 +275,10 @@ struct perf_mmap_data { void *data_pages[0]; }; +struct perf_wakeup_entry { + struct perf_wakeup_entry *next; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -350,7 +354,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ - int wakeup_pending; + struct perf_wakeup_entry wakeup; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; @@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); -extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); @@ -461,7 +465,7 @@ static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } -static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } @@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(u32 event, u64 nr, - int nmi, struct pt_regs *regs) { } +static inline void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c 
b/kernel/perf_counter.c index 3b862a7988c..f70ff80e79d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head) kfree(counter); } +static void perf_pending_sync(struct perf_counter *counter); + static void free_counter(struct perf_counter *counter) { + perf_pending_sync(counter); + if (counter->destroy) counter->destroy(counter); @@ -1528,6 +1532,118 @@ static const struct file_operations perf_fops = { .mmap = perf_mmap, }; +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) { + (void)atomic_xchg(&data->wakeup, POLL_IN); + __perf_counter_update_userpage(counter, data); + } + rcu_read_unlock(); + + wake_up_all(&counter->waitq); +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_counter *counter) +{ + struct perf_wakeup_entry **head; + struct perf_wakeup_entry *prev, *next; + + if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + return; + + head = &get_cpu_var(perf_wakeup_head); + + do { + prev = counter->wakeup.next = *head; + next = &counter->wakeup; + } while (cmpxchg(head, prev, next) != prev); + + set_perf_counter_pending(); + + put_cpu_var(perf_wakeup_head); +} + +static int __perf_pending_run(void) +{ + struct perf_wakeup_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + struct perf_counter *counter = container_of(list, + struct perf_counter, wakeup); + + list = list->next; + + counter->wakeup.next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + perf_counter_wakeup(counter); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. 
-- see perf_pending_handle() + */ + smp_rmb(); + return counter->wakeup.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ + wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ + __perf_pending_run(); +} + /* * Output */ @@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { - (void)atomic_xchg(&handle->data->wakeup, POLL_IN); - __perf_counter_update_userpage(handle->counter, handle->data); - if (nmi) { - handle->counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&handle->counter->waitq); + if (nmi) + perf_pending_queue(handle->counter); + else + perf_counter_wakeup(handle->counter); } rcu_read_unlock(); } @@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; - counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; counter->ctx = ctx; diff --git a/kernel/timer.c b/kernel/timer.c index b4555568b4e..672ca25fbc4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); + perf_counter_do_pending(); + hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v1.2.3-70-g09d2 From 9ea98e191255ee642e64a5745014424fc63f83b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:09 +0200 Subject: perf_counter: x86: proper error propagation for the x86 hw_perf_counter_init() Now that Paul cleaned up the error propagation paths, pass down the x86 error as well. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.792822360@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7aab177fb56..b8885ccd804 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -954,7 +954,7 @@ hw_perf_counter_init(struct perf_counter *counter) err = __hw_perf_counter_init(counter); if (err) - return NULL; + return ERR_PTR(err); return &x86_perf_counter_ops; } -- cgit v1.2.3-70-g09d2 From d7d59fb323833682b117b528d77eeb8ef587036a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:15 +0200 Subject: perf_counter: x86: callchain support Provide the x86 perf_callchain() implementation. Code based on the ftrace/sysprof code from Soeren Sandmann Pedersen. 
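As a rough, user-space illustration of the approach (not the kernel code itself), the user-side part of the callchain amounts to following saved frame pointers; the sketch below assumes frame pointers are present and uses a hypothetical read_frame() in place of the kernel's fault-safe __copy_from_user_inatomic():

	#include <stdint.h>
	#include <string.h>

	struct stack_frame {
		const void	*next_fp;	/* saved caller frame pointer */
		unsigned long	return_address;	/* saved return address */
	};

	/* hypothetical accessor: in the kernel this is a guarded user copy */
	static int read_frame(const void *fp, struct stack_frame *frame)
	{
		memcpy(frame, fp, sizeof(*frame));
		return 1;
	}

	/* walk at most max_depth frames starting from a frame pointer */
	static int walk_frames(const void *fp, unsigned long *ips, int max_depth)
	{
		int nr = 0;

		while (nr < max_depth && fp) {
			struct stack_frame frame;

			if (!read_frame(fp, &frame) || !frame.return_address)
				break;

			ips[nr++] = frame.return_address;
			fp = frame.next_fp;
		}
		return nr;
	}

The kernel side additionally sanity-checks that the frame pointer keeps moving up the stack and stops at MAX_STACK_DEPTH entries, as the patch below shows.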
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Soeren Sandmann Pedersen Cc: Frederic Weisbecker Cc: Steven Rostedt Orig-LKML-Reference: <20090330171024.341993293@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 154 +++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b8885ccd804..e16dfafc6d7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,8 +16,10 @@ #include #include #include +#include #include +#include static bool perf_counters_initialized __read_mostly; @@ -958,3 +960,155 @@ hw_perf_counter_init(struct perf_counter *counter) return &x86_perf_counter_ops; } + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +{ + if (entry->nr < MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + unsigned long bp; + char *stack; + + callchain_store(entry, instruction_pointer(regs)); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = frame_pointer(regs); +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); +} + + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + regs = (struct pt_regs *)current->thread.sp0 - 1; + fp = (void __user *)regs->bp; + + callchain_store(entry, regs->ip); + + while (entry->nr < MAX_STACK_DEPTH) { + frame.next_fp = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < user_stack_pointer(regs)) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_fp; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, 
entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(nmi_entry); + else + entry = &__get_cpu_var(irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} -- cgit v1.2.3-70-g09d2 From 4e935e47177c3b26cf383e79849bae2a464d0160 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:16 +0200 Subject: perf_counter: pmc arbitration Follow the example set by powerpc and try to play nice with oprofile and the nmi watchdog. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.459968444@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 75 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16dfafc6d7..2a946a160ca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -20,6 +20,7 @@ #include #include +#include static bool perf_counters_initialized __read_mostly; @@ -172,6 +173,65 @@ again: atomic64_sub(delta, &hwc->period_left); } +static atomic_t num_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(pmc_ops->eventsel + i); + + i = nr_counters_generic; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(pmc_ops->perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < nr_counters_generic; i++) { + release_perfctr_nmi(pmc_ops->perfctr + i); + release_evntsel_nmi(pmc_ops->eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -179,10 +239,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; + int err; if (unlikely(!perf_counters_initialized)) return -EINVAL; + err = 0; + if (atomic_inc_not_zero(&num_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_counters); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -230,6 +303,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } + counter->destroy = hw_perf_counter_destroy; + return 0; } -- cgit v1.2.3-70-g09d2 From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: perf_counter: add more context information 
Put in counts to tell which ips belong to what context. ----- | | hv | -- nr | | kernel | -- | | user ----- Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++ include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2a946a160ca..c74e20d593a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { unsigned long bp; char *stack; + int nr = entry->nr; callchain_store(entry, instruction_pointer(regs)); @@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) #endif dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + + entry->kernel = entry->nr - nr; } @@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; + int nr = entry->nr; regs = (struct pt_regs *)current->thread.sp0 - 1; fp = (void __user *)regs->bp; @@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_fp; } + + entry->user = entry->nr - nr; } static void @@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; + entry->hv = 0; + entry->kernel = 0; + entry->user = 0; perf_do_callchain(regs, entry); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5428ba120d7..90cce0c74a0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,10 +513,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 255 +#define MAX_STACK_DEPTH 254 struct perf_callchain_entry { - u64 nr; + u32 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 9bcab10e735..f105a6e696c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1819,7 +1819,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); + callchain_size = (2 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; -- cgit v1.2.3-70-g09d2 From b6276f353bf490add62dcf7db0ebd75baa3e1a37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:03 +0200 Subject: perf_counter: x86: self-IPI for pending work Implement set_perf_counter_pending() with a self-IPI so that it will run ASAP in a usable context. For now use a second IRQ vector, because the primary vector pokes the apic in funny ways that seem to confuse things. 
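Reduced to a sketch, the deferred-wakeup pattern this builds on is: NMI context pushes the counter onto a lock-free per-CPU list and raises a self-IPI; the self-IPI handler, running in ordinary interrupt context, drains the list and does the wakeups. The following stand-alone C fragment is illustrative only, using GCC atomic builtins in place of the kernel's cmpxchg()/xchg(), with self_ipi() standing in for apic->send_IPI_self(LOCAL_PENDING_VECTOR):

	struct pending_entry {
		struct pending_entry *next;
	};

	#define PENDING_TAIL ((struct pending_entry *)-1UL)

	static struct pending_entry *pending_head = PENDING_TAIL;

	/* NMI side: push lock-free, then ask for a self-IPI */
	static void pending_queue(struct pending_entry *e)
	{
		struct pending_entry *prev;

		do {
			prev = pending_head;
			e->next = prev;
		} while (!__atomic_compare_exchange_n(&pending_head, &prev, e, 0,
						      __ATOMIC_RELEASE,
						      __ATOMIC_RELAXED));
		/* self_ipi(); */
	}

	/* IRQ side: atomically take the whole list and process it */
	static void pending_run(void (*wakeup)(struct pending_entry *))
	{
		struct pending_entry *list =
			__atomic_exchange_n(&pending_head, PENDING_TAIL,
					    __ATOMIC_ACQ_REL);

		while (list != PENDING_TAIL) {
			struct pending_entry *e = list;

			list = list->next;
			e->next = NULL;
			wakeup(e);
		}
	}

The real code (perf_pending_queue()/__perf_pending_run() earlier in this series) also guards against double-queueing a counter; this sketch omits that for brevity.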
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.724626696@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 1 + arch/x86/include/asm/hardirq.h | 1 + arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 5 +++++ arch/x86/include/asm/perf_counter.h | 3 ++- arch/x86/kernel/cpu/perf_counter.c | 14 ++++++++++++++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 5 +++++ arch/x86/kernel/irqinit_32.c | 1 + arch/x86/kernel/irqinit_64.c | 1 + 10 files changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c2e6bedaf25..fe24d280249 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) #ifdef CONFIG_PERF_COUNTERS BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) +BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif #ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 25454427cee..f5ebe2aaca4 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif unsigned int generic_irqs; /* arch dependent */ unsigned int apic_perf_irqs; + unsigned int apic_pending_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index ae80f64973e..7309c0ad690 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -30,6 +30,7 @@ extern void apic_timer_interrupt(void); extern void generic_interrupt(void); extern void error_interrupt(void); extern void perf_counter_interrupt(void); +extern void perf_pending_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47..545bb811ccb 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -116,6 +116,11 @@ */ #define GENERIC_INTERRUPT_VECTOR 0xed +/* + * Performance monitoring pending work vector: + */ +#define LOCAL_PENDING_VECTOR 0xec + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index e2b0e66b235..d08dd52cb8f 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,7 +84,8 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -#define set_perf_counter_pending() do { } while (0) +extern void set_perf_counter_pending(void); + #define clear_perf_counter_pending() do { } while (0) #define test_perf_counter_pending() (0) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c74e20d593a..438415866fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -849,6 +849,20 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_counter_do_pending(); + irq_exit(); +} + +void set_perf_counter_pending(void) +{ + 
apic->send_IPI_self(LOCAL_PENDING_VECTOR); +} + void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3f129d963a0..1d46cba56fd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1028,6 +1028,8 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ #ifdef CONFIG_PERF_COUNTERS apicinterrupt LOCAL_PERF_VECTOR \ perf_counter_interrupt smp_perf_counter_interrupt +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9c2754302ec..d465487da58 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,6 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, "PND: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); #endif if (generic_interrupt_extension) { seq_printf(p, "PLT: "); @@ -171,6 +175,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; #endif if (generic_interrupt_extension) sum += irq_stats(cpu)->generic_irqs; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 925d87cfc55..3190a6b961e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -166,6 +166,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif # ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 665e2ab48ab..53ceb26f80f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -156,6 +156,7 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } -- cgit v1.2.3-70-g09d2 From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: theres more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 return value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. 
XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 3 ++- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 29 +++++++++++++++++++++++------ 4 files changed, 28 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0a4d14f279a..f88c35d0710 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -732,7 +732,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_output(counter, 1, regs); + perf_counter_overflow(counter, 1, regs); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 438415866fe..1116a41bc7b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,8 @@ again: continue; perf_save_and_restart(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + __pmc_generic_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 977fb15a53f..ca2d4df29e0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -491,8 +491,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs); +extern int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0a2ade2e4f1..195e976eb07 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1800,8 +1800,8 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +static void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; u64 record_type = counter->hw_event.record_type; @@ -2033,6 +2033,17 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * Generic counter overflow handling. 
+ */ + +int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_counter_output(counter, nmi, regs); + return 0; +} + /* * Generic software counter infrastructure */ @@ -2077,6 +2088,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { + enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; @@ -2092,12 +2104,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) !counter->hw_event.exclude_user) regs = task_pt_regs(current); - if (regs) - perf_counter_output(counter, 0, regs); + if (regs) { + if (perf_counter_overflow(counter, 0, regs)) + ret = HRTIMER_NORESTART; + } hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); - return HRTIMER_RESTART; + return ret; } static void perf_swcounter_overflow(struct perf_counter *counter, @@ -2105,7 +2119,10 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + /* soft-disable the counter */ + ; + } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3-70-g09d2 From 98c2aaf8be5baf7193be37fb28bce8e7327158bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 7 Apr 2009 11:30:17 +0200 Subject: x86, perfcounters: add atomic64_xchg() Complete atomic64_t support on the 32-bit side by adding atomic64_xch(). Cc: Peter Zijlstra LKML-Reference: <20090406094518.445450972@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 977250ed8b8..aff9f1fcdcd 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -291,19 +291,37 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, } /** - * atomic64_set - set atomic64 variable + * atomic64_xchg - xchg atomic64 variable * @ptr: pointer to type atomic64_t * @new_val: value to assign + * @old_val: old value that was there * - * Atomically sets the value of @ptr to @new_val. + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. */ -static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) + +static inline unsigned long long +atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) { unsigned long long old_val; do { old_val = atomic_read(ptr); } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return old_val; +} + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) +{ + atomic64_xchg(ptr, new_val); } /** -- cgit v1.2.3-70-g09d2 From 7333a8003cdc0470e8c0ae8b949cbc44f3165ff3 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 25 Mar 2009 10:50:34 +0900 Subject: x86: smarten /proc/interrupts output for new counters Now /proc/interrupts of tip tree has new counters: CNT: Performance counter interrupts Format change of output, as like that by commit: commit 7a81d9a7da03d2f27840d659f97ef140d032f609 x86: smarten /proc/interrupts output should be applied to these new counters too. 
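For reference, the effect of the "%*s" format is simply a right-aligned label whose field width is chosen at run time; a trivial user-space illustration, with printf standing in for seq_printf and prec assumed to be the width derived from the number of IRQs:

	#include <stdio.h>

	int main(void)
	{
		int prec = 4;	/* assumed: computed from nr_irqs by the caller */

		printf("%*s: %10u  Performance counter interrupts\n", prec, "CNT", 42u);
		printf("%*s: %10u  Spurious interrupts\n", prec, "SPU", 7u);
		return 0;
	}

Both labels now line up with the numeric IRQ rows regardless of how many digits the largest IRQ number has, which is what the one-line change below achieves.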
Signed-off-by: Hidetoshi Seto Cc: Jan Beulich LKML-Reference: <49C98DEA.8060208@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d465487da58..dccaaa85578 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,7 +63,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "CNT: "); + seq_printf(p, "%*s: ", prec, "CNT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); -- cgit v1.2.3-70-g09d2 From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/powerpc/mm/fault.c | 8 ++++--- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/mm/fault.c | 8 ++++--- include/linux/perf_counter.h | 14 +++++++----- kernel/perf_counter.c | 46 ++++++++++++++++++++++++-------------- 6 files changed, 49 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0697ade84dd..c9d019f1907 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_overflow(counter, 1, regs); + perf_counter_overflow(counter, 1, regs, 0); } /* diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 17bbf6f91fb..ac0e112031b 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in the @@ -312,7 +312,8 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -322,7 +323,8 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } up_read(&mm->mmap_sem); return 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1116a41bc7b..0fcbaab83f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,7 @@ again: continue; perf_save_and_restart(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, 0)) __pmc_generic_disable(counter, &counter->hw, bit); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f2d3324d921..6f9df2babe4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1142,10 +1142,12 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bd1be58c93..c22363a4f74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -101,8 +101,9 @@ enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_GROUP = 1U << 3, - PERF_RECORD_CALLCHAIN = 1U << 4, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, }; /* @@ -251,6 +252,7 @@ enum perf_event_type { * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -537,7 +539,7 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs); + int nmi, struct pt_regs *regs, u64 addr); /* * Return 1 for a software counter, 0 for a hardware counter */ @@ -547,7 +549,7 @@ static inline int is_software_counter(struct perf_counter *counter) perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); extern void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); @@ -584,8 +586,8 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static 
inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } - +perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { } static inline void perf_counter_mmap(unsigned long addr, unsigned long len, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4dc8600d282..321c57e3556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -800,7 +800,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) update_context_time(ctx); regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1810,7 +1810,7 @@ static void perf_output_end(struct perf_output_handle *handle) } static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int ret; u64 record_type = counter->hw_event.record_type; @@ -1860,6 +1860,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_ADDR) { + header.type |= PERF_RECORD_ADDR; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1892,6 +1897,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TIME) perf_output_put(&handle, time); + if (record_type & PERF_RECORD_ADDR) + perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2158,7 +2166,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, */ int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); int ret = 0; @@ -2175,7 +2183,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs); + perf_counter_output(counter, nmi, regs, addr); return ret; } @@ -2240,7 +2248,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_counter_overflow(counter, 0, regs)) + if (perf_counter_overflow(counter, 0, regs, 0)) ret = HRTIMER_NORESTART; } @@ -2250,11 +2258,11 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, addr)) /* soft-disable the counter */ ; @@ -2286,16 +2294,17 @@ static int perf_swcounter_match(struct perf_counter *counter, } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); if (counter->hw.irq_period && !neg) - perf_swcounter_overflow(counter, nmi, regs); + perf_swcounter_overflow(counter, nmi, regs, addr); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs 
*regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_counter *counter; @@ -2305,7 +2314,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs); + perf_swcounter_add(counter, nr, nmi, regs, addr); } rcu_read_unlock(); } @@ -2325,7 +2334,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) } static void __perf_swcounter_event(enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -2336,10 +2346,11 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, (*recursion)++; barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + nr, nmi, regs, addr); if (cpuctx->task_ctx) { perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, - nr, nmi, regs); + nr, nmi, regs, addr); } barrier(); @@ -2349,9 +2360,10 @@ out: put_cpu_var(perf_cpu_context); } -void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); } static void perf_swcounter_read(struct perf_counter *counter) @@ -2548,7 +2560,7 @@ void perf_tpcounter_event(int event_id) if (!regs) regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); } extern int ftrace_profile_enable(int); -- cgit v1.2.3-70-g09d2 From 0f3fd87ce43727d6b8573191ce89e874533b1429 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 13 Apr 2009 20:24:50 +0100 Subject: perf_counter: fix alignment in /proc/interrupts Trivial fix on columns alignment in /proc/interrupts file. Signed-off-by: Luis Henriques Cc: Peter Zijlstra LKML-Reference: <20090413192449.GA3920@hades.domain.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index dccaaa85578..849cfabb1fd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,7 +67,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); - seq_printf(p, "PND: "); + seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); -- cgit v1.2.3-70-g09d2 From da1a776be1ac7f78bb30ececbec4c1383163b079 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:58 +0200 Subject: perf_counter, x86: remove X86_FEATURE_ARCH_PERFMON flag for AMD cpus X86_FEATURE_ARCH_PERFMON is an Intel hardware feature that does not work on AMD CPUs. The flag is now only used in Intel specific code (especially initialization). 
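In other words, the feature test moves out of the shared init path and into the Intel-specific probe, so the AMD path never consults an Intel-only flag. A stand-alone sketch of that dispatch shape; the vendor values, helper names and return codes here are placeholders, not the kernel's.

	#include <stdio.h>
	#include <stdbool.h>

	enum vendor { VENDOR_INTEL, VENDOR_AMD, VENDOR_OTHER };

	/* Placeholder for the cpu_has(..., X86_FEATURE_ARCH_PERFMON) test. */
	static bool has_arch_perfmon(void)
	{
		return true;
	}

	static int intel_probe(void)
	{
		/* The Intel-only feature check now lives here ... */
		if (!has_arch_perfmon())
			return -1;
		puts("Intel architectural perfmon");
		return 0;
	}

	static int amd_probe(void)
	{
		/* ... so the AMD path never looks at it. */
		puts("AMD perfmon");
		return 0;
	}

	int main(void)
	{
		enum vendor v = VENDOR_AMD;	/* pretend we booted on an AMD CPU */

		switch (v) {
		case VENDOR_INTEL:
			return intel_probe();
		case VENDOR_AMD:
			return amd_probe();
		default:
			return 1;
		}
	}
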
[ Impact: refactor code ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ---- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd69c514ca2..7e4a459daa6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -420,10 +420,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - /* Enable Performance counter for K7 and later */ - if (c->x86 > 6 && c->x86 <= 0x11) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0fcbaab83f9..7d0f81dcb52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -949,6 +949,9 @@ static struct pmc_x86_ops *pmc_intel_init(void) unsigned int unused; unsigned int ebx; + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return NULL; + /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. @@ -987,9 +990,6 @@ static struct pmc_x86_ops *pmc_amd_init(void) void __init init_hw_perf_counters(void) { - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); -- cgit v1.2.3-70-g09d2 From 4138960a9251a265002b5cf07e671a49f8495381 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:00 +0200 Subject: perf_counter, x86: add default path to cpu detection This quits hw counter initialization immediately if no cpu is detected. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-4-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d0f81dcb52..d6d6529349d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -997,6 +997,8 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_AMD: pmc_ops = pmc_amd_init(); break; + default: + return; } if (!pmc_ops) return; -- cgit v1.2.3-70-g09d2 From 4295ee62660b13ddb87d41539f49b239e6e7d56f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:01 +0200 Subject: perf_counter, x86: rework pmc_amd_save_disable_all() and pmc_amd_restore_all() MSR reads and writes are expensive. This patch adds checks to avoid its usage where possible. 
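The pattern is to consult cheap per-cpu bookkeeping (the active mask, and the enable bit just read back) before touching an MSR at all. A self-contained sketch of the same skip-early pattern, with a plain array standing in for rdmsrl()/wrmsrl() and an invented mask value:

	#include <stdio.h>
	#include <stdint.h>

	#define ENABLE_BIT	(1ULL << 22)
	#define NR_COUNTERS	4

	/* Fake MSR bank; in the kernel each access is an expensive rdmsrl()/wrmsrl(). */
	static uint64_t evntsel[NR_COUNTERS];
	static unsigned long active_mask = 0x5;		/* counters 0 and 2 are in use */

	static void save_disable_all(void)
	{
		int idx;

		for (idx = 0; idx < NR_COUNTERS; idx++) {
			uint64_t val;

			/* Never programmed: skip without even reading the "MSR". */
			if (!(active_mask & (1UL << idx)))
				continue;
			val = evntsel[idx];			/* rdmsrl() */
			/* Already disabled: skip the write. */
			if (!(val & ENABLE_BIT))
				continue;
			evntsel[idx] = val & ~ENABLE_BIT;	/* wrmsrl() */
			printf("disabled counter %d\n", idx);
		}
	}

	int main(void)
	{
		evntsel[0] = ENABLE_BIT;	/* only counter 0 is actually enabled */
		save_disable_all();		/* ends up writing exactly one "MSR" */
		return 0;
	}
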
[ Impact: micro-optimization on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-5-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d6d6529349d..75a090394b6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,11 +334,13 @@ static u64 pmc_amd_save_disable_all(void) for (idx = 0; idx < nr_counters_generic; idx++) { u64 val; + if (!test_bit(idx, cpuc->active_mask)) + continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } return enabled; @@ -372,13 +374,15 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, cpuc->active_mask)) { - u64 val; + u64 val; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + continue; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } } -- cgit v1.2.3-70-g09d2 From 527e26af3741a2168986d8b82653ffe173891324 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:02 +0200 Subject: perf_counter, x86: protect per-cpu variables with compile barriers only Per-cpu variables needn't to be protected with cpu barriers (smp_wmb()). Protection is only needed for preemption on the same cpu (rescheduling or the nmi handler). This can be done using a compiler barrier only. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-6-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75a090394b6..ad663d5ad2d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -673,7 +673,7 @@ try_generic: /* * Make it visible before enabling the hw: */ - smp_wmb(); + barrier(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -745,7 +745,7 @@ static void pmc_generic_disable(struct perf_counter *counter) * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: */ - smp_wmb(); + barrier(); /* * Drain the remaining delta count out of a counter -- cgit v1.2.3-70-g09d2 From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. 
Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 25 +++++++------- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++----------- include/linux/perf_counter.h | 9 +++-- kernel/perf_counter.c | 68 ++++++++++++++++++-------------------- 4 files changed, 66 insertions(+), 73 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bd76d0fa2c3..d9bbe5efc64 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -256,7 +256,7 @@ static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) return 0; } -static void power_perf_read(struct perf_counter *counter) +static void power_pmu_read(struct perf_counter *counter) { long val, delta, prev; @@ -405,7 +405,7 @@ void hw_perf_restore(u64 disable) for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { - power_perf_read(counter); + power_pmu_read(counter); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; } @@ -477,7 +477,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) counter->oncpu = cpu; counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; if (is_software_counter(counter)) - counter->hw_ops->enable(counter); + counter->pmu->enable(counter); } /* @@ -533,7 +533,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, * re-enable the PMU in order to get hw_perf_restore to do the * actual work of reconfiguring the PMU. */ -static int power_perf_enable(struct perf_counter *counter) +static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; @@ -573,7 +573,7 @@ static int power_perf_enable(struct perf_counter *counter) /* * Remove a counter from the PMU. 
*/ -static void power_perf_disable(struct perf_counter *counter) +static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; @@ -583,7 +583,7 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_save(flags); pmudis = hw_perf_save_disable(); - power_perf_read(counter); + power_pmu_read(counter); cpuhw = &__get_cpu_var(cpu_hw_counters); for (i = 0; i < cpuhw->n_counters; ++i) { @@ -607,10 +607,10 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_restore(flags); } -struct hw_perf_counter_ops power_perf_ops = { - .enable = power_perf_enable, - .disable = power_perf_disable, - .read = power_perf_read +struct pmu power_pmu = { + .enable = power_pmu_enable, + .disable = power_pmu_disable, + .read = power_pmu_read, }; /* Number of perf_counters counting hardware events */ @@ -631,8 +631,7 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { unsigned long ev; struct perf_counter *ctrs[MAX_HWCOUNTERS]; @@ -705,7 +704,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &power_perf_ops; + return &power_pmu; } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ad663d5ad2d..95de980c74a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__pmc_generic_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) +__x86_pmu_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter, } static void -__pmc_generic_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +__x86_pmu_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); @@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static int pmc_generic_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -667,7 +667,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; /* @@ -676,7 +676,7 @@ try_generic: barrier(); __hw_perf_counter_set_period(counter, hwc, idx); - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); return 0; } @@ -731,13 +731,13 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void pmc_generic_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ 
-767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); } /* @@ -805,7 +805,7 @@ again: perf_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __pmc_generic_disable(counter, &counter->hw, bit); + __x86_pmu_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); @@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void pmc_generic_read(struct perf_counter *counter) +static void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .enable = pmc_generic_enable, - .disable = pmc_generic_disable, - .read = pmc_generic_read, +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, }; -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { int err; @@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &x86_perf_counter_ops; + return &pmu; } /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index be10b3ffe32..c3db52dc876 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -334,9 +334,9 @@ struct hw_perf_counter { struct perf_counter; /** - * struct hw_perf_counter_ops - performance counter hw ops + * struct pmu - generic performance monitoring unit */ -struct hw_perf_counter_ops { +struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); @@ -381,7 +381,7 @@ struct perf_counter { struct list_head sibling_list; int nr_siblings; struct perf_counter *group_leader; - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; enum perf_counter_active_state state; enum perf_counter_active_state prev_state; @@ -519,8 +519,7 @@ struct perf_cpu_context { */ extern int perf_max_counters; -extern const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter); +extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09396098dd0..582108addef 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -52,8 +52,7 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { return NULL; } @@ -124,7 +123,7 @@ counter_sched_out(struct perf_counter *counter, counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_stopped = ctx->time; - counter->hw_ops->disable(counter); + counter->pmu->disable(counter); counter->oncpu = -1; if (!is_software_counter(counter)) @@ -417,7 +416,7 @@ counter_sched_in(struct perf_counter *counter, */ smp_wmb(); - if (counter->hw_ops->enable(counter)) { + if (counter->pmu->enable(counter)) { 
counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; return -EAGAIN; @@ -1096,7 +1095,7 @@ static void __read(void *info) local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); - counter->hw_ops->read(counter); + counter->pmu->read(counter); update_counter_times(counter); local_irq_restore(flags); } @@ -1922,7 +1921,7 @@ static void perf_counter_output(struct perf_counter *counter, leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) - sub->hw_ops->read(sub); + sub->pmu->read(sub); group_entry.event = sub->hw_event.config; group_entry.counter = atomic64_read(&sub->count); @@ -2264,7 +2263,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) struct pt_regs *regs; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->hw_ops->read(counter); + counter->pmu->read(counter); regs = get_irq_regs(); /* @@ -2410,7 +2409,7 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_generic = { +static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, @@ -2460,7 +2459,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) cpu_clock_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_clock = { +static const struct pmu perf_ops_cpu_clock = { .enable = cpu_clock_perf_counter_enable, .disable = cpu_clock_perf_counter_disable, .read = cpu_clock_perf_counter_read, @@ -2522,7 +2521,7 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, time); } -static const struct hw_perf_counter_ops perf_ops_task_clock = { +static const struct pmu perf_ops_task_clock = { .enable = task_clock_perf_counter_enable, .disable = task_clock_perf_counter_disable, .read = task_clock_perf_counter_read, @@ -2574,7 +2573,7 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { +static const struct pmu perf_ops_cpu_migrations = { .enable = cpu_migrations_perf_counter_enable, .disable = cpu_migrations_perf_counter_disable, .read = cpu_migrations_perf_counter_read, @@ -2600,8 +2599,7 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) ftrace_profile_disable(perf_event_id(&counter->hw_event)); } -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { int event_id = perf_event_id(&counter->hw_event); int ret; @@ -2616,18 +2614,16 @@ tp_perf_counter_init(struct perf_counter *counter) return &perf_ops_generic; } #else -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { return NULL; } #endif -static const struct hw_perf_counter_ops * -sw_perf_counter_init(struct perf_counter *counter) +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; - const struct hw_perf_counter_ops *hw_ops = NULL; + const struct pmu *pmu = NULL; struct hw_perf_counter *hwc = &counter->hw; /* @@ -2639,7 +2635,7 @@ sw_perf_counter_init(struct perf_counter *counter) */ 
switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2650,9 +2646,9 @@ sw_perf_counter_init(struct perf_counter *counter) * use the cpu_clock counter instead. */ if (counter->ctx->task) - hw_ops = &perf_ops_task_clock; + pmu = &perf_ops_task_clock; else - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2661,18 +2657,18 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_generic; + pmu = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_cpu_migrations; + pmu = &perf_ops_cpu_migrations; break; } - if (hw_ops) + if (pmu) hwc->irq_period = hw_event->irq_period; - return hw_ops; + return pmu; } /* @@ -2685,7 +2681,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, struct perf_counter *group_leader, gfp_t gfpflags) { - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; struct perf_counter *counter; long err; @@ -2713,46 +2709,46 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; - counter->hw_ops = NULL; + counter->pmu = NULL; counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; - hw_ops = NULL; + pmu = NULL; if (perf_event_raw(hw_event)) { - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); goto done; } switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); break; case PERF_TYPE_SOFTWARE: - hw_ops = sw_perf_counter_init(counter); + pmu = sw_perf_counter_init(counter); break; case PERF_TYPE_TRACEPOINT: - hw_ops = tp_perf_counter_init(counter); + pmu = tp_perf_counter_init(counter); break; } done: err = 0; - if (!hw_ops) + if (!pmu) err = -EINVAL; - else if (IS_ERR(hw_ops)) - err = PTR_ERR(hw_ops); + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); if (err) { kfree(counter); return ERR_PTR(err); } - counter->hw_ops = hw_ops; + counter->pmu = pmu; if (counter->hw_event.mmap) atomic_inc(&nr_mmap_tracking); -- cgit v1.2.3-70-g09d2 From 5f4ec28ffe77c840354cce1820a3436106e9e0f1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:04 +0200 Subject: perf_counter, x86: rename struct pmc_x86_ops into struct x86_pmu This patch renames struct pmc_x86_ops into struct x86_pmu. It introduces a structure to describe an x86 model specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. 
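Conceptually the structure is a model descriptor: function pointers plus per-model constants such as the MSR bases. A much-reduced sketch of that shape; the field names follow the patch, but the demo values and function bodies are invented.

	#include <stdio.h>
	#include <stdint.h>

	struct demo_pmu {
		uint64_t     (*event_map)(int event);
		void         (*enable)(int idx, uint64_t config);
		unsigned int   eventsel;	/* base MSR of the event-select registers */
		unsigned int   perfctr;		/* base MSR of the counter registers */
		int            max_events;
	};

	static uint64_t demo_event_map(int event)
	{
		return 0x3c + (uint64_t)event;		/* made-up encoding */
	}

	static void demo_enable(int idx, uint64_t config)
	{
		printf("write %#llx to eventsel %u\n",
		       (unsigned long long)config, (unsigned)idx);
	}

	static const struct demo_pmu pmu = {
		.event_map	= demo_event_map,
		.enable		= demo_enable,
		.eventsel	= 0x186,
		.perfctr	= 0xc1,
		.max_events	= 7,
	};

	int main(void)
	{
		/* Generic code drives the hardware only through the descriptor. */
		pmu.enable(0, pmu.event_map(0));
		return 0;
	}
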
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-8-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 135 +++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 67 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 95de980c74a..808a1a11346 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,9 +44,9 @@ struct cpu_hw_counters { }; /* - * struct pmc_x86_ops - performance counter x86 ops + * struct x86_pmu - generic x86 pmu */ -struct pmc_x86_ops { +struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -60,7 +60,7 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops __read_mostly; +static struct x86_pmu *x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -82,12 +82,12 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static u64 pmc_intel_event_map(int event) +static u64 intel_pmu_event_map(int event) { return intel_perfmon_event_map[event]; } -static u64 pmc_intel_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL @@ -114,12 +114,12 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static u64 pmc_amd_event_map(int event) +static u64 amd_pmu_event_map(int event) { return amd_perfmon_event_map[event]; } -static u64 pmc_amd_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(pmc_ops->eventsel + i); + release_evntsel_nmi(x86_pmu->eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(pmc_ops->perfctr + i); + release_perfctr_nmi(x86_pmu->perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(pmc_ops->perfctr + i); - release_evntsel_nmi(pmc_ops->eventsel + i); + release_perfctr_nmi(x86_pmu->perfctr + i); + release_evntsel_nmi(x86_pmu->eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -293,14 +293,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= x86_pmu->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= 
pmc_ops->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -308,7 +308,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 pmc_intel_save_disable_all(void) +static u64 intel_pmu_save_disable_all(void) { u64 ctrl; @@ -318,7 +318,7 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } -static u64 pmc_amd_save_disable_all(void) +static u64 amd_pmu_save_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int enabled, idx; @@ -327,7 +327,8 @@ static u64 pmc_amd_save_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that pcm_amd_enable() does the right thing. + * counters proper, so that amd_pmu_enable_counter() does the + * right thing. */ barrier(); @@ -351,19 +352,19 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->save_disable_all(); + return x86_pmu->save_disable_all(); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void pmc_intel_restore_all(u64 ctrl) +static void intel_pmu_restore_all(u64 ctrl) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } -static void pmc_amd_restore_all(u64 ctrl) +static void amd_pmu_restore_all(u64 ctrl) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; @@ -391,14 +392,14 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->restore_all(ctrl); + x86_pmu->restore_all(ctrl); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 pmc_intel_get_status(u64 mask) +static u64 intel_pmu_get_status(u64 mask) { u64 status; @@ -407,7 +408,7 @@ static u64 pmc_intel_get_status(u64 mask) return status; } -static u64 pmc_amd_get_status(u64 mask) +static u64 amd_pmu_get_status(u64 mask) { u64 status = 0; int idx; @@ -432,15 +433,15 @@ static u64 hw_perf_get_status(u64 mask) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->get_status(mask); + return x86_pmu->get_status(mask); } -static void pmc_intel_ack_status(u64 ack) +static void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void pmc_amd_ack_status(u64 ack) +static void amd_pmu_ack_status(u64 ack) { } @@ -449,16 +450,16 @@ static void hw_perf_ack_status(u64 ack) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->ack_status(ack); + x86_pmu->ack_status(ack); } -static void pmc_intel_enable(int idx, u64 config) +static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void pmc_amd_enable(int idx, u64 config) +static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -474,15 +475,15 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->enable(idx, config); + x86_pmu->enable(idx, config); } -static void pmc_intel_disable(int idx, u64 config) +static void intel_pmu_disable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); } -static void pmc_amd_disable(int idx, u64 config) +static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -496,7 +497,7 @@ static void hw_perf_disable(int idx, u64 config) if 
(unlikely(!perf_counters_initialized)) return; - pmc_ops->disable(idx, config); + x86_pmu->disable(idx, config); } static inline void @@ -613,11 +614,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -661,8 +662,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = pmc_ops->eventsel; - hwc->counter_base = pmc_ops->perfctr; + hwc->config_base = x86_pmu->eventsel; + hwc->counter_base = x86_pmu->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -710,8 +711,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); - rdmsrl(pmc_ops->perfctr + idx, pmc_count); + rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -918,35 +919,35 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -static struct pmc_x86_ops pmc_intel_ops = { - .save_disable_all = pmc_intel_save_disable_all, - .restore_all = pmc_intel_restore_all, - .get_status = pmc_intel_get_status, - .ack_status = pmc_intel_ack_status, - .enable = pmc_intel_enable, - .disable = pmc_intel_disable, +static struct x86_pmu intel_pmu = { + .save_disable_all = intel_pmu_save_disable_all, + .restore_all = intel_pmu_restore_all, + .get_status = intel_pmu_get_status, + .ack_status = intel_pmu_ack_status, + .enable = intel_pmu_enable_counter, + .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = pmc_intel_event_map, - .raw_event = pmc_intel_raw_event, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; -static struct pmc_x86_ops pmc_amd_ops = { - .save_disable_all = pmc_amd_save_disable_all, - .restore_all = pmc_amd_restore_all, - .get_status = pmc_amd_get_status, - .ack_status = pmc_amd_ack_status, - .enable = pmc_amd_enable, - .disable = pmc_amd_disable, +static struct x86_pmu amd_pmu = { + .save_disable_all = amd_pmu_save_disable_all, + .restore_all = amd_pmu_restore_all, + .get_status = amd_pmu_get_status, + .ack_status = amd_pmu_ack_status, + .enable = amd_pmu_enable_counter, + .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, - .event_map = pmc_amd_event_map, - .raw_event = pmc_amd_raw_event, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct pmc_x86_ops *pmc_intel_init(void) +static struct x86_pmu *intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -977,10 +978,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &pmc_intel_ops; + return &intel_pmu; 
} -static struct pmc_x86_ops *pmc_amd_init(void) +static struct x86_pmu *amd_pmu_init(void) { nr_counters_generic = 4; nr_counters_fixed = 0; @@ -989,22 +990,22 @@ static struct pmc_x86_ops *pmc_amd_init(void) pr_info("AMD Performance Monitoring support detected.\n"); - return &pmc_amd_ops; + return &amd_pmu; } void __init init_hw_perf_counters(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - pmc_ops = pmc_intel_init(); + x86_pmu = intel_pmu_init(); break; case X86_VENDOR_AMD: - pmc_ops = pmc_amd_init(); + x86_pmu = amd_pmu_init(); break; default: return; } - if (!pmc_ops) + if (!x86_pmu) return; pr_info("... num counters: %d\n", nr_counters_generic); -- cgit v1.2.3-70-g09d2 From 39d81eab2374d71b2d9c82f66258a1a4f57ddd2e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:05 +0200 Subject: perf_counter, x86: make interrupt handler model specific This separates the perfcounter interrupt handler for AMD and Intel cpus. The AMD interrupt handler implementation is a follow-on patch. [ Impact: refactor and clean up code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-9-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 808a1a11346..9d90de0bd0b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -4,6 +4,7 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * Copyright(C) 2009 Jaswinder Singh Rajput + * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter * * For licencing details see kernel-base/COPYING */ @@ -47,6 +48,7 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -241,6 +243,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; + /* disable temporarily */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -ENOSYS; + if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -780,7 +786,7 @@ static void perf_save_and_restart(struct perf_counter *counter) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; @@ -827,6 +833,8 @@ out: return ret; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } + void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; @@ -851,7 +859,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - __smp_perf_counter_interrupt(regs, 0); + x86_pmu->handle_irq(regs, 0); irq_exit(); } @@ -908,7 +916,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = __smp_perf_counter_interrupt(regs, 1); + ret = x86_pmu->handle_irq(regs, 1); return ret ? 
NOTIFY_STOP : NOTIFY_OK; } @@ -920,6 +928,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, .get_status = intel_pmu_get_status, @@ -934,6 +943,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, .get_status = amd_pmu_get_status, -- cgit v1.2.3-70-g09d2 From b7f8859a8ed1937e2139c17b84878f1d413fa659 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:06 +0200 Subject: perf_counter, x86: remove get_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-10-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 39 +++++--------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9d90de0bd0b..d0bb02919c6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - u64 (*get_status)(u64); void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); @@ -405,41 +404,15 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static u64 amd_pmu_get_status(u64 mask) -{ - u64 status = 0; - int idx; - - for (idx = 0; idx < nr_counters_generic; idx++) { - s64 val; - - if (!(mask & (1 << idx))) - continue; - - rdmsrl(MSR_K7_PERFCTR0 + idx, val); - val <<= (64 - counter_value_bits); - if (val >= 0) - status |= (1 << idx); - } - - return status; -} - -static u64 hw_perf_get_status(u64 mask) -{ if (unlikely(!perf_counters_initialized)) return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - return x86_pmu->get_status(mask); + return status; } static void intel_pmu_ack_status(u64 ack) @@ -795,7 +768,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = hw_perf_save_disable(); - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) goto out; @@ -820,7 +793,7 @@ again: /* * Repeat if there is more work to be done: */ - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -931,7 +904,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .get_status = intel_pmu_get_status, .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, @@ -946,7 +918,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .get_status = amd_pmu_get_status, .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, 
.disable = amd_pmu_disable_counter, -- cgit v1.2.3-70-g09d2 From dee5d9067ca78b317538fd67930be4e09a83dbc5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:07 +0200 Subject: perf_counter, x86: remove ack_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-11-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d0bb02919c6..6bbdc16cc69 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); unsigned eventsel; @@ -415,23 +414,11 @@ static inline u64 intel_pmu_get_status(u64 mask) return status; } -static void intel_pmu_ack_status(u64 ack) +static inline void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void amd_pmu_ack_status(u64 ack) -{ -} - -static void hw_perf_ack_status(u64 ack) -{ - if (unlikely(!perf_counters_initialized)) - return; - - x86_pmu->ack_status(ack); -} - static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, @@ -788,7 +775,7 @@ again: __x86_pmu_disable(counter, &counter->hw, bit); } - hw_perf_ack_status(ack); + intel_pmu_ack_status(ack); /* * Repeat if there is more work to be done: @@ -904,7 +891,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -918,7 +904,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, -- cgit v1.2.3-70-g09d2 From 26816c287e13eedc67bc4ed0cd40c138314b7c7d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:08 +0200 Subject: perf_counter, x86: rename __hw_perf_counter_set_period into x86_perf_counter_set_period [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-12-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6bbdc16cc69..fa6541d781b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -498,7 +498,7 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * To be called with the counter disabled in hw: */ static void -__hw_perf_counter_set_period(struct perf_counter *counter, +x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); @@ -642,7 +642,7 
@@ try_generic: */ barrier(); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); return 0; @@ -731,7 +731,7 @@ static void perf_save_and_restart(struct perf_counter *counter) int idx = hwc->idx; x86_perf_counter_update(counter, hwc, idx); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) __x86_pmu_enable(counter, hwc, idx); -- cgit v1.2.3-70-g09d2 From 55de0f2e57994b525324bf0d04d242d9358a2417 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:09 +0200 Subject: perf_counter, x86: rename intel only functions [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-13-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fa6541d781b..5a52d73ccfa 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -725,7 +725,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void perf_save_and_restart(struct perf_counter *counter) +static void intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; @@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); int ret = 0; - cpuc->throttle_ctrl = hw_perf_save_disable(); + cpuc->throttle_ctrl = intel_pmu_save_disable_all(); status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) @@ -770,7 +770,7 @@ again: if (!counter) continue; - perf_save_and_restart(counter); + intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) __x86_pmu_disable(counter, &counter->hw, bit); } @@ -788,7 +788,7 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - hw_perf_restore(cpuc->throttle_ctrl); + intel_pmu_restore_all(cpuc->throttle_ctrl); return ret; } -- cgit v1.2.3-70-g09d2 From 72eae04d3a3075c26d39e1e685acfc8e8c29db64 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:10 +0200 Subject: perf_counter, x86: modify initialization of struct x86_pmu This patch adds an error handler and changes initialization of struct x86_pmu. No functional changes. Needed for follow-on patches. 
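The net effect is that the vendor probes now return an errno-style result and the generic init bails out on failure instead of testing a pointer. A compact sketch of that control flow; the vendor codes and error values below are stand-ins.

	#include <stdio.h>
	#include <errno.h>

	/* Placeholder probes; the real ones inspect CPUID and set up the pmu. */
	static int intel_probe(void)	{ return -ENODEV; }	/* pretend: too old */
	static int amd_probe(void)	{ return 0; }

	static void init_hw_counters(int vendor)
	{
		int err;

		switch (vendor) {
		case 0:
			err = intel_probe();
			break;
		case 1:
			err = amd_probe();
			break;
		default:
			return;			/* unknown vendor: quit immediately */
		}

		if (err != 0) {
			printf("no usable PMU (err=%d)\n", err);
			return;
		}
		printf("PMU initialized\n");
	}

	int main(void)
	{
		init_hw_counters(1);	/* 1 stands in for the AMD vendor id */
		return 0;
	}
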
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-14-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a52d73ccfa..7c72a942363 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -913,7 +913,7 @@ static struct x86_pmu amd_pmu = { .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct x86_pmu *intel_pmu_init(void) +static int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -921,7 +921,7 @@ static struct x86_pmu *intel_pmu_init(void) unsigned int ebx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return NULL; + return -ENODEV; /* * Check whether the Architectural PerfMon supports @@ -929,49 +929,54 @@ static struct x86_pmu *intel_pmu_init(void) */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return NULL; + return -ENODEV; intel_perfmon_version = eax.split.version_id; if (intel_perfmon_version < 2) - return NULL; + return -ENODEV; pr_info("Intel Performance Monitoring support detected.\n"); pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); + x86_pmu = &intel_pmu; + nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &intel_pmu; + return 0; } -static struct x86_pmu *amd_pmu_init(void) +static int amd_pmu_init(void) { + x86_pmu = &amd_pmu; + nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); - - return &amd_pmu; + return 0; } void __init init_hw_perf_counters(void) { + int err; + switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - x86_pmu = intel_pmu_init(); + err = intel_pmu_init(); break; case X86_VENDOR_AMD: - x86_pmu = amd_pmu_init(); + err = amd_pmu_init(); break; default: return; } - if (!x86_pmu) + if (err != 0) return; pr_info("... num counters: %d\n", nr_counters_generic); -- cgit v1.2.3-70-g09d2 From 4a06bd8508f65ad1dd5cd2046b85694813fa36a2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:11 +0200 Subject: perf_counter, x86: make x86_pmu data a static struct Instead of using a pointer to reference to the x86 pmu we now have one single data structure that is initialized at the beginning. This saves the pointer access when using this memory. 
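The cost being optimized away is the extra pointer load on every access; a minimal stand-alone illustration of the copy-a-template-at-boot pattern (not kernel code — only the two MSR base addresses are taken from the real header):

    #include <stdio.h>

    struct pmu_ops {
        const char *name;
        unsigned int eventsel;      /* event-select MSR base */
        unsigned int perfctr;       /* counter MSR base */
    };

    /* vendor template, chosen once at boot */
    static const struct pmu_ops intel_template = { "Intel", 0x186, 0xc1 };

    static struct pmu_ops pmu;      /* one flat global object */

    static int intel_init(void)
    {
        pmu = intel_template;       /* copy, instead of pmu_ptr = &template */
        return 0;
    }

    int main(void)
    {
        if (intel_init())
            return 1;
        /* hot paths read pmu.field directly; with a pointer each access
         * would first load pmu_ptr and then dereference it */
        printf("%s: eventsel %#x, perfctr %#x\n",
               pmu.name, pmu.eventsel, pmu.perfctr);
        return 0;
    }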
[ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-15-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7c72a942363..68597d76338 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -60,7 +60,7 @@ struct x86_pmu { int max_events; }; -static struct x86_pmu *x86_pmu __read_mostly; +static struct x86_pmu x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu->eventsel + i); + release_evntsel_nmi(x86_pmu.eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu->perfctr + i); + release_perfctr_nmi(x86_pmu.perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(x86_pmu->perfctr + i); - release_evntsel_nmi(x86_pmu->eventsel + i); + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -297,14 +297,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= x86_pmu->max_events) + if (perf_event_id(hw_event) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -356,7 +356,7 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return x86_pmu->save_disable_all(); + return x86_pmu.save_disable_all(); } /* * Exported because of ACPI idle @@ -396,7 +396,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->restore_all(ctrl); + x86_pmu.restore_all(ctrl); } /* * Exported because of ACPI idle @@ -441,7 +441,7 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->enable(idx, config); + x86_pmu.enable(idx, config); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -463,7 +463,7 @@ static void hw_perf_disable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->disable(idx, config); + x86_pmu.disable(idx, config); } static inline void @@ -580,11 +580,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = 
hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -628,8 +628,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = x86_pmu->eventsel; - hwc->counter_base = x86_pmu->perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->counter_base = x86_pmu.perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -677,8 +677,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu->perfctr + idx, pmc_count); + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -819,7 +819,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - x86_pmu->handle_irq(regs, 0); + x86_pmu.handle_irq(regs, 0); irq_exit(); } @@ -876,7 +876,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu->handle_irq(regs, 1); + ret = x86_pmu.handle_irq(regs, 1); return ret ? NOTIFY_STOP : NOTIFY_OK; } @@ -940,7 +940,7 @@ static int intel_pmu_init(void) pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... 
mask length: %d\n", eax.split.mask_length); - x86_pmu = &intel_pmu; + x86_pmu = intel_pmu; nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -951,7 +951,7 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { - x86_pmu = &amd_pmu; + x86_pmu = amd_pmu; nr_counters_generic = 4; nr_counters_fixed = 0; -- cgit v1.2.3-70-g09d2 From 0933e5c6a680ba8d8d786a6f7fa377b7ec0d1e49 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:12 +0200 Subject: perf_counter, x86: move counter parameters to struct x86_pmu [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-16-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 80 ++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 68597d76338..75dbb1f0900 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,16 +24,7 @@ #include static bool perf_counters_initialized __read_mostly; - -/* - * Number of (generic) HW counters: - */ -static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; -static u64 counter_value_mask __read_mostly; -static int counter_value_bits __read_mostly; - -static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; @@ -58,6 +49,10 @@ struct x86_pmu { u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; + int num_counters; + int num_counters_fixed; + int counter_bits; + u64 counter_mask; }; static struct x86_pmu x86_pmu __read_mostly; @@ -183,12 +178,12 @@ static bool reserve_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -199,7 +194,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = nr_counters_generic; + i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -215,7 +210,7 @@ static void release_pmc_hardware(void) { int i; - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } @@ -336,7 +331,7 @@ static u64 amd_pmu_save_disable_all(void) */ barrier(); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -378,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) if (!ctrl) return; - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -527,7 +522,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->prev_count, (u64)-left); err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & counter_value_mask); + (u64)(-left) & x86_pmu.counter_mask); } static inline void @@ -621,8 +616,9 @@ static int x86_pmu_enable(struct perf_counter *counter) /* Try to get the previous 
generic counter again */ if (test_and_set_bit(idx, cpuc->used)) { try_generic: - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) + idx = find_first_zero_bit(cpuc->used, + x86_pmu.num_counters); + if (idx == x86_pmu.num_counters) return -EAGAIN; set_bit(idx, cpuc->used); @@ -654,7 +650,7 @@ void perf_counter_print_debug(void) struct cpu_hw_counters *cpuc; int cpu, idx; - if (!nr_counters_generic) + if (!x86_pmu.num_counters) return; local_irq_disable(); @@ -676,7 +672,7 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); @@ -689,7 +685,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < nr_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -911,6 +907,9 @@ static struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 4, + .counter_bits = 48, + .counter_mask = (1ULL << 48) - 1, }; static int intel_pmu_init(void) @@ -941,10 +940,10 @@ static int intel_pmu_init(void) pr_info("... mask length: %d\n", eax.split.mask_length); x86_pmu = intel_pmu; - - nr_counters_generic = eax.split.num_counters; - nr_counters_fixed = edx.split.num_counters_fixed; - counter_value_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; return 0; } @@ -952,12 +951,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - - nr_counters_generic = 4; - nr_counters_fixed = 0; - counter_value_mask = 0x0000FFFFFFFFFFFFULL; - counter_value_bits = 48; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -979,25 +972,26 @@ void __init init_hw_perf_counters(void) if (err != 0) return; - pr_info("... num counters: %d\n", nr_counters_generic); - if (nr_counters_generic > X86_PMC_MAX_GENERIC) { - nr_counters_generic = X86_PMC_MAX_GENERIC; + pr_info("... num counters: %d\n", x86_pmu.num_counters); + if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_counters_generic, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_counters_generic) - 1; - perf_max_counters = nr_counters_generic; + perf_counter_mask = (1 << x86_pmu.num_counters) - 1; + perf_max_counters = x86_pmu.num_counters; - pr_info("... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - if (nr_counters_fixed > X86_PMC_MAX_FIXED) { - nr_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); } - pr_info("... fixed counters: %d\n", nr_counters_fixed); + pr_info("... 
fixed counters: %d\n", x86_pmu.num_counters_fixed); - perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + perf_counter_mask |= + ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; -- cgit v1.2.3-70-g09d2 From faa28ae018ed004a22aa4a7704e04ccdde4a941e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:13 +0200 Subject: perf_counter, x86: make pmu version generic This makes the use of the version variable generic. Also, some debug messages have been generalized. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-17-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75dbb1f0900..15d2c03e16f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -39,6 +39,8 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + const char *name; + int version; int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); @@ -61,8 +63,6 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; -static __read_mostly int intel_perfmon_version; - /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -658,7 +658,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (intel_perfmon_version >= 2) { + if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -884,6 +884,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .name = "Intel", .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, @@ -897,6 +898,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, @@ -918,6 +920,7 @@ static int intel_pmu_init(void) union cpuid10_eax eax; unsigned int unused; unsigned int ebx; + int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return -ENODEV; @@ -930,16 +933,12 @@ static int intel_pmu_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return -ENODEV; - intel_perfmon_version = eax.split.version_id; - if (intel_perfmon_version < 2) + version = eax.split.version_id; + if (version < 2) return -ENODEV; - pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", intel_perfmon_version); - pr_info("... bit width: %d\n", eax.split.bit_width); - pr_info("... 
mask length: %d\n", eax.split.mask_length); - x86_pmu = intel_pmu; + x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; x86_pmu.counter_bits = eax.split.bit_width; @@ -951,7 +950,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -972,6 +970,10 @@ void __init init_hw_perf_counters(void) if (err != 0) return; + pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... num counters: %d\n", x86_pmu.num_counters); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { x86_pmu.num_counters = X86_PMC_MAX_GENERIC; -- cgit v1.2.3-70-g09d2 From bb775fc2d1dcd1aa6eafde37a8289ba2d80783aa Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:14 +0200 Subject: perf_counter, x86: make x86_pmu_read() static inline [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-18-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 15d2c03e16f..3f3ae477a7d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1002,7 +1002,7 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -- cgit v1.2.3-70-g09d2 From 93904966934193204ad08e951f806d5631c29eb3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:15 +0200 Subject: perf_counter, x86: rename cpuc->active_mask This is to have a consistent naming scheme with cpuc->used. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-19-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f3ae477a7d..9ec51a662db 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -29,9 +29,9 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -334,7 +334,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -376,7 +376,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -424,7 +424,7 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active_mask); + set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -448,7 +448,7 @@ static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, cpuc->active_mask); + clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } -- cgit v1.2.3-70-g09d2 From 095342389e2ed8deed07b3076f990260ce3c7c9f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:16 +0200 Subject: perf_counter, x86: generic use of cpuc->active cpuc->active will now be used to indicate an enabled counter which implies also valid pointers of cpuc->counters[]. In contrast, cpuc->used only locks the counter, but it can be still uninitialized. 
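The ordering rules behind the two bitmaps — used only reserves a slot, active additionally promises that counters[idx] is safe for the interrupt handler to dereference — can be modelled with a small stand-alone program (plain C, illustrative only):

    #include <stdio.h>

    #define NCTRS 4

    struct counter { long value; };

    static struct counter *slots[NCTRS];
    static unsigned long used_bits, active_bits;    /* two separate bitmaps */

    static int claim_slot(void)             /* "used": slot merely reserved */
    {
        for (int i = 0; i < NCTRS; i++) {
            if (!(used_bits & (1UL << i))) {
                used_bits |= 1UL << i;
                return i;
            }
        }
        return -1;
    }

    static void activate(int idx, struct counter *c)
    {
        slots[idx] = c;                     /* pointer valid from here on */
        active_bits |= 1UL << idx;          /* now the IRQ path may use it */
    }

    static void deactivate(int idx)
    {
        active_bits &= ~(1UL << idx);       /* stop the IRQ path first */
        slots[idx] = NULL;
        used_bits &= ~(1UL << idx);         /* finally release the slot */
    }

    int main(void)
    {
        struct counter c = { 0 };
        int idx = claim_slot();

        activate(idx, &c);
        printf("slot %d: used=%d active=%d\n", idx,
               !!(used_bits & (1UL << idx)), !!(active_bits & (1UL << idx)));
        deactivate(idx);
        return 0;
    }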
[ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-20-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9ec51a662db..f7fd4a35515 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -424,7 +424,6 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -446,9 +445,6 @@ static void intel_pmu_disable_counter(int idx, u64 config) static void amd_pmu_disable_counter(int idx, u64 config) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } @@ -633,10 +629,7 @@ try_generic: __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - /* - * Make it visible before enabling the hw: - */ - barrier(); + set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); @@ -700,10 +693,13 @@ static void x86_pmu_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active); __x86_pmu_disable(counter, hwc, idx); - clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; /* * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: @@ -715,6 +711,8 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + cpuc->counters[idx] = NULL; + clear_bit(idx, cpuc->used); } /* @@ -763,7 +761,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!counter) + if (!test_bit(bit, cpuc->active)) continue; intel_pmu_save_and_restart(counter); -- cgit v1.2.3-70-g09d2 From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:17 +0200 Subject: perf_counter, x86: consistent use of type int for counter index The type of counter index is sometimes implemented as unsigned int. This patch changes this to have a consistent usage of int. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- include/linux/perf_counter.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7fd4a35515..d8beebeb270 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config) static inline void __pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter, static inline void __x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, static inline void __pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - unsigned int idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c3db52dc876..41aed427005 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -318,7 +318,7 @@ struct hw_perf_counter { unsigned long config_base; unsigned long counter_base; int nmi; - unsigned int idx; + int idx; }; union { /* software */ atomic64_t count; -- cgit v1.2.3-70-g09d2 From 7c90cc45f89af4dd4617f97d452740ad95b800d5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:18 +0200 Subject: perf_counter, x86: rework counter enable functions There is vendor specific code in generic x86 code, and there is vendor specific code that could be generic. This patch introduces x86_pmu_enable_counter() for x86 generic code. Fixed counter code for Intel is moved to Intel only functions. In the end, checks and calls via function pointers were reduced to the necessary. Also, the internal function i/f changed. 
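The resulting call structure — one generic MSR write shared by both vendors, Intel routing fixed-purpose counters to a separate control register, AMD gating on the software enable flag — can be sketched as follows (illustrative model only; just the enable-bit position, bit 22, comes from the real header):

    #include <stdbool.h>
    #include <stdio.h>

    #define ENABLE_BIT (1u << 22)   /* ARCH_PERFMON_EVENTSEL0_ENABLE */

    struct hwc { unsigned int config; bool is_fixed; };

    /* shared helper: event-select config plus the enable bit */
    static void generic_enable(const struct hwc *h, int idx)
    {
        printf("wrmsr(eventsel + %d, %#x)\n", idx, h->config | ENABLE_BIT);
    }

    static void intel_enable(const struct hwc *h, int idx)
    {
        if (h->is_fixed) {          /* fixed counters live behind a separate
                                       control MSR on Intel */
            printf("update fixed-counter control for idx %d\n", idx);
            return;
        }
        generic_enable(h, idx);
    }

    static void amd_enable(const struct hwc *h, int idx, bool pmu_enabled)
    {
        if (pmu_enabled)            /* AMD lacks a global enable register, so
                                       honour the software flag per counter;
                                       the real code otherwise rewrites the
                                       config with the enable bit clear */
            generic_enable(h, idx);
    }

    int main(void)
    {
        struct hwc h = { .config = 0x76, .is_fixed = false }; /* sample config */
        intel_enable(&h, 0);
        amd_enable(&h, 0, true);
        return 0;
    }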
[ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-22-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 52 ++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d8beebeb270..ae55933ce79 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,7 +44,7 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*enable)(int, u64); + void (*enable)(struct hw_perf_counter *, int); void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; @@ -414,28 +414,15 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void intel_pmu_enable_counter(int idx, u64 config) +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, - config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} - -static void amd_pmu_enable_counter(int idx, u64 config) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - config |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); -} + int err; -static void hw_perf_enable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.enable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -522,8 +509,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, } static inline void -__pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -548,14 +534,24 @@ __pmc_fixed_enable(struct perf_counter *counter, err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void -__x86_pmu_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_enable(counter, hwc, idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); else - hw_perf_enable(idx, hwc->config); + amd_pmu_disable_counter(idx, hwc->config); } static int @@ -632,7 +628,7 @@ try_generic: set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); - __x86_pmu_enable(counter, hwc, idx); + x86_pmu.enable(hwc, idx); return 0; } @@ -728,7 +724,7 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __x86_pmu_enable(counter, hwc, idx); + intel_pmu_enable_counter(hwc, idx); } /* -- cgit v1.2.3-70-g09d2 From d43698918bd46c71d494555fb92195fbea1fcb6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:19 +0200 
Subject: perf_counter, x86: rework counter disable functions As for the enable function, this patch reworks the disable functions and introduces x86_pmu_disable_counter(). The internal function i/f in struct x86_pmu changed too. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-23-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 ++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ae55933ce79..df9012bbd21 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -45,7 +45,7 @@ struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); void (*enable)(struct hw_perf_counter *, int); - void (*disable)(int, u64); + void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -425,28 +425,19 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void intel_pmu_disable_counter(int idx, u64 config) +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); -} - -static void amd_pmu_disable_counter(int idx, u64 config) -{ - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); - -} + int err; -static void hw_perf_disable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.disable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config); } static inline void -__pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -460,13 +451,20 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_disable(counter, hwc, idx); - else - hw_perf_disable(idx, hwc->config); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_counter(hwc, idx); +} + +static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + x86_pmu_disable_counter(hwc, idx); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -551,7 +549,7 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); else - amd_pmu_disable_counter(idx, hwc->config); + x86_pmu_disable_counter(hwc, idx); } static int @@ -622,7 +620,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; set_bit(idx, cpuc->active); @@ -694,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * could reenable again: */ clear_bit(idx, cpuc->active); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); /* * Make sure the cleared pointer becomes visible before we @@ -762,7 +760,7 @@ again: intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, 
regs, 0)) - __x86_pmu_disable(counter, &counter->hw, bit); + intel_pmu_disable_counter(&counter->hw, bit); } intel_pmu_ack_status(ack); -- cgit v1.2.3-70-g09d2 From 85cf9dba92152bb4edec118b2f4f0be1ae7fdcab Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:20 +0200 Subject: perf_counter, x86: change and remove pmu initialization checks Some functions are only called if the pmu was proper initialized. That initalization checks can be removed. The way to check initialization changed too. Now, the pointer to the interrupt handler is checked. If it exists the pmu is initialized. This also removes a static variable and uses struct x86_pmu as only data source for the check. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-24-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index df9012bbd21..2d3681bbb52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -23,7 +23,6 @@ #include #include -static bool perf_counters_initialized __read_mostly; static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { @@ -227,6 +226,11 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -240,8 +244,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return -ENOSYS; - if (unlikely(!perf_counters_initialized)) - return -EINVAL; + if (!x86_pmu_initialized()) + return -ENODEV; err = 0; if (atomic_inc_not_zero(&num_counters)) { @@ -348,9 +352,8 @@ static u64 amd_pmu_save_disable_all(void) u64 hw_perf_save_disable(void) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return 0; - return x86_pmu.save_disable_all(); } /* @@ -388,9 +391,8 @@ static void amd_pmu_restore_all(u64 ctrl) void hw_perf_restore(u64 ctrl) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); } /* @@ -402,8 +404,6 @@ static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - if (unlikely(!perf_counters_initialized)) - return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); return status; @@ -417,10 +417,6 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } @@ -428,10 +424,6 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config); } @@ -787,10 +779,10 @@ void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!x86_pmu_initialized()) return; - if (unlikely(!perf_counters_initialized)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; cpuc = 
&__get_cpu_var(cpu_hw_counters); @@ -829,8 +821,9 @@ void perf_counters_lapic_init(int nmi) { u32 apic_val; - if (!perf_counters_initialized) + if (!x86_pmu_initialized()) return; + /* * Enable the performance counter vector in the APIC LVT: */ @@ -988,7 +981,6 @@ void __init init_hw_perf_counters(void) ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_initialized = true; perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); -- cgit v1.2.3-70-g09d2 From a29aa8a7ff93e4196d558036928597e68337dd8d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:21 +0200 Subject: perf_counter, x86: implement the interrupt handler for AMD cpus This patch implements the interrupt handler for AMD performance counters. In difference to the Intel pmu, there is no single status register and also there are no fixed counters. This makes the handler very different and it is useful to make the handler vendor specific. To check if a counter is overflowed the upper bit of the counter is checked. Only counters where the active bit is set are checked. With this patch throttling is enabled for AMD performance counters. This patch also reenables Linux performance counters on AMD cpus. [ Impact: re-enable perfcounters on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-25-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 45 +++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2d3681bbb52..f4d59d4cf3f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -240,10 +240,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; - /* disable temporarily */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return -ENOSYS; - if (!x86_pmu_initialized()) return -ENODEV; @@ -773,7 +769,43 @@ out: return ret; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +{ + int cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + u64 val; + int handled = 0; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx; + + ++cpuc->interrupts; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active)) + continue; + counter = cpuc->counters[idx]; + hwc = &counter->hw; + x86_perf_counter_update(counter, hwc, idx); + val = atomic64_read(&hwc->prev_count); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + /* counter overflow */ + x86_perf_counter_set_period(counter, hwc, idx); + handled = 1; + inc_irq_stat(apic_perf_irqs); + if (perf_counter_overflow(counter, nmi, regs, 0)) + amd_pmu_disable_counter(hwc, idx); + else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + /* + * do not reenable when throttled, but reload + * the register + */ + amd_pmu_disable_counter(hwc, idx); + else if (counter->state == PERF_COUNTER_STATE_ACTIVE) + amd_pmu_enable_counter(hwc, idx); + } + return handled; +} void perf_counter_unthrottle(void) { @@ -782,9 +814,6 @@ void perf_counter_unthrottle(void) if (!x86_pmu_initialized()) return; - if (!cpu_has(&boot_cpu_data, 
X86_FEATURE_ARCH_PERFMON)) - return; - cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) -- cgit v1.2.3-70-g09d2 From 4b7bfd0d276da3a006d37e85d3cf900d7a14ae2a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:22 +0200 Subject: perf_counter, x86: return raw count with x86_perf_counter_update() To check on AMD cpus if a counter overflows, the upper bit of the raw counter value must be checked. This value is already internally available in x86_perf_counter_update(). Now, the value is returned so that it can be used directly to check for overflows. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-26-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f4d59d4cf3f..a8a53abd706 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -132,7 +132,7 @@ static u64 amd_pmu_raw_event(u64 event) * Can only be executed on the CPU where the counter is active. * Returns the delta events processed. */ -static void +static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -165,6 +165,8 @@ again: atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; } static atomic_t num_counters; @@ -785,8 +787,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; - x86_perf_counter_update(counter, hwc, idx); - val = atomic64_read(&hwc->prev_count); + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; /* counter overflow */ -- cgit v1.2.3-70-g09d2 From c619b8ffb1cec6a431687a35695dc6fd292a79e6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:23 +0200 Subject: perf_counter, x86: introduce max_period variable In x86 pmus the allowed counter period to programm differs. This introduces a max_period value and allows the generic implementation for all models to check the max period. 
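How the clamp, the programmed counter value and the overflow test fit together is easiest to see with concrete numbers; a stand-alone sketch using the AMD parameters from this patch (illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* AMD settings from the patch: 48-bit counters, overflow spotted
         * via the highest counter bit */
        int      counter_bits = 48;
        uint64_t counter_mask = (1ULL << counter_bits) - 1;
        uint64_t max_period   = (1ULL << (counter_bits - 1)) - 1;

        int64_t period = 250000;            /* requested sampling period */
        if (period <= 0 || (uint64_t)period > max_period)
            period = max_period;            /* the new generic clamp */

        /* the counter is programmed with -period truncated to its width,
         * so it starts in the upper half of its range (top bit set) ... */
        uint64_t programmed = (uint64_t)(-period) & counter_mask;

        /* ... and the top bit only reads 0 once 'period' events have been
         * counted and the value wrapped -- which is how the AMD interrupt
         * handler recognizes an overflow */
        printf("programmed %#llx, top bit now %d\n",
               (unsigned long long)programmed,
               (int)((programmed >> (counter_bits - 1)) & 1));
        return 0;
    }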
[ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-27-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a8a53abd706..4b8715b34f8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -54,6 +54,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + u64 max_period; }; static struct x86_pmu x86_pmu __read_mostly; @@ -279,14 +280,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic counter period: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) + hwc->irq_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -910,6 +905,12 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + .max_period = (1ULL << 31) - 1, }; static struct x86_pmu amd_pmu = { @@ -927,6 +928,8 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, }; static int intel_pmu_init(void) @@ -999,6 +1002,7 @@ void __init init_hw_perf_counters(void) perf_max_counters = x86_pmu.num_counters; pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; -- cgit v1.2.3-70-g09d2 From ef7b3e09ffdcd5200aea9523f6b56d331d1c4fc0 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:24 +0200 Subject: perf_counter, x86: remove vendor check in fixed_mode_idx() The function fixed_mode_idx() is used generically. Now it checks the num_counters_fixed value instead of the vendor to decide if fixed counters are present. 
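In effect the scheduler now asks the PMU description rather than the vendor ID; a compact stand-alone sketch of that decision (event codes and the fixed-index base are placeholders, not the kernel's constants):

    #include <stdio.h>

    enum { EV_INSTRUCTIONS, EV_CPU_CYCLES, EV_OTHER };  /* placeholder ids */
    enum { IDX_FIXED_BASE = 32 };                       /* placeholder base */

    /* return a fixed-purpose counter index, or -1 for "use a generic PMC" */
    static int fixed_mode_idx(int event, int num_counters_fixed, int nmi)
    {
        if (!num_counters_fixed)    /* e.g. AMD: no fixed counters at all */
            return -1;
        if (nmi)                    /* fixed counters are skipped for NMI use */
            return -1;

        switch (event) {
        case EV_INSTRUCTIONS: return IDX_FIXED_BASE + 0;
        case EV_CPU_CYCLES:   return IDX_FIXED_BASE + 1;
        default:              return -1;
        }
    }

    int main(void)
    {
        printf("Intel-like PMU: %d, AMD-like PMU: %d\n",
               fixed_mode_idx(EV_INSTRUCTIONS, 3, 0),
               fixed_mode_idx(EV_INSTRUCTIONS, 0, 0));
        return 0;
    }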
[ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4b8715b34f8..d1c8036dcbd 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -542,7 +542,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (!x86_pmu.num_counters_fixed) return -1; if (unlikely(hwc->nmi)) -- cgit v1.2.3-70-g09d2 From 19d84dab55a383d75c885b5c1a618f5ead96f2f6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:25 +0200 Subject: perf_counter, x86: remove unused function argument in intel_pmu_get_status() The mask argument is unused and thus can be removed. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-29-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d1c8036dcbd..856b0b85219 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -393,7 +393,7 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static inline u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(void) { u64 status; @@ -728,7 +728,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = intel_pmu_save_disable_all(); - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (!status) goto out; @@ -753,7 +753,7 @@ again: /* * Repeat if there is more work to be done: */ - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (status) goto again; out: -- cgit v1.2.3-70-g09d2 From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 14:52:50 +0200 Subject: perf_counter: add/update copyrights Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++---- kernel/perf_counter.c | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 856b0b85219..47e563bfd4c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1,10 +1,11 @@ /* * Performance counter x86 architecture code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * Copyright(C) 2009 Jaswinder Singh Rajput - * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * * For licencing details see kernel-base/COPYING */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 582108addef..a95a171e608 100644 --- a/kernel/perf_counter.c +++ 
b/kernel/perf_counter.c @@ -1,9 +1,9 @@ /* * Performance counter core code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * * For licensing details see kernel-base/COPYING */ -- cgit v1.2.3-70-g09d2 From 43f6201a22dbf1c5abe1cab96b49bd56fa9df8f4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 16:55:56 +0200 Subject: perf_counter, x86: rename bitmasks to ->used_mask and ->active_mask Standardize on explicitly mentioning '_mask' in fields that are not plain flags but masks. This avoids typos like: if (cpuc->used) (which could easily slip through review unnoticed), while if a typo looks like this: if (cpuc->used_mask) it might get noticed during review. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241016956-24648-1-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 47e563bfd4c..fc06f4d3264 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,8 +28,8 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; int enabled; @@ -332,7 +332,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -373,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -576,7 +576,7 @@ static int x86_pmu_enable(struct perf_counter *counter) * Try to get the fixed counter, if that is already taken * then try to get a generic counter: */ - if (test_and_set_bit(idx, cpuc->used)) + if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; @@ -590,14 +590,14 @@ static int x86_pmu_enable(struct perf_counter *counter) } else { idx = hwc->idx; /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used)) { + if (test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used, + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_counters); if (idx == x86_pmu.num_counters) return -EAGAIN; - set_bit(idx, cpuc->used); + set_bit(idx, cpuc->used_mask); hwc->idx = idx; } hwc->config_base = x86_pmu.eventsel; @@ -609,7 +609,7 @@ try_generic: x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active); + set_bit(idx, cpuc->active_mask); x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); @@ -643,7 +643,7 @@ void 
perf_counter_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -677,7 +677,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Must be done before we disable, otherwise the nmi handler * could reenable again: */ - clear_bit(idx, cpuc->active); + clear_bit(idx, cpuc->active_mask); x86_pmu.disable(hwc, idx); /* @@ -692,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) */ x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used); + clear_bit(idx, cpuc->used_mask); } /* @@ -741,7 +741,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active)) + if (!test_bit(bit, cpuc->active_mask)) continue; intel_pmu_save_and_restart(counter); @@ -779,7 +779,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; -- cgit v1.2.3-70-g09d2 From 63a809a2dc53b91268dd915bbcbd425063893676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:17 +0200 Subject: perf_counter: fix nmi-watchdog interaction When we don't have any perf-counters active, don't act like we know what the NMI is for. [ Impact: fix hard hang with nmi_watchdog=2 ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.109867793@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc06f4d3264..d4c0cc9d326 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; + if (!atomic_read(&num_counters)) + return NOTIFY_DONE; + switch (cmd) { case DIE_NMI: case DIE_NMI_IPI: -- cgit v1.2.3-70-g09d2 From ba77813a2a22d631fe5bc0bf1ec0d11350544b70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 May 2009 18:47:44 +0200 Subject: perf_counter: x86: fixup nmi_watchdog vs perf_counter boo-boo Invert the atomic_inc_not_zero() test so that we will indeed detect the first activation. Also rename the global num_counters, since its easy to confuse with x86_pmu.num_counters. 
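The fix hinges on the semantics of atomic_inc_not_zero(): it only takes a reference when the count is already non-zero, so the very first counter must fall through into the slow path that reserves the hardware. A stand-alone model of the corrected pattern (C11 atomics standing in for the kernel primitives; the mutex of the real slow path is omitted):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int active_counters;

    /* models the kernel's atomic_inc_not_zero(): increment only if non-zero */
    static bool inc_not_zero(atomic_int *v)
    {
        int old = atomic_load(v);

        while (old != 0) {
            if (atomic_compare_exchange_weak(v, &old, old + 1))
                return true;        /* took a reference */
        }
        return false;               /* count was (or became) zero */
    }

    static bool reserve_pmc_hardware(void) { return true; }    /* stand-in */

    static int counter_init(void)
    {
        if (!inc_not_zero(&active_counters)) {  /* the fix: note the '!' */
            /* slow path for the first counter (under a mutex in the kernel) */
            if (atomic_load(&active_counters) == 0 && !reserve_pmc_hardware())
                return -1;          /* -EBUSY in the real code */
            atomic_fetch_add(&active_counters, 1);
        }
        return 0;
    }

    int main(void)
    {
        counter_init();             /* first user reserves the PMCs */
        counter_init();             /* later users only take a reference */
        printf("active counters: %d\n", atomic_load(&active_counters));
        return 0;
    }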
[ Impact: fix non-working perfcounters on AMD CPUs, cleanup ] Signed-off-by: Peter Zijlstra LKML-Reference: <1241455664.7620.4938.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d4c0cc9d326..196b58f0444 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -171,7 +171,7 @@ again: return new_raw_count; } -static atomic_t num_counters; +static atomic_t active_counters; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) @@ -224,7 +224,7 @@ static void release_pmc_hardware(void) static void hw_perf_counter_destroy(struct perf_counter *counter) { - if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } @@ -248,12 +248,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (atomic_inc_not_zero(&num_counters)) { + if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) err = -EBUSY; else - atomic_inc(&num_counters); + atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } if (err) @@ -280,7 +280,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (capable(CAP_SYS_ADMIN) && hw_event->nmi) hwc->nmi = 1; - hwc->irq_period = hw_event->irq_period; + hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) hwc->irq_period = x86_pmu.max_period; @@ -871,7 +871,7 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; - if (!atomic_read(&num_counters)) + if (!atomic_read(&active_counters)) return NOTIFY_DONE; switch (cmd) { -- cgit v1.2.3-70-g09d2 From 066d7dea32c9bffe6decc0abe465627656cdd84e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:04:09 +0200 Subject: perf_counter: fix fixed-purpose counter support on v2 Intel-PERFMON Fixed-purpose counters stopped working in a simple 'perf stat ls' run: cache references cache misses Due to: ef7b3e0: perf_counter, x86: remove vendor check in fixed_mode_idx() Which made x86_pmu.num_counters_fixed matter: if it's nonzero, the fixed-purpose counters are utilized. But on v2 perfmon this field is not set (despite there being fixed-purpose PMCs). So add a quirk to set the number of fixed-purpose counters to at least three. 
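[ Editorial aside, not part of the original patch: the quirk itself is a one-line lower bound on the CPUID-reported value, restated here for clarity; the hunk below is the authoritative version. ]

	/*
	 * v2 perfmon does not report fixed-purpose counters,
	 * so assume there are at least 3 of them:
	 */
	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);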
[ Impact: add quirk for three fixed-purpose counters on certain Intel CPUs ] Cc: Robert Richter Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 196b58f0444..a6878b0798e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -962,7 +962,13 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + + /* + * Quirk: v2 perfmon does not report fixed-purpose counters, so + * assume at least 3 counters: + */ + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; -- cgit v1.2.3-70-g09d2 From 8823392360dc4992f87bf4c623834d315f297493 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 10 May 2009 10:53:05 +0200 Subject: perf_counter, x86: clean up throttling printk s/PERFMON/perfcounters for perfcounter interrupt throttling warning. 'perfmon' is the CPU feature name that is Intel-only, while we do throttling in a generic way. [ Impact: cleanup ] Signed-off-by: Mike Galbraith Cc: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a6878b0798e..da27419923a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -814,7 +814,7 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); + printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); hw_perf_restore(cpuc->throttle_ctrl); } cpuc->interrupts = 0; -- cgit v1.2.3-70-g09d2 From 5bb9efe33ea4001a17ab98186a40a134a3061d67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 08:12:51 +0200 Subject: perf_counter: fix print debug irq disable inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. bash/15802 [HC0[0]:SC0[0]:HE1:SE1] takes: (sysrq_key_table_lock){?.....}, Don't unconditionally enable interrupts in the perf_counter_print_debug() path. 
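[ Editorial aside, not part of the original patch: the pattern applied in the hunk below is to save and restore the caller's interrupt state rather than unconditionally re-enabling interrupts, so the debug dump stays safe when reached with IRQs already disabled (e.g. via the sysrq path that lockdep flagged). In outline: ]

	unsigned long flags;

	local_irq_save(flags);		/* remember whether IRQs were enabled */
	/* ... read and print the per-CPU PMU registers ... */
	local_irq_restore(flags);	/* hand back exactly what the caller had */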
[ Impact: fix potential deadlock pointed out by lockdep ] LKML-Reference: Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da27419923a..f7772ff7936 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -621,12 +621,13 @@ void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; + unsigned long flags; int cpu, idx; if (!x86_pmu.num_counters) return; - local_irq_disable(); + local_irq_save(flags); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -664,7 +665,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } - local_irq_enable(); + local_irq_restore(flags); } static void x86_pmu_disable(struct perf_counter *counter) -- cgit v1.2.3-70-g09d2 From ec3232bdf8518bea8410f0027f870b24d3aa8753 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 09:45:19 +0200 Subject: perf_counter: x86: More accurate counter update Take the counter width into account instead of assuming 32 bits. In particular Nehalem has 44 bit wide counters, and all arithmetics should happen on a 44-bit signed integer basis. [ Impact: fix rare event imprecision, warning message on Nehalem ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7772ff7936..3a92a2b2a80 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -138,7 +138,9 @@ static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - u64 prev_raw_count, new_raw_count, delta; + int shift = 64 - x86_pmu.counter_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; /* * Careful: an NMI might modify the previous counter value. @@ -161,9 +163,10 @@ again: * (counter-)time and add that to the generic counter. * * Careful, not all hw sign-extends above the physical width - * of the count, so we do that by clipping the delta to 32 bits: + * of the count. */ - delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); -- cgit v1.2.3-70-g09d2 From f5a5a2f6e69e88647ae12da39f0ff3a510bcf0a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 May 2009 12:54:01 +0200 Subject: perf_counter: x86: Fix throttling If counters are disabled globally when a perfcounter IRQ/NMI hits, and if we throttle in that case, we'll promote the '0' value to the next lapic IRQ and disable all perfcounters at that point, permanently ... Fix it. 
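[ Editorial aside, not part of the original patch: the key change in the hunk below is that the saved global-enable value is consulted before anything is restored or throttled; a saved value of 0 (PMU already disabled when the interrupt hit) must never be promoted into a later re-enable. Condensed: ]

	/*
	 * Restore only if the PMU was enabled when the interrupt arrived,
	 * and only while we are under the per-tick interrupt budget:
	 */
	if (cpuc->throttle_ctrl) {
		if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
			intel_pmu_restore_all(cpuc->throttle_ctrl);
		else
			pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n",
				smp_processor_id());
	}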
[ Impact: fix hung perfcounters under load ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3a92a2b2a80..88ae8cebf3c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -765,8 +765,13 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - intel_pmu_restore_all(cpuc->throttle_ctrl); + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { + intel_pmu_restore_all(cpuc->throttle_ctrl); + } else { + pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id()); + } + } return ret; } @@ -817,11 +822,16 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - if (printk_ratelimit()) - printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); + pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); + + /* + * Clear them before re-enabling irqs/NMIs again: + */ + cpuc->interrupts = 0; hw_perf_restore(cpuc->throttle_ctrl); + } else { + cpuc->interrupts = 0; } - cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3-70-g09d2 From a026dfecc035f213c1cfa0bf6407ce3155f6a9df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 10:02:57 +0200 Subject: perf_counter: x86: Allow unpriviliged use of NMIs Apply sysctl_perf_counter_priv to NMIs. Also, fail the counter creation instead of silently down-grading to regular interrupts. [ Impact: allow wider perf-counter usage ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 88ae8cebf3c..c19e927b697 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -280,8 +280,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * If privileged enough, allow NMI events: */ hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + if (hw_event->nmi) { + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + return -EACCES; hwc->nmi = 1; + } hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) -- cgit v1.2.3-70-g09d2 From 962bf7a66edca4d36a730a38ff8410a67f560e40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 13:21:36 +0200 Subject: perf_counter: x86: Fix up the amd NMI/INT throttle perf_counter_unthrottle() restores throttle_ctrl, buts its never set. Also, we fail to disable all counters when throttling. 
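[ Editorial aside, not part of the original patch: the hunk below makes the AMD handler record the PMU state on entry, count the interrupt toward the throttle budget, and explicitly disable every counter that either asked for it or was caught by throttling; only then is the temporary disable undone. In outline (the per-counter loop body is abbreviated): ]

	cpuc->throttle_ctrl = cpuc->enabled;	/* remember the PMU state on entry */
	cpuc->enabled = 0;
	barrier();

	if (cpuc->throttle_ctrl && ++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
		throttle = 1;

	/* for every active counter: update it, reprogram its period, then: */
	if (disable || throttle)
		amd_pmu_disable_counter(hwc, idx);

	if (cpuc->throttle_ctrl && !throttle)
		cpuc->enabled = 1;		/* undo the temporary disable */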
[ Impact: fix rare stuck perf-counters when they are throttled ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c19e927b697..7601c014f8f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,6 +334,8 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); + if (!enabled) + goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -347,6 +349,7 @@ static u64 amd_pmu_save_disable_all(void) wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } +out: return enabled; } @@ -787,32 +790,43 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) int handled = 0; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx; + int idx, throttle = 0; + + cpuc->throttle_ctrl = cpuc->enabled; + cpuc->enabled = 0; + barrier(); + + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + throttle = 1; + } - ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int disable = 0; + if (!test_bit(idx, cpuc->active_mask)) continue; + counter = cpuc->counters[idx]; hwc = &counter->hw; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; + goto next; + /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - if (perf_counter_overflow(counter, nmi, regs, 0)) - amd_pmu_disable_counter(hwc, idx); - else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - /* - * do not reenable when throttled, but reload - * the register - */ + disable = perf_counter_overflow(counter, nmi, regs, 0); + +next: + if (disable || throttle) amd_pmu_disable_counter(hwc, idx); - else if (counter->state == PERF_COUNTER_STATE_ACTIVE) - amd_pmu_enable_counter(hwc, idx); } + + if (cpuc->throttle_ctrl && !throttle) + cpuc->enabled = 1; + return handled; } -- cgit v1.2.3-70-g09d2 From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. 
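[ Editorial aside, not part of the original patch: a standalone model of the new interface, with the per-CPU bookkeeping reduced to a plain counter so the nesting behaviour is easy to see; the model_ names are illustrative, the real implementation lands in kernel/perf_counter.c further down in this patch. ]

	static int disable_count;		/* per-CPU in the real code */

	static void model_perf_disable(void)
	{
		disable_count++;
		hw_perf_disable();		/* disabling twice is harmless */
	}

	static void model_perf_enable(void)
	{
		if (--disable_count == 0)
			hw_perf_enable();	/* only the outermost enable re-arms the PMU */
	}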
[ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 24 ++++---- arch/x86/kernel/cpu/perf_counter.c | 113 ++++++++++++++----------------------- drivers/acpi/processor_idle.c | 6 +- include/linux/perf_counter.h | 10 ++-- kernel/perf_counter.c | 76 +++++++++++++++---------- 5 files changed, 109 insertions(+), 120 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 15cdc8e6722..bb1b463c136 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -386,7 +386,7 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) * Disable all counters to prevent PMU interrupts and to allow * counters to be added or removed. */ -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { struct cpu_hw_counters *cpuhw; unsigned long ret; @@ -428,7 +428,6 @@ u64 hw_perf_save_disable(void) mb(); } local_irq_restore(flags); - return ret; } /* @@ -436,7 +435,7 @@ u64 hw_perf_save_disable(void) * If we were previously disabled and counters were added, then * put the new config on the PMU. */ -void hw_perf_restore(u64 disable) +void hw_perf_enable(void) { struct perf_counter *counter; struct cpu_hw_counters *cpuhw; @@ -448,9 +447,12 @@ void hw_perf_restore(u64 disable) int n_lim; int idx; - if (disable) - return; local_irq_save(flags); + if (!cpuhw->disabled) { + local_irq_restore(flags); + return; + } + cpuhw = &__get_cpu_var(cpu_hw_counters); cpuhw->disabled = 0; @@ -649,19 +651,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, /* * Add a counter to the PMU. * If all counters are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_restore to do the + * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. 
*/ static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; - u64 pmudis; int n0; int ret = -EAGAIN; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); /* * Add the counter to the list (if there is room) @@ -685,7 +686,7 @@ static int power_pmu_enable(struct perf_counter *counter) ret = 0; out: - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); return ret; } @@ -697,11 +698,10 @@ static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; - u64 pmudis; unsigned long flags; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); power_pmu_read(counter); @@ -735,7 +735,7 @@ static void power_pmu_disable(struct perf_counter *counter) cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); } - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7601c014f8f..313638cecbb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -31,7 +31,6 @@ struct cpu_hw_counters { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 throttle_ctrl; int enabled; }; @@ -42,8 +41,8 @@ struct x86_pmu { const char *name; int version; int (*handle_irq)(struct pt_regs *, int); - u64 (*save_disable_all)(void); - void (*restore_all)(u64); + void (*disable_all)(void); + void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; @@ -56,6 +55,7 @@ struct x86_pmu { int counter_bits; u64 counter_mask; u64 max_period; + u64 intel_ctrl; }; static struct x86_pmu x86_pmu __read_mostly; @@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 intel_pmu_save_disable_all(void) +static void intel_pmu_disable_all(void) { - u64 ctrl; - - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - return ctrl; } -static u64 amd_pmu_save_disable_all(void) +static void amd_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int enabled, idx; + int idx; + + if (!cpuc->enabled) + return; - enabled = cpuc->enabled; cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the @@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void) * right thing. 
*/ barrier(); - if (!enabled) - goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void) val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } - -out: - return enabled; } -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { if (!x86_pmu_initialized()) - return 0; - return x86_pmu.save_disable_all(); + return; + return x86_pmu.disable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void intel_pmu_restore_all(u64 ctrl) +static void intel_pmu_enable_all(void) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); } -static void amd_pmu_restore_all(u64 ctrl) +static void amd_pmu_enable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; - cpuc->enabled = ctrl; - barrier(); - if (!ctrl) + if (cpuc->enabled) return; + cpuc->enabled = 1; + barrier(); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl) } } -void hw_perf_restore(u64 ctrl) +void hw_perf_enable(void) { if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); + x86_pmu.enable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_restore); static inline u64 intel_pmu_get_status(void) { @@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - int ret = 0; - - cpuc->throttle_ctrl = intel_pmu_save_disable_all(); + perf_disable(); status = intel_pmu_get_status(); - if (!status) - goto out; + if (!status) { + perf_enable(); + return 0; + } - ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -767,19 +751,11 @@ again: status = intel_pmu_get_status(); if (status) goto again; -out: - /* - * Restore - do not reenable when global enable is off or throttled: - */ - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { - intel_pmu_restore_all(cpuc->throttle_ctrl); - } else { - pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! 
Throttle on.\n", smp_processor_id()); - } - } - return ret; + if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) + perf_enable(); + + return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) @@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) struct hw_perf_counter *hwc; int idx, throttle = 0; - cpuc->throttle_ctrl = cpuc->enabled; - cpuc->enabled = 0; - barrier(); - - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - throttle = 1; + if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { + throttle = 1; + __perf_disable(); + cpuc->enabled = 0; + barrier(); } for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -824,9 +798,6 @@ next: amd_pmu_disable_counter(hwc, idx); } - if (cpuc->throttle_ctrl && !throttle) - cpuc->enabled = 1; - return handled; } @@ -839,13 +810,11 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); - /* * Clear them before re-enabling irqs/NMIs again: */ cpuc->interrupts = 0; - hw_perf_restore(cpuc->throttle_ctrl); + perf_enable(); } else { cpuc->interrupts = 0; } @@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, - .save_disable_all = intel_pmu_save_disable_all, - .restore_all = intel_pmu_restore_all, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = { static struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, - .save_disable_all = amd_pmu_save_disable_all, - .restore_all = amd_pmu_restore_all, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, @@ -1003,6 +972,8 @@ static int intel_pmu_init(void) x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + return 0; } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index d2830f39d46..9645758c047 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -763,11 +763,9 @@ static int acpi_idle_bm_check(void) */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { - u64 perf_flags; - /* Don't trace irqs off for idle */ stop_critical_timings(); - perf_flags = hw_perf_save_disable(); + perf_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -782,7 +780,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore(perf_flags); + perf_enable(); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 614f921d616..e543ecc129f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -544,8 +544,10 @@ extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); -extern u64 hw_perf_save_disable(void); -extern void hw_perf_restore(u64 ctrl); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -600,8 +602,8 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } -static inline u64 hw_perf_save_disable(void) { return 0; } +static inline void perf_disable(void) { } +static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 985be0b662a..e814ff04d7c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -60,8 +60,9 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte return NULL; } -u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { barrier(); } +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + void __weak hw_perf_counter_setup(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, @@ -72,6 +73,32 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} +EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */ + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} +EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */ + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -170,7 +197,6 @@ static void __perf_counter_remove_from_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -191,9 +217,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_del_counter(counter, ctx); - hw_perf_restore(perf_flags); + perf_enable(); if (!ctx->task) { /* @@ -538,7 +564,6 @@ static void __perf_install_in_context(void *info) struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; - u64 perf_flags; int err; /* @@ -556,7 +581,7 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); add_counter_to_ctx(counter, ctx); @@ -596,7 +621,7 @@ static void __perf_install_in_context(void *info) cpuctx->max_pertask--; unlock: - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -663,7 +688,6 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; - unsigned long pmuflags; unsigned long flags; int err; @@ -693,14 +717,14 @@ static void __perf_counter_enable(void *info) if (!group_can_go_on(counter, cpuctx, 1)) { err = -EEXIST; } else { - pmuflags = hw_perf_save_disable(); + perf_disable(); if (counter == leader) err = group_sched_in(counter, cpuctx, ctx, smp_processor_id()); else err = counter_sched_in(counter, cpuctx, ctx, smp_processor_id()); - hw_perf_restore(pmuflags); + perf_enable(); } if (err) { @@ -795,7 +819,6 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; - u64 flags; spin_lock(&ctx->lock); ctx->is_active = 0; @@ -803,12 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, goto out; update_context_time(ctx); - flags = hw_perf_save_disable(); + perf_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -860,7 +883,6 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; - u64 flags; int can_add_hw = 1; spin_lock(&ctx->lock); @@ -870,7 +892,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, ctx->timestamp = perf_clock(); - flags = hw_perf_save_disable(); + perf_disable(); /* * First go through the list and put on any pinned groups @@ -917,7 +939,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, can_add_hw = 0; } } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -955,7 +977,6 @@ int perf_counter_task_disable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; if (likely(!ctx->nr_counters)) return 0; @@ -969,7 +990,7 @@ int perf_counter_task_disable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state != PERF_COUNTER_STATE_ERROR) { @@ -978,7 +999,7 @@ int perf_counter_task_disable(void) } } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); @@ -991,7 +1012,6 @@ int perf_counter_task_enable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; int cpu; 
if (likely(!ctx->nr_counters)) @@ -1007,7 +1027,7 @@ int perf_counter_task_enable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state > PERF_COUNTER_STATE_OFF) @@ -1017,7 +1037,7 @@ int perf_counter_task_enable(void) ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); @@ -1034,7 +1054,6 @@ int perf_counter_task_enable(void) static void rotate_ctx(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 perf_flags; if (!ctx->nr_counters) return; @@ -1043,12 +1062,12 @@ static void rotate_ctx(struct perf_counter_context *ctx) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_move_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); } @@ -3194,7 +3213,6 @@ __perf_counter_exit_task(struct task_struct *child, } else { struct perf_cpu_context *cpuctx; unsigned long flags; - u64 perf_flags; /* * Disable and unlink this counter. @@ -3203,7 +3221,7 @@ __perf_counter_exit_task(struct task_struct *child, * could still be processing it: */ local_irq_save(flags); - perf_flags = hw_perf_save_disable(); + perf_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -3214,7 +3232,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; - hw_perf_restore(perf_flags); + perf_enable(); local_irq_restore(flags); } -- cgit v1.2.3-70-g09d2 From a4016a79fcbd139e7378944c0d86a39fdbc70ecc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 May 2009 14:52:17 +0200 Subject: perf_counter: x86: Robustify interrupt handling Two consecutive NMIs could daze and confuse the machine when the first would handle the overflow of both counters. [ Impact: fix false-positive syslog messages under multi-session profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 313638cecbb..1dcf67057f1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -783,6 +783,10 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; + + if (counter->hw_event.nmi != nmi) + goto next; + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) goto next; @@ -869,7 +873,6 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int ret; if (!atomic_read(&active_counters)) return NOTIFY_DONE; @@ -886,9 +889,16 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu.handle_irq(regs, 1); + /* + * Can't rely on the handled return value to say it was our NMI, two + * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs, 1); - return ret ? 
NOTIFY_STOP : NOTIFY_OK; + return NOTIFY_STOP; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { -- cgit v1.2.3-70-g09d2 From 1c80f4b598d9b075a2a0be694e28be93a6702bcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:25:22 +0200 Subject: perf_counter: x86: Disallow interval of 1 On certain CPUs i have observed a stuck PMU if interval was set to 1 and NMIs were used. The PMU had PMC0 set in MSR_CORE_PERF_GLOBAL_STATUS, but it was not possible to ack it via MSR_CORE_PERF_GLOBAL_OVF_CTRL, and the NMI loop got stuck infinitely. [ Impact: fix rare hangs during high perfcounter load ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1dcf67057f1..46a82d1e4cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -473,6 +473,11 @@ x86_perf_counter_set_period(struct perf_counter *counter, left += period; atomic64_set(&hwc->period_left, left); } + /* + * Quirk: certain CPUs dont like it if just 1 event is left: + */ + if (unlikely(left < 2)) + left = 2; per_cpu(prev_left[idx], smp_processor_id()) = left; -- cgit v1.2.3-70-g09d2 From 9029a5e3801f1cc7cdaab80169d82427acf928d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:26:20 +0200 Subject: perf_counter: x86: Protect against infinite loops in intel_pmu_handle_irq() intel_pmu_handle_irq() can lock up in an infinite loop if the hardware does not allow the acking of irqs. Alas, this happened in testing so make this robust and emit a warning if it happens in the future. Also, clean up the IRQ handlers a bit. 
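[ Editorial aside, not part of the original patch: the robustness pattern in the hunk below is a bounded retry around the status/ack cycle, so a PMU that refuses to ack can no longer wedge the NMI path; it now warns once and bails out. Condensed: ]

	int loops = 0;
	u64 status = intel_pmu_get_status();
again:
	if (++loops > 100) {
		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
		return 1;			/* bail out instead of spinning forever */
	}
	/* ... handle and ack the overflowed counters ... */
	status = intel_pmu_get_status();
	if (status)
		goto again;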
[ Impact: improve perfcounter irq/nmi handling robustness ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46a82d1e4cb..5a7f718eb1e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -722,9 +722,13 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) */ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + struct cpu_hw_counters; + int bit, cpu, loops; u64 ack, status; - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); perf_disable(); status = intel_pmu_get_status(); @@ -733,7 +737,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) return 0; } + loops = 0; again: + if (++loops > 100) { + WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + return 1; + } + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { @@ -765,13 +775,14 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu = smp_processor_id(); - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - u64 val; - int handled = 0; + int cpu, idx, throttle = 0, handled = 0; + struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx, throttle = 0; + u64 val; + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { throttle = 1; -- cgit v1.2.3-70-g09d2 From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this frequency. 
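[ Editorial aside, not part of the original patch: the adaptation step, as implemented by perf_adjust_freq() in the hunk below, re-estimates the period once per tick from the interrupts observed at the current period and then moves halfway toward that estimate. ]

	/* events counted per second at the current period */
	events = HZ * counter->hw.interrupts * counter->hw.irq_period;
	/* period that would have produced the requested interrupt frequency */
	period = div64_u64(events, counter->hw_event.irq_freq);

	delta = (s64)(1 + period - counter->hw.irq_period);
	delta >>= 1;				/* step halfway toward the estimate */

	irq_period = counter->hw.irq_period + delta;
	if (!irq_period)
		irq_period = 1;			/* a period of 0 would be meaningless */

	counter->hw.irq_period = irq_period;
	counter->hw.interrupts = 0;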
[ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 13 ++++---- arch/x86/kernel/cpu/perf_counter.c | 9 ++---- include/linux/perf_counter.h | 10 ++++-- kernel/perf_counter.c | 63 ++++++++++++++++++++++++++++++-------- 4 files changed, 68 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bb1b463c136..db8d5cafc15 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -534,7 +534,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw_event.irq_period) { + if (counter->hw.irq_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -829,8 +829,6 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if ((s64)counter->hw_event.irq_period < 0) - return ERR_PTR(-EINVAL); if (!perf_event_raw(&counter->hw_event)) { ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -901,7 +899,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.irq_period); /* * See if we need to reserve the PMU. @@ -934,6 +932,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { + u64 period = counter->hw.irq_period; s64 prev, delta, left; int record = 0; @@ -948,11 +947,11 @@ static void record_and_restart(struct perf_counter *counter, long val, */ val = 0; left = atomic64_read(&counter->hw.period_left) - delta; - if (counter->hw_event.irq_period) { + if (period) { if (left <= 0) { - left += counter->hw_event.irq_period; + left += period; if (left <= 0) - left = counter->hw_event.irq_period; + left = period; record = 1; } if (left < 0x80000000L) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a7f718eb1e..886dcf334bc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->irq_period = hw_event->irq_period; - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) - hwc->irq_period = x86_pmu.max_period; - - atomic64_set(&hwc->period_left, hwc->irq_period); + atomic64_set(&hwc->period_left, + min(x86_pmu.max_period, hwc->irq_period)); /* * Raw event type provide the config in the event structure @@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = min(x86_pmu.max_period, hwc->irq_period); int err; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e543ecc129f..004b6e162b9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -130,7 +130,11 @@ struct perf_counter_hw_event { */ __u64 config; - __u64 irq_period; + union { + __u64 irq_period; + __u64 irq_freq; + }; + __u32 record_type; 
__u32 read_format; @@ -146,8 +150,9 @@ struct perf_counter_hw_event { mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -337,6 +342,7 @@ struct hw_perf_counter { atomic64_t prev_count; u64 irq_period; atomic64_t period_left; + u64 interrupts; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 93f4a0e4b87..0ad1db4f3d6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void) return 0; } +void perf_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + u64 irq_period; + u64 events, period; + s64 delta; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + continue; + + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + continue; + + events = HZ * counter->hw.interrupts * counter->hw.irq_period; + period = div64_u64(events, counter->hw_event.irq_freq); + + delta = (s64)(1 + period - counter->hw.irq_period); + delta >>= 1; + + irq_period = counter->hw.irq_period + delta; + + if (!irq_period) + irq_period = 1; + + counter->hw.irq_period = irq_period; + counter->hw.interrupts = 0; + } + spin_unlock(&ctx->lock); +} + /* * Round-robin a context's counters: */ @@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &curr->perf_counter_ctx; + perf_adjust_freq(&cpuctx->ctx); + perf_adjust_freq(ctx); + perf_counter_cpu_sched_out(cpuctx); __perf_counter_task_sched_out(ctx); @@ -2382,6 +2417,8 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->hw.interrupts++; + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -2450,6 +2487,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; + u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); @@ -2468,7 +2506,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + period = max_t(u64, 10000, counter->hw.irq_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } @@ -2629,8 +2668,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2679,8 +2719,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2811,9 +2852,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter 
*counter) static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct pmu *pmu = NULL; - struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -2826,8 +2865,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: /* @@ -2839,8 +2876,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) else pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: @@ -2854,9 +2889,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) break; } - if (pmu) - hwc->irq_period = hw_event->irq_period; - return pmu; } @@ -2872,6 +2904,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, { const struct pmu *pmu; struct perf_counter *counter; + struct hw_perf_counter *hwc; long err; counter = kzalloc(sizeof(*counter), gfpflags); @@ -2907,6 +2940,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; + hwc = &counter->hw; + if (hw_event->freq && hw_event->irq_freq) + hwc->irq_period = TICK_NSEC / hw_event->irq_freq; + else + hwc->irq_period = hw_event->irq_period; + /* * we currently do not support PERF_RECORD_GROUP on inherited counters */ -- cgit v1.2.3-70-g09d2 From d2517a49d55536b38c7a87e5289550cfedaa4dcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 10:04:45 +0200 Subject: perf_counter, x86: fix zero irq_period counters The quirk to irq_period unearthed an unrobustness we had in the hw_counter initialization sequence: we left irq_period at 0, which was then quirked up to 2 ... which then generated a _lot_ of interrupts during 'perf stat' runs, slowed them down and skewed the counter results in general. Initialize irq_period to the maximum instead. [ Impact: fix perf stat results ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 886dcf334bc..5bfd30ab392 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } + if (!hwc->irq_period) + hwc->irq_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, min(x86_pmu.max_period, hwc->irq_period)); -- cgit v1.2.3-70-g09d2 From b68f1d2e7aa21029d73c7d453a8046e95d351740 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 19:37:25 +0200 Subject: perf_counter, x86: speed up the scheduling fast-path We have to set up the LVT entry only at counter init time, not at every switch-in time. There's friction between NMI and non-NMI use here - we'll probably remove the per counter configurability of it - but until then, dont slow down things ... 
[ Impact: micro-optimization ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5bfd30ab392..c109819c2cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } + perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -603,8 +604,6 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); - x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(1); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3-70-g09d2 From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2009 20:13:28 +0200 Subject: perf_counter: Fix context removal deadlock Disable the PMU globally before removing a counter from a context. This fixes the following lockup: [22081.741922] ------------[ cut here ]------------ [22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e() [22081.755624] Hardware name: X8DTN [22081.758903] perfcounters: irq loop stuck! [22081.762985] Modules linked in: [22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226 [22081.772432] Call Trace: [22081.774940] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.781993] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.788368] [] ? warn_slowpath_common+0x77/0xa3 [22081.794649] [] ? warn_slowpath_fmt+0x40/0x45 [22081.800696] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.807080] [] ? perf_counter_nmi_handler+0x3f/0x4a [22081.813751] [] ? notifier_call_chain+0x58/0x86 [22081.819951] [] ? notify_die+0x2d/0x32 [22081.825392] [] ? do_nmi+0x8e/0x242 [22081.830538] [] ? nmi+0x1a/0x20 [22081.835342] [] ? selinux_file_free_security+0x0/0x1a [22081.842105] [] ? x86_pmu_disable_counter+0x15/0x41 [22081.848673] <> [] ? x86_pmu_disable+0x86/0x103 [22081.855512] [] ? __perf_counter_remove_from_context+0x0/0xfe [22081.862926] [] ? counter_sched_out+0x30/0xce [22081.868909] [] ? __perf_counter_remove_from_context+0x59/0xfe [22081.876382] [] ? smp_call_function_single+0x6c/0xe6 [22081.882955] [] ? perf_release+0x86/0x14c [22081.888600] [] ? __fput+0xe7/0x195 [22081.893718] [] ? filp_close+0x5b/0x62 [22081.899107] [] ? put_files_struct+0x64/0xc2 [22081.905031] [] ? do_exit+0x1e2/0x6ef [22081.910360] [] ? _spin_lock_irqsave+0x9/0xe [22081.916292] [] ? do_group_exit+0x67/0x93 [22081.921953] [] ? sys_exit_group+0x12/0x16 [22081.927759] [] ? system_call_fastpath+0x16/0x1b [22081.934076] ---[ end trace 3a3936ce3e1b4505 ]--- And could potentially also fix the lockup reported by Marcelo Tosatti. Also, print more debug info in case of a detected lockup. 
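[ Editorial aside, not part of the original patch: the ordering fix in the hunk below widens the disabled section so the counter is scheduled out and unlinked while the PMU is globally quiet, instead of disabling only around the list manipulation. In outline: ]

	spin_lock_irqsave(&ctx->lock, flags);
	perf_disable();				/* quiesce the PMU first */

	counter_sched_out(counter, cpuctx, ctx);
	list_del_counter(counter, ctx);
	/* ... release any reserved per-CPU counter slot ... */

	perf_enable();
	spin_unlock_irqrestore(&ctx->lock, flags);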
[ Impact: fix lockup ] Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 1 + kernel/perf_counter.c | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c109819c2cb..6cc1660db8d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -740,6 +740,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + perf_counter_print_debug(); return 1; } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 69d4de81596..08584c16049 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -208,18 +208,17 @@ static void __perf_counter_remove_from_context(void *info) return; spin_lock_irqsave(&ctx->lock, flags); + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. + */ + perf_disable(); counter_sched_out(counter, cpuctx, ctx); counter->task = NULL; - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); list_del_counter(counter, ctx); - perf_enable(); if (!ctx->task) { /* @@ -231,6 +230,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); } -- cgit v1.2.3-70-g09d2 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. 
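[ Editorial aside, not part of the original patch: the reference counting described above boils down to two small helpers added further down in this patch; the context is freed only once the task and every counter attached to it have dropped their references. ]

	static void get_ctx(struct perf_counter_context *ctx)
	{
		atomic_inc(&ctx->refcount);
	}

	static void put_ctx(struct perf_counter_context *ctx)
	{
		if (atomic_dec_and_test(&ctx->refcount))
			kfree(ctx);		/* last reference gone */
	}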
We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + include/linux/init_task.h | 13 --- include/linux/perf_counter.h | 4 +- include/linux/sched.h | 6 +- kernel/exit.c | 3 +- kernel/fork.c | 1 + kernel/perf_counter.c | 218 +++++++++++++++++++++++++++---------------- 7 files changed, 145 insertions(+), 101 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a90802..b4f64402a82 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 503afaa0afa..d87247d2641 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,18 +108,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ - .perf_counter_ctx.event_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ - .perf_counter_ctx.lock = \ - __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), -#else -# define INIT_PERF_COUNTERS(tsk) -#endif - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -183,7 +171,6 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f612941ef46..07130900546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -449,7 +449,6 @@ struct perf_counter { struct hw_perf_counter hw; struct perf_counter_context *ctx; - struct task_struct *task; struct file *filp; struct perf_counter *parent; @@ -498,7 +497,6 @@ struct perf_counter { * Used as a container for task counters and CPU counters as well: */ struct perf_counter_context { -#ifdef CONFIG_PERF_COUNTERS /* * Protect the states of the counters in the list, * nr_active, and the list: @@ -516,6 +514,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + atomic_t refcount; struct task_struct *task; /* @@ -523,7 +522,6 @@ struct perf_counter_context { */ u64 time; u64 timestamp; -#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index ff59d123151..9714d450f41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,7 +71,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -99,6 +98,7 @@ struct robust_list_head; struct bio; struct bts_tracer; struct fs_struct; +struct perf_counter_context; /* * List of flags we want to share for kernel threads, @@ -1387,7 +1387,9 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif - struct perf_counter_context perf_counter_ctx; +#ifdef CONFIG_PERF_COUNTERS + struct perf_counter_context *perf_counter_ctxp; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; diff --git a/kernel/exit.c b/kernel/exit.c index f9dfedd94af..99ad4063ee4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); #ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); + WARN_ON_ONCE(tsk->perf_counter_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d32fef4d38e..e72a09f5355 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 08584c16049..06ea3eae886 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -97,6 +97,17 @@ void perf_enable(void) hw_perf_enable(); } +static void get_ctx(struct perf_counter_context *ctx) +{ + atomic_inc(&ctx->refcount); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) + kfree(ctx); +} + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ctx->nr_counters++; } +/* + * Remove a counter from the lists for its context. + * Must be called with counter->mutex and ctx->mutex held. 
+ */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + if (list_empty(&counter->list_entry)) + return; ctx->nr_counters--; list_del_init(&counter->list_entry); @@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); - counter->task = NULL; - list_del_counter(counter, ctx); if (!ctx->task) { @@ -279,7 +294,6 @@ retry: */ if (!list_empty(&counter->list_entry)) { list_del_counter(counter, ctx); - counter->task = NULL; } spin_unlock_irq(&ctx->lock); } @@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info) * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no counters. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); /* @@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx, return; } - counter->task = task; retry: task_oncpu_function_call(task, __perf_install_in_context, counter); @@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info) * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); counter->prev_state = counter->state; @@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; struct pt_regs *regs; - if (likely(!cpuctx->task_ctx)) + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); @@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + if (!cpuctx->task_ctx) + return; __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } @@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, void perf_counter_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; + if (likely(!ctx)) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) int perf_counter_task_disable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1020,12 +1047,12 @@ int 
perf_counter_task_disable(void) int perf_counter_task_enable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; int cpu; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &curr->perf_counter_ctx; + ctx = curr->perf_counter_ctxp; perf_adjust_freq(&cpuctx->ctx); - perf_adjust_freq(ctx); + if (ctx) + perf_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); - __perf_counter_task_sched_out(ctx); + if (ctx) + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); - perf_counter_task_sched_in(curr, cpu); + if (ctx) + perf_counter_task_sched_in(curr, cpu); } /* @@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; + struct perf_counter_context *tctx; struct task_struct *task; /* @@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); - ctx = &task->perf_counter_ctx; - ctx->task = task; - /* Reuse ptrace permission checks for now. */ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_context(ctx); + put_task_struct(task); return ERR_PTR(-EACCES); } + ctx = task->perf_counter_ctxp; + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!ctx) { + put_task_struct(task); + return ERR_PTR(-ENOMEM); + } + __perf_counter_init_context(ctx, task); + /* + * Make sure other cpus see correct values for *ctx + * once task->perf_counter_ctxp is visible to them. + */ + smp_wmb(); + tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); + if (tctx) { + /* + * We raced with some other task; use + * the context they set. 
+ */ + kfree(ctx); + ctx = tctx; + } + } + return ctx; } @@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + put_ctx(counter->ctx); kfree(counter); } @@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) perf_counter_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); + perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); } void perf_counter_comm(struct task_struct *task) @@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task) if (!atomic_read(&nr_comm_tracking)) return; - + if (!current->perf_counter_ctxp) + return; + comm_event = (struct perf_comm_event){ .task = task, .event = { @@ -2372,7 +2444,7 @@ got_name: perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); kfree(buf); } @@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, if (!atomic_read(&nr_mmap_tracking)) return; + if (!current->perf_counter_ctxp) + return; mmap_event = (struct perf_mmap_event){ .file = file, @@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -3149,21 +3224,6 @@ err_put_context: goto out_fput; } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - ctx->task = task; -} - /* * inherit a counter from parent task to child task: */ @@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link it up in the child's context: */ - child_counter->task = child; add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; @@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; /* - * If we do not self-reap then we have to wait for the - * child task to unschedule (it will happen for sure), - * so that its counter is at its final count. (This - * condition triggers rarely - child tasks usually get - * off their CPU before the parent has a chance to - * get this far into the reaping action) + * Protect against concurrent operations on child_counter + * due its fd getting closed, etc. */ - if (child != current) { - wait_task_inactive(child, 0); - update_counter_times(child_counter); - list_del_counter(child_counter, child_ctx); - } else { - struct perf_cpu_context *cpuctx; - unsigned long flags; - - /* - * Disable and unlink this counter. 
- * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: - */ - local_irq_save(flags); - perf_disable(); - - cpuctx = &__get_cpu_var(perf_cpu_context); + mutex_lock(&child_counter->mutex); - group_sched_out(child_counter, cpuctx, child_ctx); - update_counter_times(child_counter); + update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); - list_del_counter(child_counter, child_ctx); - - perf_enable(); - local_irq_restore(flags); - } + mutex_unlock(&child_counter->mutex); parent_counter = child_counter->parent; /* @@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child, * * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. + * (XXX not sure that is true when we get called from flush_old_exec. + * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = &child->perf_counter_ctx; + child_ctx = child->perf_counter_ctxp; - if (likely(!child_ctx->nr_counters)) + if (likely(!child_ctx)) return; + local_irq_save(flags); + __perf_counter_task_sched_out(child_ctx); + child->perf_counter_ctxp = NULL; + local_irq_restore(flags); + + mutex_lock(&child_ctx->mutex); + again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) @@ -3371,6 +3415,10 @@ again: */ if (!list_empty(&child_ctx->counter_list)) goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); } /* @@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; - child_ctx = &child->perf_counter_ctx; - parent_ctx = &parent->perf_counter_ctx; - - __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = NULL; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning: + * counters that have been marked for cloning. + * First allocate and initialize a context for the child. */ - if (likely(!parent_ctx->nr_counters)) + child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!child_ctx) + return; + + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) return; + __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = child_ctx; + /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. -- cgit v1.2.3-70-g09d2 From ff99be573e02e9f7edc23b472c7f9a5ddba12795 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:03 +0200 Subject: perf_counter: x86: Expose INV and EDGE bits Expose the INV and EDGE bits of the PMU to raw configs. 
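For illustration, a raw config using the newly exposed bits could be composed as below. This is a user-space sketch: the EVNTSEL_* helpers and the mk_raw_config() function are made up for this example, and only the bit positions mirror the CORE_EVNTSEL_* masks added in the patch that follows.

#include <stdint.h>

#define EVNTSEL_EVENT(e)	((uint64_t)(e) & 0xffULL)
#define EVNTSEL_UMASK(u)	(((uint64_t)(u) & 0xffULL) << 8)
#define EVNTSEL_EDGE		(1ULL << 18)	/* CORE_EVNTSEL_EDGE_MASK */
#define EVNTSEL_INV		(1ULL << 23)	/* CORE_EVNTSEL_INV_MASK  */

static inline uint64_t mk_raw_config(uint8_t event, uint8_t umask,
				     int edge, int inv)
{
	uint64_t config = EVNTSEL_EVENT(event) | EVNTSEL_UMASK(umask);

	if (edge)
		config |= EVNTSEL_EDGE;
	if (inv)
		config |= EVNTSEL_INV;

	return config;	/* would be handed to the kernel as a raw event config */
}
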
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.494709027@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cc1660db8d..c14437faf5d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ CORE_EVNTSEL_COUNTER_MASK) return event & CORE_EVNTSEL_MASK; @@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ K7_EVNTSEL_COUNTER_MASK) return event & K7_EVNTSEL_MASK; -- cgit v1.2.3-70-g09d2 From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: perf_counter: x86: Remove interrupt throttle remove the x86 specific interrupt throttle Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/cpu/perf_counter.c | 47 ++++---------------------------------- include/linux/perf_counter.h | 2 -- 3 files changed, 5 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b4f64402a82..89b63b5fad3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); - - perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c14437faf5d..8c8177f859f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } -/* - * Maximum interrupt frequency of 100KHz per CPU - */ -#define PERFMON_MAX_INTERRUPTS (100000/HZ) - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -775,15 +770,14 @@ again: if (status) goto again; - if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) - perf_enable(); + perf_enable(); return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu, idx, throttle = 0, handled = 0; + int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; @@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if 
(++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { - throttle = 1; - __perf_disable(); - cpuc->enabled = 0; - barrier(); - } - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int disable = 0; - if (!test_bit(idx, cpuc->active_mask)) continue; @@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) hwc = &counter->hw; if (counter->hw_event.nmi != nmi) - goto next; + continue; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - goto next; + continue; /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - disable = perf_counter_overflow(counter, nmi, regs, 0); - -next: - if (disable || throttle) + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void perf_counter_unthrottle(void) -{ - struct cpu_hw_counters *cpuc; - - if (!x86_pmu_initialized()) - return; - - cpuc = &__get_cpu_var(cpu_hw_counters); - if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - /* - * Clear them before re-enabling irqs/NMIs again: - */ - cpuc->interrupts = 0; - perf_enable(); - } else { - cpuc->interrupts = 0; - } -} - void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d3e85de9bf1..0c160be2078 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -570,7 +570,6 @@ extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); -extern void perf_counter_unthrottle(void); extern void __perf_disable(void); extern bool __perf_enable(void); extern void perf_disable(void); @@ -635,7 +634,6 @@ static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -static inline void perf_counter_unthrottle(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit v1.2.3-70-g09d2 From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: perf_counter: Generic per counter interrupt throttle Introduce a generic per counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when its going too fast when a pmu->unthrottle() method is provided which can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method. 
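A condensed sketch of the throttling decision this introduces is shown below. Names and types are simplified, SKETCH_HZ and the sysctl value are placeholders for this example; the real logic lives in perf_counter_overflow(), with the per-tick unthrottle handled in perf_adjust_freq(), as the patch below shows.

#define SKETCH_HZ	1000ULL			/* assumed tick rate */
#define MAX_INTERRUPTS	(~0ULL)

struct counter_sketch {
	unsigned long long interrupts;		/* overflow interrupts seen this tick */
	int has_unthrottle;			/* pmu->unthrottle != NULL */
};

static unsigned long long sysctl_limit = 100000;	/* max NMIs per second */

/* Returns 1 when the overflow handler should quick-disable (throttle). */
static int overflow_should_throttle(struct counter_sketch *c)
{
	int ret = 0;

	if (!c->has_unthrottle) {
		c->interrupts++;		/* no way to undo, so never throttle */
	} else if (c->interrupts != MAX_INTERRUPTS) {
		c->interrupts++;
		if (SKETCH_HZ * c->interrupts > sysctl_limit) {
			c->interrupts = MAX_INTERRUPTS;
			ret = 1;	/* tick handler later logs and calls ->unthrottle() */
		}
	}
	return ret;
}
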
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 13 +++++++++ include/linux/perf_counter.h | 11 +++++++ kernel/perf_counter.c | 59 +++++++++++++++++++++++++++++++++++--- kernel/sysctl.c | 8 ++++++ 4 files changed, 87 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8c8177f859f..c4b543d1a86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -623,6 +623,18 @@ try_generic: return 0; } +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; @@ -1038,6 +1050,7 @@ static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, }; const struct pmu *hw_perf_counter_init(struct perf_counter *counter) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0c160be2078..e3a7585d3e4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -266,6 +266,15 @@ enum perf_event_type { */ PERF_EVENT_PERIOD = 4, + /* + * struct { + * struct perf_event_header header; + * u64 time; + * }; + */ + PERF_EVENT_THROTTLE = 5, + PERF_EVENT_UNTHROTTLE = 6, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -367,6 +376,7 @@ struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); + void (*unthrottle) (struct perf_counter *counter); }; /** @@ -613,6 +623,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; extern int sysctl_perf_counter_mlock; +extern int sysctl_perf_counter_limit; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 14b1fe98483..ec9c4007a7f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,6 +46,7 @@ static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ +int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1066,12 +1067,15 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 irq_period; + u64 interrupts, irq_period; u64 events, period; s64 delta; @@ -1080,10 +1084,19 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; + interrupts = counter->hw.interrupts; + counter->hw.interrupts 
= 0; + + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(counter, 1); + counter->pmu->unthrottle(counter); + interrupts = 2*sysctl_perf_counter_limit/HZ; + } + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) continue; - events = HZ * counter->hw.interrupts * counter->hw.irq_period; + events = HZ * interrupts * counter->hw.irq_period; period = div64_u64(events, counter->hw_event.irq_freq); delta = (s64)(1 + period - counter->hw.irq_period); @@ -1097,7 +1110,6 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) perf_log_period(counter, irq_period); counter->hw.irq_period = irq_period; - counter->hw.interrupts = 0; } spin_unlock(&ctx->lock); } @@ -2543,6 +2555,35 @@ static void perf_log_period(struct perf_counter *counter, u64 period) perf_output_end(&handle); } +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_counter *counter, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + } throttle_event = { + .header = { + .type = PERF_EVENT_THROTTLE + 1, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = sched_clock(), + }; + + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ @@ -2551,9 +2592,19 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); + int throttle = counter->pmu->unthrottle != NULL; int ret = 0; - counter->hw.interrupts++; + if (!throttle) { + counter->hw.interrupts++; + } else if (counter->hw.interrupts != MAX_INTERRUPTS) { + counter->hw.interrupts++; + if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { + counter->hw.interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } /* * XXX event_limit might not quite work as expected on inherited diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3cb1849f598..0c4bf863afa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -930,6 +930,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_int_limit", + .data = &sysctl_perf_counter_limit, + .maxlen = sizeof(sysctl_perf_counter_limit), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3-70-g09d2 From 53b441a565bf4036ab49c8ea04c5ad06ace7dd6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 21:41:28 +0200 Subject: Revert "perf_counter, x86: speed up the scheduling fast-path" This reverts commit b68f1d2e7aa21029d73c7d453a8046e95d351740. It is causing problems (stuck/stuttering profiling) - when mixed NMI and non-NMI counters are used. 
Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c4b543d1a86..189bf9d7cda 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -293,7 +293,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } - perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -612,6 +611,8 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } + perf_counters_lapic_init(hwc->nmi); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1037,7 +1038,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(1); + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3-70-g09d2 From 79202ba9ff8cf570a75596f42e011167734d1c4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter, x86: Fix APIC NMI programming My Nehalem box locks up in certain situations (with an always-asserted NMI causing a lockup) if the PMU LVT entry is programmed between NMI and IRQ mode with a high frequency. Standardize exlusively on NMIs instead. [ Impact: fix lockup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 189bf9d7cda..ece3813c7a3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,14 +285,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; /* - * If privileged enough, allow NMI events: + * Use NMI events all the time: */ - hwc->nmi = 0; - if (hw_event->nmi) { - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) - return -EACCES; - hwc->nmi = 1; - } + hwc->nmi = 1; + hw_event->nmi = 1; if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -553,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - if (unlikely(hwc->nmi)) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) @@ -806,9 +799,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; - if (counter->hw_event.nmi != nmi) - continue; - val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; -- cgit v1.2.3-70-g09d2 From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter, x86: Make NMI lockups more robust We have a debug check that detects stuck NMIs and returns with the PMU disabled in the global ctrl MSR - but i managed to trigger a situation where this was not enough to deassert the NMI. 
So clear/reset the full PMU and keep the disable count balanced when exiting from here. This way the box produces a debug warning but stays up and is more debuggable. [ Impact: in case of PMU related bugs, recover more gracefully ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ece3813c7a3..2eeaa99add1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } +static void intel_pmu_reset(void) +{ + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + + local_irq_restore(flags); +} + + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -750,6 +774,8 @@ again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); perf_counter_print_debug(); + intel_pmu_reset(); + perf_enable(); return 1; } -- cgit v1.2.3-70-g09d2 From c323d95fa4dbe0b6bf6d59e24a0b7db067dd08a7 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Fri, 29 May 2009 13:28:35 +0800 Subject: perf_counter/x86: Always use NMI for performance-monitoring interrupt Always use NMI for performance-monitoring interrupt as there could be racy situations if we switch between irq and nmi mode frequently. 
Signed-off-by: Yong Wang LKML-Reference: <20090529052835.GA13657@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 4 ++-- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 19 +++++-------------- 3 files changed, 8 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index d08dd52cb8f..876ed97147b 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -91,10 +91,10 @@ extern void set_perf_counter_pending(void); #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(int nmi); +extern void perf_counters_lapic_init(void); #else static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(int nmi) { } +static inline void perf_counters_lapic_init(void) { } #endif #endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 89b63b5fad3..60df2efd7c8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1135,7 +1135,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(0); + perf_counters_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2eeaa99add1..316b0c995f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -604,7 +604,7 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); + perf_counters_lapic_init(); x86_pmu.disable(hwc, idx); @@ -863,24 +863,15 @@ void set_perf_counter_pending(void) apic->send_IPI_self(LOCAL_PENDING_VECTOR); } -void perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(void) { - u32 apic_val; - if (!x86_pmu_initialized()) return; /* - * Enable the performance counter vector in the APIC LVT: + * Always use NMI for PMU */ - apic_val = apic_read(APIC_LVTERR); - - apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); - if (nmi) - apic_write(APIC_LVTPC, APIC_DM_NMI); - else - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - apic_write(APIC_LVTERR, apic_val); + apic_write(APIC_LVTPC, APIC_DM_NMI); } static int __kprobes @@ -1054,7 +1045,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3-70-g09d2 From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64. 
Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 12 ++--- arch/x86/kernel/cpu/perf_counter.c | 8 +-- include/linux/perf_counter.h | 32 ++++++------ kernel/perf_counter.c | 104 ++++++++++++++++++------------------- 4 files changed, 78 insertions(+), 78 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index f96d55f55bd..c9633321e7a 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -535,7 +535,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw.irq_period) { + if (counter->hw.sample_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -749,12 +749,12 @@ static void power_pmu_unthrottle(struct perf_counter *counter) s64 val, left; unsigned long flags; - if (!counter->hw.idx || !counter->hw.irq_period) + if (!counter->hw.idx || !counter->hw.sample_period) return; local_irq_save(flags); perf_disable(); power_pmu_read(counter); - left = counter->hw.irq_period; + left = counter->hw.sample_period; val = 0; if (left < 0x80000000L) val = 0x80000000L - left; @@ -789,7 +789,7 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, if (counter->hw_event.exclude_user || counter->hw_event.exclude_kernel || counter->hw_event.exclude_hv - || counter->hw_event.irq_period) + || counter->hw_event.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -925,7 +925,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.sample_period); /* * See if we need to reserve the PMU. 
@@ -958,7 +958,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { - u64 period = counter->hw.irq_period; + u64 period = counter->hw.sample_period; s64 prev, delta, left; int record = 0; u64 addr, mmcra, sdsync; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 316b0c995f3..ec06aa5e928 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hw_event->nmi = 1; - if (!hwc->irq_period) - hwc->irq_period = x86_pmu.max_period; + if (!hwc->sample_period) + hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->irq_period)); + min(x86_pmu.max_period, hwc->sample_period)); /* * Raw event type provide the config in the event structure @@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->irq_period); + s64 period = min(x86_pmu.max_period, hwc->sample_period); int err; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4845a214b9e..1fcd3cc9385 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -94,18 +94,18 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.record_type to request information + * Bits that can be set in hw_event.sample_type to request information * in the overflow packets. */ -enum perf_counter_record_format { - PERF_RECORD_IP = 1U << 0, - PERF_RECORD_TID = 1U << 1, - PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_ADDR = 1U << 3, - PERF_RECORD_GROUP = 1U << 4, - PERF_RECORD_CALLCHAIN = 1U << 5, - PERF_RECORD_CONFIG = 1U << 6, - PERF_RECORD_CPU = 1U << 7, +enum perf_counter_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_CONFIG = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, }; /* @@ -132,12 +132,12 @@ struct perf_counter_hw_event { __u64 config; union { - __u64 irq_period; - __u64 irq_freq; + __u64 sample_period; + __u64 sample_freq; }; - __u32 record_type; - __u32 read_format; + __u64 sample_type; + __u64 read_format; __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ @@ -262,7 +262,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; - * u64 irq_period; + * u64 sample_period; * }; */ PERF_EVENT_PERIOD = 4, @@ -363,7 +363,7 @@ struct hw_perf_counter { }; }; atomic64_t prev_count; - u64 irq_period; + u64 sample_period; atomic64_t period_left; u64 interrupts; #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 978ecfcc7aa..5ecd9981c03 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1186,7 +1186,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 interrupts, irq_period; + u64 interrupts, sample_period; u64 events, period; s64 delta; @@ -1204,23 +1204,23 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || 
!counter->hw_event.irq_freq) + if (!counter->hw_event.freq || !counter->hw_event.sample_freq) continue; - events = HZ * interrupts * counter->hw.irq_period; - period = div64_u64(events, counter->hw_event.irq_freq); + events = HZ * interrupts * counter->hw.sample_period; + period = div64_u64(events, counter->hw_event.sample_freq); - delta = (s64)(1 + period - counter->hw.irq_period); + delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; - irq_period = counter->hw.irq_period + delta; + sample_period = counter->hw.sample_period + delta; - if (!irq_period) - irq_period = 1; + if (!sample_period) + sample_period = 1; - perf_log_period(counter, irq_period); + perf_log_period(counter, sample_period); - counter->hw.irq_period = irq_period; + counter->hw.sample_period = sample_period; } spin_unlock(&ctx->lock); } @@ -2297,7 +2297,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 record_type = counter->hw_event.record_type; + u64 sample_type = counter->hw_event.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2321,61 +2321,61 @@ static void perf_counter_output(struct perf_counter *counter, header.misc = PERF_EVENT_MISC_OVERFLOW; header.misc |= perf_misc_flags(regs); - if (record_type & PERF_RECORD_IP) { + if (sample_type & PERF_SAMPLE_IP) { ip = perf_instruction_pointer(regs); - header.type |= PERF_RECORD_IP; + header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } - if (record_type & PERF_RECORD_TID) { + if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ tid_entry.pid = perf_counter_pid(counter, current); tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_RECORD_TID; + header.type |= PERF_SAMPLE_TID; header.size += sizeof(tid_entry); } - if (record_type & PERF_RECORD_TIME) { + if (sample_type & PERF_SAMPLE_TIME) { /* * Maybe do better on x86 and provide cpu_clock_nmi() */ time = sched_clock(); - header.type |= PERF_RECORD_TIME; + header.type |= PERF_SAMPLE_TIME; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_ADDR) { - header.type |= PERF_RECORD_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) { + header.type |= PERF_SAMPLE_ADDR; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CONFIG) { - header.type |= PERF_RECORD_CONFIG; + if (sample_type & PERF_SAMPLE_CONFIG) { + header.type |= PERF_SAMPLE_CONFIG; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CPU) { - header.type |= PERF_RECORD_CPU; + if (sample_type & PERF_SAMPLE_CPU) { + header.type |= PERF_SAMPLE_CPU; header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); } - if (record_type & PERF_RECORD_GROUP) { - header.type |= PERF_RECORD_GROUP; + if (sample_type & PERF_SAMPLE_GROUP) { + header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } - if (record_type & PERF_RECORD_CALLCHAIN) { + if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= PERF_RECORD_CALLCHAIN; + header.type |= PERF_SAMPLE_CALLCHAIN; header.size += callchain_size; } } @@ -2386,28 +2386,28 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, header); - if (record_type & PERF_RECORD_IP) + if (sample_type & PERF_SAMPLE_IP) perf_output_put(&handle, ip); - if (record_type & PERF_RECORD_TID) + if (sample_type & PERF_SAMPLE_TID) perf_output_put(&handle, tid_entry); - if 
(record_type & PERF_RECORD_TIME) + if (sample_type & PERF_SAMPLE_TIME) perf_output_put(&handle, time); - if (record_type & PERF_RECORD_ADDR) + if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, addr); - if (record_type & PERF_RECORD_CONFIG) + if (sample_type & PERF_SAMPLE_CONFIG) perf_output_put(&handle, counter->hw_event.config); - if (record_type & PERF_RECORD_CPU) + if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); /* - * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. + * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. */ - if (record_type & PERF_RECORD_GROUP) { + if (sample_type & PERF_SAMPLE_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2702,7 +2702,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * Log irq_period changes so that analyzing tools can re-normalize the + * Log sample_period changes so that analyzing tools can re-normalize the * event flow. */ @@ -2725,7 +2725,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) .period = period, }; - if (counter->hw.irq_period == period) + if (counter->hw.sample_period == period) return; ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); @@ -2834,7 +2834,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = hwc->sample_period; if (unlikely(left <= -period)) { left = period; @@ -2874,7 +2874,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - period = max_t(u64, 10000, counter->hw.irq_period); + period = max_t(u64, 10000, counter->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; @@ -2959,7 +2959,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg && regs) + if (counter->hw.sample_period && !neg && regs) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3080,8 +3080,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, cpu_clock(cpu)); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3092,7 +3092,7 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -3132,8 +3132,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3144,7 +3144,7 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static 
void task_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); task_clock_perf_counter_update(counter, counter->ctx->time); @@ -3223,7 +3223,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.irq_period = counter->hw_event.irq_period; + counter->hw.sample_period = counter->hw_event.sample_period; return &perf_ops_generic; } @@ -3323,15 +3323,15 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->irq_freq) - hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); + if (hw_event->freq && hw_event->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); else - hwc->irq_period = hw_event->irq_period; + hwc->sample_period = hw_event->sample_period; /* - * we currently do not support PERF_RECORD_GROUP on inherited counters + * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) + if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) goto done; if (perf_event_raw(hw_event)) { -- cgit v1.2.3-70-g09d2 From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:27:45 +0200 Subject: perf_counter: Remove the last nmi/irq bits IRQ (non-NMI) sampling is not used anymore - remove the last few bits. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ------ include/linux/perf_counter.h | 4 +--- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ec06aa5e928..9e144fbebd2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - /* - * Use NMI events all the time: - */ - hwc->nmi = 1; - hw_event->nmi = 1; - if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1fcd3cc9385..cef9931793f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -140,7 +140,6 @@ struct perf_counter_hw_event { __u64 read_format; __u64 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -153,7 +152,7 @@ struct perf_counter_hw_event { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 51; + __reserved_1 : 52; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -354,7 +353,6 @@ struct hw_perf_counter { u64 config; unsigned long config_base; unsigned long counter_base; - int nmi; int idx; }; union { /* software */ -- cgit v1.2.3-70-g09d2 From e4abb5d4f7ddabc1fc7c392cf0a10d8e5868c9ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:08:20 +0200 Subject: perf_counter: x86: Emulate longer sample periods Do as Power already does, 
emulate sample periods up to 2^63-1 by composing them of smaller values limited by hardware capabilities. Only once we wrap the software period do we generate an overflow event. Just 10 lines of new code. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9e144fbebd2..904571bea71 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -287,8 +287,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; - atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->sample_period)); + atomic64_set(&hwc->period_left, hwc->sample_period); /* * Raw event type provide the config in the event structure @@ -451,13 +450,13 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * Set the next IRQ period, based on the hwc->period_left value. * To be called with the counter disabled in hw: */ -static void +static int x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->sample_period); - int err; + s64 period = hwc->sample_period; + int err, ret = 0; /* * If we are way outside a reasoable range then just skip forward: @@ -465,11 +464,13 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 event is left: @@ -477,6 +478,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left < 2)) left = 2; + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + per_cpu(prev_left[idx], smp_processor_id()) = left; /* @@ -487,6 +491,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + + return ret; } static inline void @@ -706,16 +712,19 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. 
Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + int ret; x86_perf_counter_update(counter, hwc, idx); - x86_perf_counter_set_period(counter, hwc, idx); + ret = x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) intel_pmu_enable_counter(hwc, idx); + + return ret; } static void intel_pmu_reset(void) @@ -782,7 +791,9 @@ again: if (!test_bit(bit, cpuc->active_mask)) continue; - intel_pmu_save_and_restart(counter); + if (!intel_pmu_save_and_restart(counter)) + continue; + if (perf_counter_overflow(counter, nmi, regs, 0)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -824,9 +835,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; /* counter overflow */ - x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } -- cgit v1.2.3-70-g09d2 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 38 ++++++------ arch/x86/kernel/cpu/perf_counter.c | 16 ++--- include/linux/perf_counter.h | 34 +++++------ include/linux/syscalls.h | 4 +- kernel/perf_counter.c | 116 ++++++++++++++++++------------------- 5 files changed, 104 insertions(+), 104 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c9633321e7a..ea54686cb78 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -262,13 +262,13 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], } counter = ctrs[i]; if (first) { - eu = counter->hw_event.exclude_user; - ek = counter->hw_event.exclude_kernel; - eh = counter->hw_event.exclude_hv; + eu = counter->attr.exclude_user; + ek = counter->attr.exclude_kernel; + eh = counter->attr.exclude_hv; first = 0; - } else if (counter->hw_event.exclude_user != eu || - counter->hw_event.exclude_kernel != ek || - counter->hw_event.exclude_hv != eh) { + } else if (counter->attr.exclude_user != eu || + counter->attr.exclude_kernel != ek || + counter->attr.exclude_hv != eh) { return -EAGAIN; } } @@ -483,16 +483,16 @@ void hw_perf_enable(void) /* * Add in MMCR0 freeze bits corresponding to the - * hw_event.exclude_* bits for the first counter. + * attr.exclude_* bits for the first counter. * We have already checked that all counters have the * same values for these bits as the first counter. 
*/ counter = cpuhw->counter[0]; - if (counter->hw_event.exclude_user) + if (counter->attr.exclude_user) cpuhw->mmcr[0] |= MMCR0_FCP; - if (counter->hw_event.exclude_kernel) + if (counter->attr.exclude_kernel) cpuhw->mmcr[0] |= freeze_counters_kernel; - if (counter->hw_event.exclude_hv) + if (counter->attr.exclude_hv) cpuhw->mmcr[0] |= MMCR0_FCHV; /* @@ -786,10 +786,10 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, int n; u64 alt[MAX_EVENT_ALTERNATIVES]; - if (counter->hw_event.exclude_user - || counter->hw_event.exclude_kernel - || counter->hw_event.exclude_hv - || counter->hw_event.sample_period) + if (counter->attr.exclude_user + || counter->attr.exclude_kernel + || counter->attr.exclude_hv + || counter->attr.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -855,13 +855,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->hw_event)) { - ev = perf_event_id(&counter->hw_event); + if (!perf_event_raw(&counter->attr)) { + ev = perf_event_id(&counter->attr); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->hw_event); + ev = perf_event_config(&counter->attr); } counter->hw.config_base = ev; counter->hw.idx = 0; @@ -872,7 +872,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) * the user set it to. */ if (!firmware_has_feature(FW_FEATURE_LPAR)) - counter->hw_event.exclude_hv = 0; + counter->attr.exclude_hv = 0; /* * If this is a per-task counter, then we can use @@ -990,7 +990,7 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { addr = 0; - if (counter->hw_event.record_type & PERF_RECORD_ADDR) { + if (counter->attr.record_type & PERF_RECORD_ADDR) { /* * The user wants a data address recorded. * If we're not doing instruction sampling, diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 904571bea71..e16e8c13132 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void) } /* - * Setup the hardware configuration for a given hw_event_type + * Setup the hardware configuration for a given attr_type */ static int __hw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; + struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; int err; @@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Count user and OS events unless requested not to. 
*/ - if (!hw_event->exclude_user) + if (!attr->exclude_user) hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!hw_event->exclude_kernel) + if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; if (!hwc->sample_period) @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); + if (perf_event_raw(attr)) { + hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); } else { - if (perf_event_id(hw_event) >= x86_pmu.max_events) + if (perf_event_id(attr) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(attr)); } counter->destroy = hw_perf_counter_destroy; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 45bdd3b95d3..37d5541d74c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -22,7 +22,7 @@ */ /* - * hw_event.type + * attr.type */ enum perf_event_types { PERF_TYPE_HARDWARE = 0, @@ -37,10 +37,10 @@ enum perf_event_types { }; /* - * Generalized performance counter event types, used by the hw_event.event_id + * Generalized performance counter event types, used by the attr.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum hw_event_ids { +enum attr_ids { /* * Common hardware events, generalized by the kernel: */ @@ -94,7 +94,7 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.sample_type to request information + * Bits that can be set in attr.sample_type to request information * in the overflow packets. */ enum perf_counter_sample_format { @@ -109,7 +109,7 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in hw_event.read_format to request that + * Bits that can be set in attr.read_format to request that * reads on the counter should return the indicated quantities, * in increasing order of bit value, after the counter value. 
*/ @@ -122,7 +122,7 @@ enum perf_counter_read_format { /* * Hardware event to monitor via a performance monitoring counter: */ -struct perf_counter_hw_event { +struct perf_counter_attr { /* * The MSB of the config word signifies if the rest contains cpu * specific (raw) counter configuration data, if unset, the next @@ -323,25 +323,25 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_raw(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_RAW_MASK; + return attr->config & PERF_COUNTER_RAW_MASK; } -static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_config(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_CONFIG_MASK; + return attr->config & PERF_COUNTER_CONFIG_MASK; } -static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_type(struct perf_counter_attr *attr) { - return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + return (attr->config & PERF_COUNTER_TYPE_MASK) >> PERF_COUNTER_TYPE_SHIFT; } -static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_id(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_EVENT_MASK; + return attr->config & PERF_COUNTER_EVENT_MASK; } /** @@ -457,7 +457,7 @@ struct perf_counter { u64 tstamp_running; u64 tstamp_stopped; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -605,8 +605,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->hw_event) && - perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->attr) && + perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 79faae950e2..c6c84ad8bd7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,7 +55,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; -struct perf_counter_hw_event; +struct perf_counter_attr; #include #include @@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( - const struct perf_counter_hw_event __user *hw_event_uptr, + const struct perf_counter_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index abe2f3b6c42..317cef78a38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -260,7 +260,7 @@ counter_sched_out(struct perf_counter *counter, if (!is_software_counter(counter)) cpuctx->active_oncpu--; ctx->nr_active--; - if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + if (counter->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -282,7 +282,7 @@ group_sched_out(struct perf_counter *group_counter, list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); - if (group_counter->hw_event.exclusive) + if (group_counter->attr.exclusive) cpuctx->exclusive = 0; } @@ -550,7 +550,7 @@ counter_sched_in(struct perf_counter *counter, cpuctx->active_oncpu++; 
ctx->nr_active++; - if (counter->hw_event.exclusive) + if (counter->attr.exclusive) cpuctx->exclusive = 1; return 0; @@ -642,7 +642,7 @@ static int group_can_go_on(struct perf_counter *counter, * If this group is exclusive and there are already * counters on the CPU, it can't go on. */ - if (counter->hw_event.exclusive && cpuctx->active_oncpu) + if (counter->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -725,7 +725,7 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -849,7 +849,7 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -927,7 +927,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) /* * not supported on inherited counters */ - if (counter->hw_event.inherit) + if (counter->attr.inherit) return -EINVAL; atomic_add(refresh, &counter->event_limit); @@ -1094,7 +1094,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->hw_event.pinned) + !counter->attr.pinned) continue; if (counter->cpu != -1 && counter->cpu != cpu) continue; @@ -1122,7 +1122,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * ignore pinned counters since we did them already. */ if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->hw_event.pinned) + counter->attr.pinned) continue; /* @@ -1204,11 +1204,11 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || !counter->hw_event.sample_freq) + if (!counter->attr.freq || !counter->attr.sample_freq) continue; events = HZ * interrupts * counter->hw.sample_period; - period = div64_u64(events, counter->hw_event.sample_freq); + period = div64_u64(events, counter->attr.sample_freq); delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; @@ -1444,11 +1444,11 @@ static void free_counter(struct perf_counter *counter) perf_pending_sync(counter); atomic_dec(&nr_counters); - if (counter->hw_event.mmap) + if (counter->attr.mmap) atomic_dec(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_dec(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_dec(&nr_comm_tracking); if (counter->destroy) @@ -1504,13 +1504,13 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + atomic64_read(&counter->child_total_time_enabled); - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - if (counter->hw_event.read_format & PERF_FORMAT_ID) + if (counter->attr.read_format & PERF_FORMAT_ID) values[n++] = counter->id; 
mutex_unlock(&counter->child_mutex); @@ -1611,7 +1611,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) int ret = 0; u64 value; - if (!counter->hw_event.sample_period) + if (!counter->attr.sample_period) return -EINVAL; size = copy_from_user(&value, arg, sizeof(value)); @@ -1622,15 +1622,15 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) return -EINVAL; spin_lock_irq(&ctx->lock); - if (counter->hw_event.freq) { + if (counter->attr.freq) { if (value > sysctl_perf_counter_limit) { ret = -EINVAL; goto unlock; } - counter->hw_event.sample_freq = value; + counter->attr.sample_freq = value; } else { - counter->hw_event.sample_period = value; + counter->attr.sample_period = value; counter->hw.sample_period = value; perf_log_period(counter, value); @@ -2299,7 +2299,7 @@ static void perf_output_end(struct perf_output_handle *handle) struct perf_counter *counter = handle->counter; struct perf_mmap_data *data = handle->data; - int wakeup_events = counter->hw_event.wakeup_events; + int wakeup_events = counter->attr.wakeup_events; if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&data->events); @@ -2339,7 +2339,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 sample_type = counter->hw_event.sample_type; + u64 sample_type = counter->attr.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2441,7 +2441,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, addr); if (sample_type & PERF_SAMPLE_CONFIG) - perf_output_put(&handle, counter->hw_event.config); + perf_output_put(&handle, counter->attr.config); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); @@ -2512,7 +2512,7 @@ static void perf_counter_comm_output(struct perf_counter *counter, static int perf_counter_comm_match(struct perf_counter *counter, struct perf_comm_event *comm_event) { - if (counter->hw_event.comm && + if (counter->attr.comm && comm_event->event.header.type == PERF_EVENT_COMM) return 1; @@ -2623,11 +2623,11 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->hw_event.mmap && + if (counter->attr.mmap && mmap_event->event.header.type == PERF_EVENT_MMAP) return 1; - if (counter->hw_event.munmap && + if (counter->attr.munmap && mmap_event->event.header.type == PERF_EVENT_MUNMAP) return 1; @@ -2907,8 +2907,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. 
*/ - if ((counter->hw_event.exclude_kernel || !regs) && - !counter->hw_event.exclude_user) + if ((counter->attr.exclude_kernel || !regs) && + !counter->attr.exclude_user) regs = task_pt_regs(current); if (regs) { @@ -2982,14 +2982,14 @@ static int perf_swcounter_match(struct perf_counter *counter, if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->hw_event.config != event_config) + if (counter->attr.config != event_config) return 0; if (regs) { - if (counter->hw_event.exclude_user && user_mode(regs)) + if (counter->attr.exclude_user && user_mode(regs)) return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) + if (counter->attr.exclude_kernel && !user_mode(regs)) return 0; } @@ -3252,12 +3252,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->hw_event)); + ftrace_profile_disable(perf_event_id(&counter->attr)); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->hw_event); + int event_id = perf_event_id(&counter->attr); int ret; ret = ftrace_profile_enable(event_id); @@ -3265,7 +3265,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->hw_event.sample_period; + counter->hw.sample_period = counter->attr.sample_period; return &perf_ops_generic; } @@ -3287,7 +3287,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->hw_event)) { + switch (perf_event_id(&counter->attr)) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3319,7 +3319,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, +perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, @@ -3352,36 +3352,36 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); counter->cpu = cpu; - counter->hw_event = *hw_event; + counter->attr = *attr; counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; counter->oncpu = -1; counter->state = PERF_COUNTER_STATE_INACTIVE; - if (hw_event->disabled) + if (attr->disabled) counter->state = PERF_COUNTER_STATE_OFF; pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); + if (attr->freq && attr->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); else - hwc->sample_period = hw_event->sample_period; + hwc->sample_period = attr->sample_period; /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(hw_event)) { + if (perf_event_raw(attr)) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(hw_event)) { + switch (perf_event_type(attr)) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; @@ -3409,11 +3409,11 @@ done: counter->pmu = pmu; atomic_inc(&nr_counters); - if (counter->hw_event.mmap) + if 
(counter->attr.mmap) atomic_inc(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_inc(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_inc(&nr_comm_tracking); return counter; @@ -3424,17 +3424,17 @@ static atomic64_t perf_counter_id; /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * - * @hw_event_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_hw_event __user *, hw_event_uptr, + const struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct perf_counter_context *ctx; struct file *counter_file = NULL; struct file *group_file = NULL; @@ -3446,7 +3446,7 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) + if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; /* @@ -3484,11 +3484,11 @@ SYSCALL_DEFINE5(perf_counter_open, /* * Only a group leader can be exclusive or pinned */ - if (hw_event.exclusive || hw_event.pinned) + if (attr.exclusive || attr.pinned) goto err_put_context; } - counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) @@ -3556,7 +3556,7 @@ inherit_counter(struct perf_counter *parent_counter, if (parent_counter->parent) parent_counter = parent_counter->parent; - child_counter = perf_counter_alloc(&parent_counter->hw_event, + child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) @@ -3565,7 +3565,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, + * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_counter_{en, dis}able_family. */ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) @@ -3582,7 +3582,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * inherit into child's child as well: */ - child_counter->hw_event.inherit = 1; + child_counter->attr.inherit = 1; /* * Get a reference to the parent filp - we will fput it @@ -3838,7 +3838,7 @@ int perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) { + if (!counter->attr.inherit) { inherited_all = 0; continue; } -- cgit v1.2.3-70-g09d2 From a32881066e58346f2901afe0ebdfbf0c562877e5 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 3 Jun 2009 13:12:55 +0800 Subject: perf_counter/x86: Remove the IRQ (non-NMI) handling bits Remove the IRQ (non-NMI) handling bits as NMI will be used always. 
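A minimal sketch of the path that survives this change, pieced together from the hunks below (simplified: the active-counter checks and APIC LVTPC programming of the real notifier are elided, and only the calls visible in the diff are shown). The PMU handler is now reached solely via the NMI die-notifier, its 'nmi' parameter is dropped, and the overflow calls below pass a hard-coded nmi=1.

	static int
	perf_counter_nmi_handler(struct notifier_block *self,
				 unsigned long cmd, void *__args)
	{
		struct pt_regs *regs = ((struct die_args *)__args)->regs;

		/* handle_irq() lost its 'nmi' argument: NMI context is implied */
		x86_pmu.handle_irq(regs);

		return NOTIFY_STOP;
	}
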
Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090603051255.GA2791@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 1 - arch/x86/include/asm/hw_irq.h | 1 - arch/x86/include/asm/irq_vectors.h | 5 ----- arch/x86/kernel/cpu/perf_counter.c | 21 ++++++--------------- arch/x86/kernel/entry_64.S | 2 -- arch/x86/kernel/irqinit_32.c | 1 - arch/x86/kernel/irqinit_64.c | 1 - 7 files changed, 6 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index fe24d280249..d750a10ccad 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,7 +49,6 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) #ifdef CONFIG_PERF_COUNTERS -BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 7309c0ad690..4b4921d7a28 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,7 +29,6 @@ extern void apic_timer_interrupt(void); extern void generic_interrupt(void); extern void error_interrupt(void); -extern void perf_counter_interrupt(void); extern void perf_pending_interrupt(void); extern void spurious_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 545bb811ccb..4492e19f839 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -106,11 +106,6 @@ */ #define LOCAL_TIMER_VECTOR 0xef -/* - * Performance monitoring interrupt vector: - */ -#define LOCAL_PERF_VECTOR 0xee - /* * Generic system vector for platform specific use */ diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16e8c13132..12cc05ed9f4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -40,7 +40,7 @@ struct cpu_hw_counters { struct x86_pmu { const char *name; int version; - int (*handle_irq)(struct pt_regs *, int); + int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); @@ -755,7 +755,7 @@ static void intel_pmu_reset(void) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_counters *cpuc; struct cpu_hw_counters; @@ -794,7 +794,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, nmi, regs, 0)) + if (perf_counter_overflow(counter, 1, regs, 0)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -812,7 +812,7 @@ again: return 1; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +static int amd_pmu_handle_irq(struct pt_regs *regs) { int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; @@ -840,22 +840,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, nmi, regs, 0)) + if (perf_counter_overflow(counter, 1, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void smp_perf_counter_interrupt(struct pt_regs *regs) -{ - 
irq_enter(); - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - ack_APIC_irq(); - x86_pmu.handle_irq(regs, 0); - irq_exit(); -} - void smp_perf_pending_interrupt(struct pt_regs *regs) { irq_enter(); @@ -910,7 +901,7 @@ perf_counter_nmi_handler(struct notifier_block *self, * If the first NMI handles both, the latter will be empty and daze * the CPU. */ - x86_pmu.handle_irq(regs, 1); + x86_pmu.handle_irq(regs); return NOTIFY_STOP; } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 89100461914..7985c010f8a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1026,8 +1026,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_PERF_COUNTERS -apicinterrupt LOCAL_PERF_VECTOR \ - perf_counter_interrupt smp_perf_counter_interrupt apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 3190a6b961e..205bdd880d3 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -165,7 +165,6 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS - alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 53ceb26f80f..fa6ef692000 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -155,7 +155,6 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS - alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } -- cgit v1.2.3-70-g09d2 From 128f048f0f0d2a477ad2555e7acd2ad15a1b6061 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2009 22:19:36 +0200 Subject: perf_counter: Fix throttling lock-up Throttling logic is broken and we can lock up with too small hw sampling intervals. Make the throttling code more robust: disable counters even if we already disabled them. ( Also clean up whitespace damage i noticed while reading various pieces of code related to throttling. 
) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- kernel/perf_counter.c | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 12cc05ed9f4..8f53f3a7da2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -91,7 +91,7 @@ static u64 intel_pmu_raw_event(u64 event) #define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL -#define CORE_EVNTSEL_MASK \ +#define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ CORE_EVNTSEL_EDGE_MASK | \ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ab4455447f8..0bb03f15a5b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2822,11 +2822,20 @@ int perf_counter_overflow(struct perf_counter *counter, if (!throttle) { counter->hw.interrupts++; - } else if (counter->hw.interrupts != MAX_INTERRUPTS) { - counter->hw.interrupts++; - if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { - counter->hw.interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); + } else { + if (counter->hw.interrupts != MAX_INTERRUPTS) { + counter->hw.interrupts++; + if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { + counter->hw.interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling counters even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the counter got enabled again: + */ ret = 1; } } -- cgit v1.2.3-70-g09d2 From f7b6eb3fa07269da20dbbde8ba37a0273fdbd9c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:04:51 +0200 Subject: x86: Set context.vdso before installing the mapping In order to make arch_vma_name() work from inside install_special_mapping() we need to set the context.vdso before calling it. ( This is needed for performance counters to be able to track this special executable area. 
) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/vdso/vdso32-setup.c | 6 +++++- arch/x86/vdso/vma.c | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 1241f118ab5..58bc00f68b1 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } } + current->mm->context.vdso = (void *)addr; + if (compat_uses_vma || !compat) { /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto up_fail; } - current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = VDSO32_SYMBOL(addr, SYSENTER_RETURN); up_fail: + if (ret) + current->mm->context.vdso = NULL; + up_write(&mm->mmap_sem); return ret; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7133cdf9098..93b7a2938b2 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -115,15 +115,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto up_fail; } + current->mm->context.vdso = (void *)addr; + ret = install_special_mapping(mm, addr, vdso_size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, vdso_pages); - if (ret) + if (ret) { + current->mm->context.vdso = NULL; goto up_fail; + } - current->mm->context.vdso = (void *)addr; up_fail: up_write(&mm->mmap_sem); return ret; -- cgit v1.2.3-70-g09d2 From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) 
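For illustration, a hedged user-space sketch of what the split looks like to a tool after this patch; it relies on the sys_perf_counter_open() wrapper the tools in this series already use, and replaces the packed EID(type, id) encoding that the perf.h hunk below removes.

	struct perf_counter_attr attr = {
		.type		= PERF_TYPE_HARDWARE,	 /* major type: its own field now */
		.config		= PERF_COUNT_CPU_CYCLES, /* type-specific event id */
		.sample_period	= 100000,
	};

	/* previously: attr.config = EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES); */
	int fd = sys_perf_counter_open(&attr, /* pid */ 0, /* cpu */ -1,
				       /* group_fd */ -1, /* flags */ 0);
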
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-record.c | 105 ++++++++++------------ Documentation/perf_counter/builtin-stat.c | 76 +++++++--------- Documentation/perf_counter/builtin-top.c | 67 +++++--------- Documentation/perf_counter/perf.h | 2 - Documentation/perf_counter/util/parse-events.c | 120 ++++++++++++++----------- Documentation/perf_counter/util/parse-events.h | 7 +- arch/powerpc/kernel/perf_counter.c | 6 +- arch/x86/kernel/cpu/perf_counter.c | 8 +- include/linux/perf_counter.h | 65 +++----------- kernel/perf_counter.c | 14 ++- 10 files changed, 196 insertions(+), 274 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index c22ea0c7472..130fd88266b 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -20,10 +20,10 @@ #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) -static long default_interval = 100000; -static long event_count[MAX_COUNTERS]; - static int fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static long default_interval = 100000; + static int nr_cpus = 0; static unsigned int page_size; static unsigned int mmap_pages = 128; @@ -38,22 +38,44 @@ static int inherit = 1; static int force = 0; static int append_file = 0; -const unsigned int default_count[] = { - 1000000, - 1000000, - 10000, - 10000, - 1000000, - 10000, +static long samples; +static struct timeval last_read; +static struct timeval this_read; + +static __u64 bytes_written; + +static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; + +static int nr_poll; +static int nr_cpu; + +struct mmap_event { + struct perf_event_header header; + __u32 pid; + __u32 tid; + __u64 start; + __u64 len; + __u64 pgoff; + char filename[PATH_MAX]; +}; + +struct comm_event { + struct perf_event_header header; + __u32 pid; + __u32 tid; + char comm[16]; }; + struct mmap_data { - int counter; - void *base; - unsigned int mask; - unsigned int prev; + int counter; + void *base; + unsigned int mask; + unsigned int prev; }; +static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + static unsigned int mmap_read_head(struct mmap_data *md) { struct perf_counter_mmap_page *pc = md->base; @@ -65,11 +87,6 @@ static unsigned int mmap_read_head(struct mmap_data *md) return head; } -static long samples; -static struct timeval last_read, this_read; - -static __u64 bytes_written; - static void mmap_read(struct mmap_data *md) { unsigned int head = mmap_read_head(md); @@ -157,29 +174,6 @@ static void sig_handler(int sig) done = 1; } -static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; -static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; - -static int nr_poll; -static int nr_cpu; - -struct mmap_event { - struct perf_event_header header; - __u32 pid; - __u32 tid; - __u64 start; - __u64 len; - __u64 pgoff; - char filename[PATH_MAX]; -}; - -struct comm_event { - struct perf_event_header header; - __u32 pid; - __u32 tid; - char comm[16]; -}; - static void pid_synthesize_comm_event(pid_t pid, int full) { struct comm_event comm_ev; @@ -341,24 +335,21 @@ static int group_fd; static void create_counter(int counter, int cpu, pid_t pid) { - struct perf_counter_attr attr; + struct perf_counter_attr *attr = attrs + counter; int track = 1; - memset(&attr, 0, sizeof(attr)); - attr.config 
= event_id[counter]; - attr.sample_period = event_count[counter]; - attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD; + attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD; if (freq) { - attr.freq = 1; - attr.sample_freq = freq; + attr->freq = 1; + attr->sample_freq = freq; } - attr.mmap = track; - attr.comm = track; - attr.inherit = (cpu < 0) && inherit; + attr->mmap = track; + attr->comm = track; + attr->inherit = (cpu < 0) && inherit; track = 0; /* only the first counter needs these */ - fd[nr_cpu][counter] = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0); + fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); if (fd[nr_cpu][counter] < 0) { int err = errno; @@ -542,16 +533,14 @@ int cmd_record(int argc, const char **argv, const char *prefix) if (!argc && target_pid == -1 && !system_wide) usage_with_options(record_usage, options); - if (!nr_counters) { + if (!nr_counters) nr_counters = 1; - event_id[0] = 0; - } for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) + if (attrs[counter].sample_period) continue; - event_count[counter] = default_interval; + attrs[counter].sample_period = default_interval; } return __cmd_record(argc, argv); diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 4fc0d80440e..9711e552423 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -44,23 +44,22 @@ #include -static int system_wide = 0; -static int inherit = 1; +static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { -static __u64 default_event_id[MAX_COUNTERS] = { - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS }, - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES }, }; -static int default_interval = 100000; -static int event_count[MAX_COUNTERS]; +static int system_wide = 0; +static int inherit = 1; + static int fd[MAX_NR_CPUS][MAX_COUNTERS]; static int target_pid = -1; @@ -86,22 +85,16 @@ static __u64 walltime_nsecs; static void create_perfstat_counter(int counter) { - struct perf_counter_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.config = event_id[counter]; - attr.sample_type = 0; - attr.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL; - attr.exclude_user = event_mask[counter] & EVENT_MASK_USER; + struct perf_counter_attr *attr = attrs + counter; if (scale) - attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING; + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; if (system_wide) { int cpu; for (cpu = 0; cpu < nr_cpus; cpu ++) { - fd[cpu][counter] = 
sys_perf_counter_open(&attr, -1, cpu, -1, 0); + fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0); if (fd[cpu][counter] < 0) { printf("perfstat error: syscall returned with %d (%s)\n", fd[cpu][counter], strerror(errno)); @@ -109,10 +102,10 @@ static void create_perfstat_counter(int counter) } } } else { - attr.inherit = inherit; - attr.disabled = 1; + attr->inherit = inherit; + attr->disabled = 1; - fd[0][counter] = sys_perf_counter_open(&attr, 0, -1, -1, 0); + fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0); if (fd[0][counter] < 0) { printf("perfstat error: syscall returned with %d (%s)\n", fd[0][counter], strerror(errno)); @@ -126,9 +119,13 @@ static void create_perfstat_counter(int counter) */ static inline int nsec_counter(int counter) { - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK)) + if (attrs[counter].type != PERF_TYPE_SOFTWARE) + return 0; + + if (attrs[counter].config == PERF_COUNT_CPU_CLOCK) return 1; - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) + + if (attrs[counter].config == PERF_COUNT_TASK_CLOCK) return 1; return 0; @@ -177,7 +174,8 @@ static void read_counter(int counter) /* * Save the full runtime - to allow normalization during printout: */ - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) + if (attrs[counter].type == PERF_TYPE_SOFTWARE && + attrs[counter].config == PERF_COUNT_TASK_CLOCK) runtime_nsecs = count[0]; } @@ -203,8 +201,8 @@ static void print_counter(int counter) fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter)); - if (event_id[counter] == - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { + if (attrs[counter].type == PERF_TYPE_SOFTWARE && + attrs[counter].config == PERF_COUNT_TASK_CLOCK) { fprintf(stderr, " # %11.3f CPU utilization factor", (double)count[0] / (double)walltime_nsecs); @@ -300,8 +298,6 @@ static char events_help_msg[EVENTS_HELP_MAX]; static const struct option options[] = { OPT_CALLBACK('e', "event", NULL, "event", events_help_msg, parse_events), - OPT_INTEGER('c', "count", &default_interval, - "event period to sample"), OPT_BOOLEAN('i', "inherit", &inherit, "child tasks inherit counters"), OPT_INTEGER('p', "pid", &target_pid, @@ -315,27 +311,19 @@ static const struct option options[] = { int cmd_stat(int argc, const char **argv, const char *prefix) { - int counter; - page_size = sysconf(_SC_PAGE_SIZE); create_events_help(events_help_msg); - memcpy(event_id, default_event_id, sizeof(default_event_id)); + + memcpy(attrs, default_attrs, sizeof(attrs)); argc = parse_options(argc, argv, options, stat_usage, 0); if (!argc) usage_with_options(stat_usage, options); - if (!nr_counters) { + if (!nr_counters) nr_counters = 8; - } - - for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) - continue; - event_count[counter] = default_interval; - } nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); assert(nr_cpus <= MAX_NR_CPUS); assert(nr_cpus >= 0); diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index b2f480b5a13..98a6d53e17b 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -48,22 +48,11 @@ #include #include -static int system_wide = 0; +static int fd[MAX_NR_CPUS][MAX_COUNTERS]; -static __u64 default_event_id[MAX_COUNTERS] = { - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), - EID(PERF_TYPE_SOFTWARE, 
PERF_COUNT_PAGE_FAULTS), +static int system_wide = 0; - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), -}; -static int default_interval = 100000; -static int event_count[MAX_COUNTERS]; -static int fd[MAX_NR_CPUS][MAX_COUNTERS]; +static int default_interval = 100000; static __u64 count_filter = 5; static int print_entries = 15; @@ -85,15 +74,6 @@ static int delay_secs = 2; static int zero; static int dump_symtab; -static const unsigned int default_count[] = { - 1000000, - 1000000, - 10000, - 10000, - 1000000, - 10000, -}; - /* * Symbols */ @@ -112,7 +92,7 @@ struct sym_entry { struct sym_entry *sym_filter_entry; -struct dso *kernel_dso; +struct dso *kernel_dso; /* * Symbols will be added here in record_ip and will get out @@ -213,7 +193,7 @@ static void print_sym_table(void) 100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec))); if (nr_counters == 1) { - printf("%d", event_count[0]); + printf("%Ld", attrs[0].sample_period); if (freq) printf("Hz "); else @@ -421,10 +401,10 @@ static void process_event(uint64_t ip, int counter) } struct mmap_data { - int counter; - void *base; - unsigned int mask; - unsigned int prev; + int counter; + void *base; + unsigned int mask; + unsigned int prev; }; static unsigned int mmap_read_head(struct mmap_data *md) @@ -539,7 +519,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; static int __cmd_top(void) { - struct perf_counter_attr attr; + struct perf_counter_attr *attr; pthread_t thread; int i, counter, group_fd, nr_poll = 0; unsigned int cpu; @@ -553,13 +533,12 @@ static int __cmd_top(void) if (target_pid == -1 && profile_cpu == -1) cpu = i; - memset(&attr, 0, sizeof(attr)); - attr.config = event_id[counter]; - attr.sample_period = event_count[counter]; - attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - attr.freq = freq; + attr = attrs + counter; - fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0); + attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; + attr->freq = freq; + + fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0); if (fd[i][counter] < 0) { int err = errno; @@ -670,7 +649,6 @@ int cmd_top(int argc, const char **argv, const char *prefix) page_size = sysconf(_SC_PAGE_SIZE); create_events_help(events_help_msg); - memcpy(event_id, default_event_id, sizeof(default_event_id)); argc = parse_options(argc, argv, options, top_usage, 0); if (argc) @@ -688,19 +666,22 @@ int cmd_top(int argc, const char **argv, const char *prefix) profile_cpu = -1; } - if (!nr_counters) { + if (!nr_counters) nr_counters = 1; - event_id[0] = 0; - } if (delay_secs < 1) delay_secs = 1; + parse_symbols(); + + /* + * Fill in the ones not specifically initialized via -c: + */ for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) + if (attrs[counter].sample_period) continue; - event_count[counter] = default_interval; + attrs[counter].sample_period = default_interval; } nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); @@ -710,7 +691,5 @@ int cmd_top(int argc, const char **argv, const char *prefix) if (target_pid != -1 || profile_cpu != -1) nr_cpus = 1; - parse_symbols(); - return __cmd_top(); } diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h index 10622a48b40..af0a5046d74 100644 --- a/Documentation/perf_counter/perf.h +++ b/Documentation/perf_counter/perf.h @@ -64,6 +64,4 @@ 
sys_perf_counter_open(struct perf_counter_attr *attr_uptr, #define MAX_COUNTERS 256 #define MAX_NR_CPUS 256 -#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) - #endif diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c index 2fdfd1d923f..eb56bd99657 100644 --- a/Documentation/perf_counter/util/parse-events.c +++ b/Documentation/perf_counter/util/parse-events.c @@ -6,37 +6,39 @@ #include "exec_cmd.h" #include "string.h" -int nr_counters; +int nr_counters; -__u64 event_id[MAX_COUNTERS] = { }; -int event_mask[MAX_COUNTERS]; +struct perf_counter_attr attrs[MAX_COUNTERS]; struct event_symbol { - __u64 event; - char *symbol; + __u8 type; + __u64 config; + char *symbol; }; +#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y + static struct event_symbol event_symbols[] = { - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, - - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, + { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, + { C(HARDWARE, CPU_CYCLES), "cycles", }, + { C(HARDWARE, INSTRUCTIONS), "instructions", }, + { C(HARDWARE, CACHE_REFERENCES), "cache-references", }, + { C(HARDWARE, CACHE_MISSES), "cache-misses", }, + { C(HARDWARE, BRANCH_INSTRUCTIONS), "branch-instructions", }, + { C(HARDWARE, BRANCH_INSTRUCTIONS), "branches", }, + { C(HARDWARE, BRANCH_MISSES), "branch-misses", }, + { C(HARDWARE, BUS_CYCLES), "bus-cycles", }, + + { C(SOFTWARE, CPU_CLOCK), "cpu-clock", }, + { C(SOFTWARE, TASK_CLOCK), "task-clock", }, + { C(SOFTWARE, PAGE_FAULTS), "page-faults", }, + { C(SOFTWARE, PAGE_FAULTS), "faults", }, + { C(SOFTWARE, PAGE_FAULTS_MIN), "minor-faults", }, + { C(SOFTWARE, PAGE_FAULTS_MAJ), "major-faults", }, + { C(SOFTWARE, CONTEXT_SWITCHES), "context-switches", }, + { C(SOFTWARE, CONTEXT_SWITCHES), "cs", }, + { C(SOFTWARE, CPU_MIGRATIONS), "cpu-migrations", }, + { C(SOFTWARE, CPU_MIGRATIONS), "migrations", }, }; #define __PERF_COUNTER_FIELD(config, name) \ @@ -67,27 +69,26 @@ static char *sw_event_names[] = { "major faults", }; -char *event_name(int ctr) +char *event_name(int counter) { - __u64 config = event_id[ctr]; - int type = PERF_COUNTER_TYPE(config); - int id = PERF_COUNTER_ID(config); + __u64 config = attrs[counter].config; + int type = 
attrs[counter].type; static char buf[32]; - if (PERF_COUNTER_RAW(config)) { - sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); + if (attrs[counter].type == PERF_TYPE_RAW) { + sprintf(buf, "raw 0x%llx", config); return buf; } switch (type) { case PERF_TYPE_HARDWARE: - if (id < PERF_HW_EVENTS_MAX) - return hw_event_names[id]; + if (config < PERF_HW_EVENTS_MAX) + return hw_event_names[config]; return "unknown-hardware"; case PERF_TYPE_SOFTWARE: - if (id < PERF_SW_EVENTS_MAX) - return sw_event_names[id]; + if (config < PERF_SW_EVENTS_MAX) + return sw_event_names[config]; return "unknown-software"; default: @@ -101,15 +102,19 @@ char *event_name(int ctr) * Each event can have multiple symbolic names. * Symbolic names are (almost) exactly matched. */ -static __u64 match_event_symbols(const char *str) +static int match_event_symbols(const char *str, struct perf_counter_attr *attr) { __u64 config, id; int type; unsigned int i; const char *sep, *pstr; - if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) - return config | PERF_COUNTER_RAW_MASK; + if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) { + attr->type = PERF_TYPE_RAW; + attr->config = config; + + return 0; + } pstr = str; sep = strchr(pstr, ':'); @@ -121,35 +126,45 @@ static __u64 match_event_symbols(const char *str) if (sep) { pstr = sep + 1; if (strchr(pstr, 'k')) - event_mask[nr_counters] |= EVENT_MASK_USER; + attr->exclude_user = 1; if (strchr(pstr, 'u')) - event_mask[nr_counters] |= EVENT_MASK_KERNEL; + attr->exclude_kernel = 1; } - return EID(type, id); + attr->type = type; + attr->config = id; + + return 0; } for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { if (!strncmp(str, event_symbols[i].symbol, - strlen(event_symbols[i].symbol))) - return event_symbols[i].event; + strlen(event_symbols[i].symbol))) { + + attr->type = event_symbols[i].type; + attr->config = event_symbols[i].config; + + return 0; + } } - return ~0ULL; + return -EINVAL; } int parse_events(const struct option *opt, const char *str, int unset) { - __u64 config; + struct perf_counter_attr attr; + int ret; + memset(&attr, 0, sizeof(attr)); again: if (nr_counters == MAX_COUNTERS) return -1; - config = match_event_symbols(str); - if (config == ~0ULL) - return -1; + ret = match_event_symbols(str, &attr); + if (ret < 0) + return ret; - event_id[nr_counters] = config; + attrs[nr_counters] = attr; nr_counters++; str = strstr(str, ","); @@ -168,7 +183,6 @@ void create_events_help(char *events_help_msg) { unsigned int i; char *str; - __u64 e; str = events_help_msg; @@ -178,9 +192,8 @@ void create_events_help(char *events_help_msg) for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { int type, id; - e = event_symbols[i].event; - type = PERF_COUNTER_TYPE(e); - id = PERF_COUNTER_ID(e); + type = event_symbols[i].type; + id = event_symbols[i].config; if (i) str += sprintf(str, "|"); @@ -191,4 +204,3 @@ void create_events_help(char *events_help_msg) str += sprintf(str, "|rNNN]"); } - diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h index 0da306bb902..542971c495b 100644 --- a/Documentation/perf_counter/util/parse-events.h +++ b/Documentation/perf_counter/util/parse-events.h @@ -3,12 +3,9 @@ * Parse symbolic events/counts passed in as options: */ -extern int nr_counters; -extern __u64 event_id[MAX_COUNTERS]; -extern int event_mask[MAX_COUNTERS]; +extern int nr_counters; -#define EVENT_MASK_KERNEL 1 -#define EVENT_MASK_USER 2 +extern struct perf_counter_attr attrs[MAX_COUNTERS]; extern char *event_name(int ctr); diff 
--git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 232b00a36f7..4786ad9a288 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -867,13 +867,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->attr)) { - ev = perf_event_id(&counter->attr); + if (counter->attr.type != PERF_TYPE_RAW) { + ev = counter->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->attr); + ev = counter->attr.config; } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8f53f3a7da2..430e048f285 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(attr)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); } else { - if (perf_event_id(attr) >= x86_pmu.max_events) + if (attr->config >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(attr)); + hwc->config |= x86_pmu.event_map(attr->config); } counter->destroy = hw_perf_counter_destroy; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4f9d39ecdc0..f794c69b34c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,26 +73,6 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -#define __PERF_COUNTER_MASK(name) \ - (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ - PERF_COUNTER_##name##_SHIFT) - -#define PERF_COUNTER_RAW_BITS 1 -#define PERF_COUNTER_RAW_SHIFT 63 -#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) - -#define PERF_COUNTER_CONFIG_BITS 63 -#define PERF_COUNTER_CONFIG_SHIFT 0 -#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) - -#define PERF_COUNTER_TYPE_BITS 7 -#define PERF_COUNTER_TYPE_SHIFT 56 -#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) - -#define PERF_COUNTER_EVENT_BITS 56 -#define PERF_COUNTER_EVENT_SHIFT 0 -#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) - /* * Bits that can be set in attr.sample_type to request information * in the overflow packets. @@ -125,10 +105,13 @@ enum perf_counter_read_format { */ struct perf_counter_attr { /* - * The MSB of the config word signifies if the rest contains cpu - * specific (raw) counter configuration data, if unset, the next - * 7 bits are an event type and the rest of the bits are the event - * identifier. + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + __u32 __reserved_1; + + /* + * Type specific configuration information. 
*/ __u64 config; @@ -152,12 +135,11 @@ struct perf_counter_attr { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 53; + __reserved_2 : 53; __u32 wakeup_events; /* wakeup every n events */ - __u32 __reserved_2; + __u32 __reserved_3; - __u64 __reserved_3; __u64 __reserved_4; }; @@ -278,8 +260,8 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; - * u32 pid, ppid; + * struct perf_event_header header; + * u32 pid, ppid; * }; */ PERF_EVENT_FORK = 7, @@ -331,27 +313,6 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_RAW_MASK; -} - -static inline u64 perf_event_config(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_CONFIG_MASK; -} - -static inline u64 perf_event_type(struct perf_counter_attr *attr) -{ - return (attr->config & PERF_COUNTER_TYPE_MASK) >> - PERF_COUNTER_TYPE_SHIFT; -} - -static inline u64 perf_event_id(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_EVENT_MASK; -} - /** * struct hw_perf_counter - performance counter hardware details: */ @@ -616,8 +577,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->attr) && - perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; + return (counter->attr.type != PERF_TYPE_RAW) && + (counter->attr.type != PERF_TYPE_HARDWARE); } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 47c92fb927f..75ae76796df 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3091,14 +3091,12 @@ static int perf_swcounter_match(struct perf_counter *counter, enum perf_event_types type, u32 event, struct pt_regs *regs) { - u64 event_config; - - event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event; - if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->attr.config != event_config) + if (counter->attr.type != type) + return 0; + if (counter->attr.config != event) return 0; if (regs) { @@ -3403,7 +3401,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->attr)) { + switch (counter->attr.config) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3496,12 +3494,12 @@ perf_counter_alloc(struct perf_counter_attr *attr, if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(attr)) { + if (attr->type == PERF_TYPE_RAW) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(attr)) { + switch (attr->type) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; -- cgit v1.2.3-70-g09d2 From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2009 20:22:46 +0200 Subject: perf_counter: Implement generalized cache event types Extend generic event enumeration with the PERF_TYPE_HW_CACHE method. This is a 3-dimensional space: { L1-D, L1-I, L2, ITLB, DTLB, BPU } x { load, store, prefetch } x { accesses, misses } User-space passes in the 3 coordinates and the kernel provides a counter. (if the hardware supports that type and if the combination makes sense.) Combinations that make no sense produce a -EINVAL. 
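As a hedged illustration of the coordinate packing (the shift layout is taken from the parse-events helper added below; the L1D and RESULT_MISS identifier names are assumed here rather than quoted from this patch), an L1-data read-miss counter would be requested roughly as:

	struct perf_counter_attr attr = {
		.type	= PERF_TYPE_HW_CACHE,
		/* coordinates: cache unit | (op << 8) | (result << 16) */
		.config	= PERF_COUNT_HW_CACHE_L1D		 |
			  (PERF_COUNT_HW_CACHE_OP_READ	  <<  8) |
			  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
	};
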
Combinations that are not supported by the hardware produce -ENOTSUP. Extend the tools to deal with this, and rewrite the event symbol parsing code with various popular aliases for the units and access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are both valid aliases. ( x86 is supported for now, with the Nehalem event table filled in, and with Core2 and Atom having placeholder tables. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/util/parse-events.c | 104 ++++++++++++- arch/x86/kernel/cpu/perf_counter.c | 201 ++++++++++++++++++++++++- include/linux/perf_counter.h | 34 +++++ kernel/perf_counter.c | 1 + 4 files changed, 329 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c index eb56bd99657..de9a77c4715 100644 --- a/Documentation/perf_counter/util/parse-events.c +++ b/Documentation/perf_counter/util/parse-events.c @@ -6,6 +6,8 @@ #include "exec_cmd.h" #include "string.h" +extern char *strcasestr(const char *haystack, const char *needle); + int nr_counters; struct perf_counter_attr attrs[MAX_COUNTERS]; @@ -17,6 +19,7 @@ struct event_symbol { }; #define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y +#define CR(x, y) .type = PERF_TYPE_##x, .config = y static struct event_symbol event_symbols[] = { { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, @@ -69,6 +72,28 @@ static char *sw_event_names[] = { "major faults", }; +#define MAX_ALIASES 8 + +static char *hw_cache [][MAX_ALIASES] = { + { "l1-d" , "l1d" , "l1", "l1-data-cache" }, + { "l1-i" , "l1i" , "l1-instruction-cache" }, + { "l2" , }, + { "dtlb", }, + { "itlb", }, + { "bpu" , "btb", "branch-cache", NULL }, +}; + +static char *hw_cache_op [][MAX_ALIASES] = { + { "read" , "load" }, + { "write" , "store" }, + { "prefetch" , "speculative-read", "speculative-load" }, +}; + +static char *hw_cache_result [][MAX_ALIASES] = { + { "access", "ops" }, + { "miss", }, +}; + char *event_name(int counter) { __u64 config = attrs[counter].config; @@ -86,6 +111,30 @@ char *event_name(int counter) return hw_event_names[config]; return "unknown-hardware"; + case PERF_TYPE_HW_CACHE: { + __u8 cache_type, cache_op, cache_result; + static char name[100]; + + cache_type = (config >> 0) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_MAX) + return "unknown-ext-hardware-cache-type"; + + cache_op = (config >> 8) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_OP_MAX) + return "unknown-ext-hardware-cache-op-type"; + + cache_result = (config >> 16) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_RESULT_MAX) + return "unknown-ext-hardware-cache-result-type"; + + sprintf(name, "%s:%s:%s", + hw_cache[cache_type][0], + hw_cache_op[cache_op][0], + hw_cache_result[cache_result][0]); + + return name; + } + case PERF_TYPE_SOFTWARE: if (config < PERF_SW_EVENTS_MAX) return sw_event_names[config]; @@ -98,11 +147,60 @@ char *event_name(int counter) return "unknown"; } +static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size) +{ + int i, j; + + for (i = 0; i < size; i++) { + for (j = 0; j < MAX_ALIASES; j++) { + if (!names[i][j]) + break; + if (strcasestr(str, names[i][j])) + return i; + } + } + + return 0; +} + +static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr) +{ + __u8 cache_type = -1, cache_op = 0, cache_result = 0; + + cache_type = 
parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX); + /* + * No fallback - if we cannot get a clear cache type + * then bail out: + */ + if (cache_type == -1) + return -EINVAL; + + cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX); + /* + * Fall back to reads: + */ + if (cache_type == -1) + cache_type = PERF_COUNT_HW_CACHE_OP_READ; + + cache_result = parse_aliases(str, hw_cache_result, + PERF_COUNT_HW_CACHE_RESULT_MAX); + /* + * Fall back to accesses: + */ + if (cache_result == -1) + cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS; + + attr->config = cache_type | (cache_op << 8) | (cache_result << 16); + attr->type = PERF_TYPE_HW_CACHE; + + return 0; +} + /* * Each event can have multiple symbolic names. * Symbolic names are (almost) exactly matched. */ -static int match_event_symbols(const char *str, struct perf_counter_attr *attr) +static int parse_event_symbols(const char *str, struct perf_counter_attr *attr) { __u64 config, id; int type; @@ -147,7 +245,7 @@ static int match_event_symbols(const char *str, struct perf_counter_attr *attr) } } - return -EINVAL; + return parse_generic_hw_symbols(str, attr); } int parse_events(const struct option *opt, const char *str, int unset) @@ -160,7 +258,7 @@ again: if (nr_counters == MAX_COUNTERS) return -1; - ret = match_event_symbols(str, &attr); + ret = parse_event_symbols(str, &attr); if (ret < 0) return ret; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 430e048f285..e86679fa521 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. 
+ */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ + [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL @@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void) return x86_pmu.handle_irq != NULL; } +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + 
return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + /* * Setup the hardware configuration for a given attr_type */ @@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->sample_period); + counter->destroy = hw_perf_counter_destroy; /* * Raw event type provide the config in the event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); - } else { - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - /* - * The generic map: - */ - hwc->config |= x86_pmu.event_map(attr->config); + return 0; } - counter->destroy = hw_perf_counter_destroy; + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + /* + * The generic map: + */ + hwc->config |= x86_pmu.event_map(attr->config); return 0; } @@ -989,6 +1147,33 @@ static int intel_pmu_init(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + /* + * Nehalem: + */ + switch (boot_cpu_data.x86_model) { + case 17: + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Core2 event tables\n"); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Nehalem/Corei7 event tables\n"); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Atom event tables\n"); + break; + } return 0; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f794c69b34c..3586df840f6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -28,6 +28,7 @@ enum perf_event_types { PERF_TYPE_HARDWARE = 0, PERF_TYPE_SOFTWARE = 1, PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, /* * available TYPE space, raw is the max value. @@ -55,6 +56,39 @@ enum attr_ids { PERF_HW_EVENTS_MAX = 7, }; +/* + * Generalized hardware cache counters: + * + * { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum hw_cache_id { + PERF_COUNT_HW_CACHE_L1D, + PERF_COUNT_HW_CACHE_L1I, + PERF_COUNT_HW_CACHE_L2, + PERF_COUNT_HW_CACHE_DTLB, + PERF_COUNT_HW_CACHE_ITLB, + PERF_COUNT_HW_CACHE_BPU, + + PERF_COUNT_HW_CACHE_MAX, +}; + +enum hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ, + PERF_COUNT_HW_CACHE_OP_WRITE, + PERF_COUNT_HW_CACHE_OP_PREFETCH, + + PERF_COUNT_HW_CACHE_OP_MAX, +}; + +enum hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS, + PERF_COUNT_HW_CACHE_RESULT_MISS, + + PERF_COUNT_HW_CACHE_RESULT_MAX, +}; + /* * Special "software" counters provided by the kernel, even if the hardware * does not support performance counters. 
These counters measure various diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 75ae76796df..5eacaaf3f9c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3501,6 +3501,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, switch (attr->type) { case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: pmu = hw_perf_counter_init(counter); break; -- cgit v1.2.3-70-g09d2 From 0312af84164215a452f2a94957ebd9bce86e0204 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 07:42:04 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add Core2 support Fill in core2_hw_cache_event_id[] with the Core2 model specific events. The events can be used in all the tools via the -e (--event) parameter, for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses". Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e86679fa521..b1f71ff5025 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -194,7 +194,90 @@ static const u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - /* To be filled in */ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ 
C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static const u64 atom_hw_cache_event_ids -- cgit v1.2.3-70-g09d2 From ad689220614b6c7c0b13b70d742f358e9310e71e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 09:30:41 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add Atom support Fill in core2_hw_cache_event_id[] with the Atom model specific events. The events can be used in all the tools via the -e (--event) parameter, for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses". ( Note: these are straight from the Intel manuals - not tested yet.) Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b1f71ff5025..71590e09d16 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,7 +285,90 @@ static const u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - /* To be filled in */ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 intel_pmu_raw_event(u64 event) -- cgit v1.2.3-70-g09d2 From 1123e3ad73697d64ad99f0104bbe49f8b52d7d65 Mon Sep 17 00:00:00 
2001 From: Ingo Molnar Date: Fri, 29 May 2009 11:25:09 +0200 Subject: perf_counter: Clean up x86 boot messages Standardize and tidy up all the messages we print during perfcounter initialization. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 46 ++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 71590e09d16..0339d195a3f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1298,23 +1298,22 @@ static int intel_pmu_init(void) if (version < 2) return -ENODEV; - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; + x86_pmu = intel_pmu; + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; /* * Quirk: v2 perfmon does not report fixed-purpose counters, so * assume at least 3 counters: */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); /* - * Nehalem: + * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { case 17: @@ -1322,7 +1321,7 @@ static int intel_pmu_init(void) sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Core2 event tables\n"); + pr_cont("Core2 events, "); break; default: case 26: @@ -1330,14 +1329,14 @@ static int intel_pmu_init(void) sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Nehalem/Corei7 event tables\n"); + pr_cont("Nehalem/Corei7 events, "); break; case 28: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Atom event tables\n"); + pr_cont("Atom events, "); break; } return 0; @@ -1353,6 +1352,8 @@ void __init init_hw_perf_counters(void) { int err; + pr_info("Performance Counters: "); + switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); @@ -1363,14 +1364,13 @@ void __init init_hw_perf_counters(void) default: return; } - if (err != 0) + if (err != 0) { + pr_cont("no PMU driver, software counters only.\n"); return; + } - pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_cont("%s PMU driver.\n", x86_pmu.name); - pr_info("... num counters: %d\n", x86_pmu.num_counters); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -1379,23 +1379,25 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << x86_pmu.num_counters) - 1; perf_max_counters = x86_pmu.num_counters; - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... 
max period: %016Lx\n", x86_pmu.max_period); - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); } - pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed); perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... generic counters: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); } static inline void x86_pmu_read(struct perf_counter *counter) -- cgit v1.2.3-70-g09d2 From f86748e91a14bd6cc49477560f33ed5d59896e89 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 22:33:10 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add AMD support Fill in amd_hw_cache_event_id[] with the AMD CPU specific events, for family 0x0f, 0x10 and 0x11. There's apparently no distinction between load and store events, so we only fill in the load events. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 102 +++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0339d195a3f..93af821ebe5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -389,6 +389,97 @@ static u64 intel_pmu_raw_event(u64 event) return event & CORE_EVNTSEL_MASK; } +static const u64 amd_0f_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. 
fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + /* * AMD Performance Monitor K7 and later. */ @@ -1345,6 +1436,17 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; + + switch (boot_cpu_data.x86) { + case 0x0f: + case 0x10: + case 0x11: + memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("AMD Family 0f/10/11 events, "); + break; + } return 0; } -- cgit v1.2.3-70-g09d2 From 820a644211bc1ac7715333abdb0f0b9ea4fbb549 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 19:10:25 +0200 Subject: perf_counter, x86: Clean up hw_cache_event ids copies Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 93af821ebe5..56001feeffc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1409,23 +1409,20 @@ static int intel_pmu_init(void) switch (boot_cpu_data.x86_model) { case 17: memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Nehalem/Corei7 events, "); break; case 28: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Atom events, "); break; -- cgit v1.2.3-70-g09d2 From fecc8ac8496fce96069724f54daba8e7078b0082 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Tue, 9 Jun 2009 21:15:53 +0800 Subject: perf_counter, x86: Correct some event and umask values for Intel processors Correct some event and UMASK values according to Intel SDM, in the Nehalem and Atom tables. 
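As a decoding aid for the values being corrected: each non-zero table entry is a raw event-select encoding, with the event code in the low byte and the unit mask (UMASK) in the next byte, consistent with the CORE_EVNTSEL_EVENT_MASK definition earlier in this file. A short sketch (the CACHE_EVENT_ID helper is illustrative, not something the patch adds):

    /* Event-select layout used by the hw_cache_event_ids tables. */
    #define CACHE_EVENT_ID(event, umask)    ((event) | ((umask) << 8))

    /*
     * Examples from the corrections below:
     *   0x0380 == CACHE_EVENT_ID(0x80, 0x03)   L1I.READS
     *   0x2140 == CACHE_EVENT_ID(0x40, 0x21)   L1D_CACHE_LD.MESI (Atom alias)
     */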
Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090609131553.GA12489@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 56001feeffc..40978aac6e0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -119,7 +119,7 @@ static const u64 nehalem_hw_cache_event_ids }, [ C(L1I ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ }, [ C(OP_WRITE) ] = { @@ -162,7 +162,7 @@ static const u64 nehalem_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -291,7 +291,7 @@ static const u64 atom_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST */ + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -301,8 +301,8 @@ static const u64 atom_hw_cache_event_ids }, [ C(L1I ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -329,11 +329,11 @@ static const u64 atom_hw_cache_event_ids }, [ C(DTLB) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ }, [ C(OP_PREFETCH) ] = { -- cgit v1.2.3-70-g09d2 From dc81081b2d9a6a9d64dad1bef1e5fc9fb660e53e Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 10 Jun 2009 17:06:12 +0800 Subject: perf_counter/x86: Fix the model number of Intel Core2 processors Fix the model number of Intel Core2 processors according to the documentation: Intel Processor Identification with the CPUID Instruction: http://www.intel.com/support/processors/sb/cs-009861.htm Signed-off-by: Yong Wang Also-Reported-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090610090612.GA26580@ywang-moblin2.bj.intel.com> [ Added two more model numbers suggested by Arnd Bergmann ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 40978aac6e0..49f258537cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1407,7 +1407,10 @@ static int intel_pmu_init(void) * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { - case 17: + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm 
celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.2.3-70-g09d2 From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 13:40:57 +0200 Subject: perf_counter: More aggressive frequency adjustment Also employ the overflow handler to adjust the frequency, this results in a stable frequency in about 40~50 samples, instead of that many ticks. This also means we can start sampling at a sample period of 1 without running head-first into the throttle. It relies on sched_clock() to accurately measure the time difference between the overflow NMIs. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 130 +++++++++++++++++++++++++------------ 3 files changed, 92 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 49f258537cb..240ca563063 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (!hwc->sample_period) + if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } - atomic64_set(&hwc->period_left, hwc->sample_period); counter->destroy = hw_perf_counter_destroy; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3586df840f6..282d8cc4898 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -371,6 +371,7 @@ struct hw_perf_counter { u64 freq_count; u64 freq_interrupts; + u64 freq_stamp; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5eacaaf3f9c..51c571ee4d0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1184,13 +1184,33 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); -static void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_adjust_period(struct perf_counter *counter, u64 events) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, counter->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + perf_log_period(counter, sample_period); + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; struct hw_perf_counter *hwc; - u64 interrupts, sample_period; - u64 events, period, freq; - s64 delta; + u64 interrupts, freq; spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -1202,6 +1222,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = hwc->interrupts; hwc->interrupts = 0; + /* + * unthrottle 
counters on the tick + */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); @@ -1211,6 +1234,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (!counter->attr.freq || !counter->attr.sample_freq) continue; + /* + * if the specified freq < HZ then we need to skip ticks + */ if (counter->attr.sample_freq < HZ) { freq = counter->attr.sample_freq; @@ -1226,20 +1252,20 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) } else freq = HZ; - events = freq * interrupts * hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(1 + period - hwc->sample_period); - delta >>= 1; - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; + perf_adjust_period(counter, freq * interrupts); - perf_log_period(counter, sample_period); - - hwc->sample_period = sample_period; + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + counter->pmu->disable(counter); + atomic_set(&hwc->period_left, 0); + counter->pmu->enable(counter); + perf_enable(); + } } spin_unlock(&ctx->lock); } @@ -1279,9 +1305,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = curr->perf_counter_ctxp; - perf_adjust_freq(&cpuctx->ctx); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) - perf_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); if (ctx) @@ -1647,10 +1673,10 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->attr.sample_freq = value; } else { + perf_log_period(counter, value); + counter->attr.sample_period = value; counter->hw.sample_period = value; - - perf_log_period(counter, value); } unlock: spin_unlock_irq(&ctx->lock); @@ -2853,35 +2879,41 @@ void __perf_counter_mmap(struct vm_area_struct *vma) * event flow. 
*/ +struct freq_event { + struct perf_event_header header; + u64 time; + u64 id; + u64 period; +}; + static void perf_log_period(struct perf_counter *counter, u64 period) { struct perf_output_handle handle; + struct freq_event event; int ret; - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; - } freq_event = { + if (counter->hw.sample_period == period) + return; + + if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) + return; + + event = (struct freq_event) { .header = { .type = PERF_EVENT_PERIOD, .misc = 0, - .size = sizeof(freq_event), + .size = sizeof(event), }, .time = sched_clock(), .id = counter->id, .period = period, }; - if (counter->hw.sample_period == period) - return; - - ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); if (ret) return; - perf_output_put(&handle, freq_event); + perf_output_put(&handle, event); perf_output_end(&handle); } @@ -2923,15 +2955,16 @@ int perf_counter_overflow(struct perf_counter *counter, { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; + struct hw_perf_counter *hwc = &counter->hw; int ret = 0; if (!throttle) { - counter->hw.interrupts++; + hwc->interrupts++; } else { - if (counter->hw.interrupts != MAX_INTERRUPTS) { - counter->hw.interrupts++; - if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { - counter->hw.interrupts = MAX_INTERRUPTS; + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) { + hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(counter, 0); ret = 1; } @@ -2945,6 +2978,16 @@ int perf_counter_overflow(struct perf_counter *counter, } } + if (counter->attr.freq) { + u64 now = sched_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + } + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -3379,7 +3422,6 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->attr.sample_period; return &perf_ops_generic; } @@ -3483,10 +3525,11 @@ perf_counter_alloc(struct perf_counter_attr *attr, pmu = NULL; hwc = &counter->hw; + hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); - else - hwc->sample_period = attr->sample_period; + hwc->sample_period = 1; + + atomic64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters @@ -3687,6 +3730,9 @@ inherit_counter(struct perf_counter *parent_counter, else child_counter->state = PERF_COUNTER_STATE_OFF; + if (parent_counter->attr.freq) + child_counter->hw.sample_period = parent_counter->hw.sample_period; + /* * Link it up in the child's context: */ -- cgit v1.2.3-70-g09d2 From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. 
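In practice each overflow path now fills one perf_sample_data on the stack and passes it down, so adding a sample field later means touching the struct and its consumers rather than every perf_counter_overflow() call site. The call pattern, as the hunks below use it (sketch only, field values depend on the PMU):

    struct perf_sample_data data = {
            .regs = regs,   /* interrupted register state */
            .addr = 0,      /* data address, when the PMU can supply one */
    };

    if (perf_counter_overflow(counter, 1, &data)) {
            /* non-zero return: stop/disable this counter */
    }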
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 10 +++++++--- arch/x86/kernel/cpu/perf_counter.c | 15 +++++++++++---- include/linux/perf_counter.h | 10 ++++++++-- kernel/perf_counter.c | 38 ++++++++++++++++++++++---------------- 4 files changed, 48 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 4786ad9a288..5e0bf399c43 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -1001,7 +1001,11 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) { - addr = 0; + struct perf_sample_data data = { + .regs = regs, + .addr = 0, + }; + if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { /* * The user wants a data address recorded. @@ -1016,9 +1020,9 @@ static void record_and_restart(struct perf_counter *counter, long val, sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) - addr = mfspr(SPRN_SDAR); + data.addr = mfspr(SPRN_SDAR); } - if (perf_counter_overflow(counter, nmi, regs, addr)) { + if (perf_counter_overflow(counter, nmi, &data)) { /* * Interrupts are coming too fast - throttle them * by setting the counter to 0, so it will be diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 240ca563063..82a23d487f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void) */ static int intel_pmu_handle_irq(struct pt_regs *regs) { + struct perf_sample_data data; struct cpu_hw_counters *cpuc; - struct cpu_hw_counters; int bit, cpu, loops; u64 ack, status; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1210,7 +1213,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -1230,12 +1233,16 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { - int cpu, idx, handled = 0; + struct perf_sample_data data; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; + int cpu, idx, handled = 0; u64 val; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) amd_pmu_disable_counter(hwc, idx); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 282d8cc4898..d8c0eb480f9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -605,8 +605,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr); +struct perf_sample_data { + struct pt_regs *regs; + u64 addr; +}; + +extern int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct 
perf_sample_data *data); + /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ae591a1275a..4fe85e804f4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2378,8 +2378,8 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +static void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int ret; u64 sample_type = counter->attr.sample_type; @@ -2404,10 +2404,10 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= perf_misc_flags(regs); + header.misc |= perf_misc_flags(data->regs); if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(regs); + ip = perf_instruction_pointer(data->regs); header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } @@ -2460,7 +2460,7 @@ static void perf_counter_output(struct perf_counter *counter, } if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(regs); + callchain = perf_callchain(data->regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); @@ -2486,7 +2486,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, time); if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, addr); + perf_output_put(&handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(&handle, counter->id); @@ -2950,8 +2950,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) * Generic counter overflow handling. */ -int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; @@ -3005,7 +3005,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs, addr); + perf_counter_output(counter, nmi, data); return ret; } @@ -3054,24 +3054,25 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; struct perf_counter *counter; - struct pt_regs *regs; u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); - regs = get_irq_regs(); + data.addr = 0; + data.regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. 
*/ - if ((counter->attr.exclude_kernel || !regs) && + if ((counter->attr.exclude_kernel || !data.regs) && !counter->attr.exclude_user) - regs = task_pt_regs(current); + data.regs = task_pt_regs(current); - if (regs) { - if (perf_counter_overflow(counter, 0, regs, 0)) + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) ret = HRTIMER_NORESTART; } @@ -3084,9 +3085,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + }; + perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs, addr)) + if (perf_counter_overflow(counter, nmi, &data)) /* soft-disable the counter */ ; -- cgit v1.2.3-70-g09d2 From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:34:59 +0200 Subject: perf_counter: Accurate period data We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is incorrect. When we adjust the period, it will only take effect the next cycle but report it for the current cycle. So when we adjust the period for every cycle, we're always wrong. Solve this by keeping track of the last_period. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 9 ++++++--- arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++++++--- include/linux/perf_counter.h | 6 ++++-- kernel/perf_counter.c | 9 ++++++--- 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5e0bf399c43..4990ce2e5f0 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -767,6 +767,7 @@ static void power_pmu_unthrottle(struct perf_counter *counter) perf_disable(); power_pmu_read(counter); left = counter->hw.sample_period; + counter->hw.last_period = left; val = 0; if (left < 0x80000000L) val = 0x80000000L - left; @@ -937,7 +938,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw.sample_period); + counter->hw.last_period = counter->hw.sample_period; + atomic64_set(&counter->hw.period_left, counter->hw.last_period); /* * See if we need to reserve the PMU. 
@@ -1002,8 +1004,9 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { struct perf_sample_data data = { - .regs = regs, - .addr = 0, + .regs = regs, + .addr = 0, + .period = counter->hw.last_period, }; if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82a23d487f9..57ae1bec81b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); } @@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } /* @@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; - /* counter overflow */ - handled = 1; - inc_irq_stat(apic_perf_irqs); + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; @@ -1267,6 +1273,9 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) amd_pmu_disable_counter(hwc, idx); } + if (handled) + inc_irq_stat(apic_perf_irqs); + return handled; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d8c0eb480f9..5b966472b45 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -366,6 +366,7 @@ struct hw_perf_counter { }; atomic64_t prev_count; u64 sample_period; + u64 last_period; atomic64_t period_left; u64 interrupts; @@ -606,8 +607,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; - u64 addr; + struct pt_regs *regs; + u64 addr; + u64 period; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4fe85e804f4..8b89b40bd0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2495,7 +2495,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, perf_output_put(&handle, cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, counter->hw.sample_period); + perf_output_put(&handle, data->period); /* * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 
@@ -3040,11 +3040,13 @@ static void perf_swcounter_set_period(struct perf_counter *counter) if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; } if (unlikely(left <= 0)) { left += period; atomic64_add(period, &hwc->period_left); + hwc->last_period = period; } atomic64_set(&hwc->prev_count, -left); @@ -3086,8 +3088,9 @@ static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { - .regs = regs, - .addr = addr, + .regs = regs, + .addr = addr, + .period = counter->hw.last_period, }; perf_swcounter_update(counter); -- cgit v1.2.3-70-g09d2 From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:06:28 +0200 Subject: perf_counter: Standardize event names Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/power4-pmu.c | 12 +++++------ arch/powerpc/kernel/power5+-pmu.c | 12 +++++------ arch/powerpc/kernel/power5-pmu.c | 12 +++++------ arch/powerpc/kernel/power6-pmu.c | 12 +++++------ arch/powerpc/kernel/ppc970-pmu.c | 12 +++++------ arch/powerpc/mm/fault.c | 6 +++--- arch/x86/kernel/cpu/perf_counter.c | 32 +++++++++++++-------------- arch/x86/mm/fault.c | 6 +++--- include/linux/perf_counter.h | 36 +++++++++++++++---------------- kernel/perf_counter.c | 20 ++++++++--------- tools/perf/builtin-record.c | 4 ++-- tools/perf/builtin-stat.c | 31 ++++++++++++++------------- tools/perf/builtin-top.c | 4 ++-- tools/perf/design.txt | 28 ++++++++++++------------ tools/perf/util/parse-events.c | 44 +++++++++++++++++++------------------- 15 files changed, 136 insertions(+), 135 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 0e94b685722..73956f084b2 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -535,12 +535,12 @@ static void p4_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int p4_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 7, - [PERF_COUNT_INSTRUCTIONS] = 0x1001, - [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ + [PERF_COUNT_HW_CPU_CYCLES] = 7, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x1001, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index bbf2cbb0738..5f8b7741e97 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -606,12 +606,12 @@ static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power5p_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0xf, - [PERF_COUNT_INSTRUCTIONS] = 0x100009, - [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ + 
[PERF_COUNT_HW_CPU_CYCLES] = 0xf, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 670cf10b91e..d54723ab627 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -548,12 +548,12 @@ static void power5_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power5_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0xf, - [PERF_COUNT_INSTRUCTIONS] = 0x100009, - [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ + [PERF_COUNT_HW_CPU_CYCLES] = 0xf, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 4da70786609..0cd406ee765 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -466,12 +466,12 @@ static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power6_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0x1e, - [PERF_COUNT_INSTRUCTIONS] = 2, - [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ - [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x1e, + [PERF_COUNT_HW_INSTRUCTIONS] = 2, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 336adf1736a..46a20640942 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -419,12 +419,12 @@ static void p970_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int ppc970_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 7, - [PERF_COUNT_INSTRUCTIONS] = 1, - [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ + [PERF_COUNT_HW_CPU_CYCLES] = 7, + [PERF_COUNT_HW_INSTRUCTIONS] = 1, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index ac0e112031b..5beffc8f481 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs 
*regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -312,7 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -323,7 +323,7 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 57ae1bec81b..572fb434a66 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { */ static const u64 intel_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x003c, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_CACHE_MISSES] = 0x412e, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; static u64 intel_pmu_event_map(int event) @@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x0076, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_CACHE_MISSES] = 0x0081, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; static u64 amd_pmu_event_map(int event) @@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6f9df2babe4..5c6d816f30b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); + 
perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1142,11 +1142,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d5911b02bc8..887df88a9c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -42,15 +42,15 @@ enum perf_hw_id { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - - PERF_HW_EVENTS_MAX, /* non ABI */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non ABI */ }; /* @@ -93,15 +93,15 @@ enum perf_hw_cache_op_result_id { * well): */ enum perf_sw_ids { - PERF_COUNT_CPU_CLOCK = 0, - PERF_COUNT_TASK_CLOCK = 1, - PERF_COUNT_PAGE_FAULTS = 2, - PERF_COUNT_CONTEXT_SWITCHES = 3, - PERF_COUNT_CPU_MIGRATIONS = 4, - PERF_COUNT_PAGE_FAULTS_MIN = 5, - PERF_COUNT_PAGE_FAULTS_MAJ = 6, - - PERF_SW_EVENTS_MAX, /* non ABI */ + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non ABI */ }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c02535bed26..8859b97390e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1024,7 +1024,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int do_switch = 1; regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -3411,13 +3411,13 @@ void perf_counter_task_migration(struct task_struct *task, int cpu) struct perf_counter_context *ctx; perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_CPU_MIGRATIONS, + PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); ctx = perf_pin_task_context(task); if (ctx) { perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_CPU_MIGRATIONS, + PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); perf_unpin_context(ctx); } @@ -3475,11 +3475,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * events. */ switch (counter->attr.config) { - case PERF_COUNT_CPU_CLOCK: + case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_TASK_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. 
@@ -3490,11 +3490,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_PAGE_FAULTS: - case PERF_COUNT_PAGE_FAULTS_MIN: - case PERF_COUNT_PAGE_FAULTS_MAJ: - case PERF_COUNT_CONTEXT_SWITCHES: - case PERF_COUNT_CPU_MIGRATIONS: + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: pmu = &perf_ops_generic; break; } diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 84cd336ae79..29259e74dcf 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -378,12 +378,12 @@ try_again: * is always available even if no PMU support: */ if (attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_CPU_CYCLES) { + && attr->config == PERF_COUNT_HW_CPU_CYCLES) { if (verbose) warning(" ... trying to fall back to cpu-clock-ticks\n"); attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_CPU_CLOCK; + attr->config = PERF_COUNT_SW_CPU_CLOCK; goto try_again; } printf("\n"); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6404906924f..c43e4a97dc4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -46,15 +46,16 @@ static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS }, - - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES}, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, + + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES}, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES }, + }; static int system_wide = 0; @@ -120,10 +121,10 @@ static inline int nsec_counter(int counter) if (attrs[counter].type != PERF_TYPE_SOFTWARE) return 0; - if (attrs[counter].config == PERF_COUNT_CPU_CLOCK) + if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK) return 1; - if (attrs[counter].config == PERF_COUNT_TASK_CLOCK) + if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) return 1; return 0; @@ -176,10 +177,10 @@ static void read_counter(int counter) * Save the full runtime - to allow normalization during printout: */ if (attrs[counter].type == PERF_TYPE_SOFTWARE && - attrs[counter].config == PERF_COUNT_TASK_CLOCK) + attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) runtime_nsecs = count[0]; if (attrs[counter].type == PERF_TYPE_HARDWARE && - attrs[counter].config == PERF_COUNT_CPU_CYCLES) + attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES) runtime_cycles = count[0]; } @@ -206,7 +207,7 @@ static void print_counter(int counter) fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter)); if 
(attrs[counter].type == PERF_TYPE_SOFTWARE && - attrs[counter].config == PERF_COUNT_TASK_CLOCK) { + attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) { if (walltime_nsecs) fprintf(stderr, " # %11.3f CPU utilization factor", @@ -220,7 +221,7 @@ static void print_counter(int counter) (double)count[0]/runtime_nsecs*1000.0); if (runtime_cycles && attrs[counter].type == PERF_TYPE_HARDWARE && - attrs[counter].config == PERF_COUNT_INSTRUCTIONS) { + attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) { fprintf(stderr, " # %1.3f per cycle", (double)count[0] / (double)runtime_cycles); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 309dbc76ec8..fe338d3c5d7 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -562,13 +562,13 @@ try_again: * is always available even if no PMU support: */ if (attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_CPU_CYCLES) { + && attr->config == PERF_COUNT_HW_CPU_CYCLES) { if (verbose) warning(" ... trying to fall back to cpu-clock-ticks\n"); attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_CPU_CLOCK; + attr->config = PERF_COUNT_SW_CPU_CLOCK; goto try_again; } printf("\n"); diff --git a/tools/perf/design.txt b/tools/perf/design.txt index d3250763dc9..860e116d979 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -99,13 +99,13 @@ enum hw_event_ids { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, }; These are standardized types of events that work relatively uniformly @@ -130,13 +130,13 @@ software events, selected by 'event_id': * well): */ enum sw_event_ids { - PERF_COUNT_CPU_CLOCK = 0, - PERF_COUNT_TASK_CLOCK = 1, - PERF_COUNT_PAGE_FAULTS = 2, - PERF_COUNT_CONTEXT_SWITCHES = 3, - PERF_COUNT_CPU_MIGRATIONS = 4, - PERF_COUNT_PAGE_FAULTS_MIN = 5, - PERF_COUNT_PAGE_FAULTS_MAJ = 6, + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, }; Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f18a9a006e1..9d5f1ca50e6 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -22,26 +22,26 @@ struct event_symbol { #define CR(x, y) .type = PERF_TYPE_##x, .config = y static struct event_symbol event_symbols[] = { - { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, - { C(HARDWARE, CPU_CYCLES), "cycles", }, - { C(HARDWARE, INSTRUCTIONS), "instructions", }, - { C(HARDWARE, CACHE_REFERENCES), "cache-references", }, - { C(HARDWARE, CACHE_MISSES), "cache-misses", }, - { C(HARDWARE, BRANCH_INSTRUCTIONS), "branch-instructions", }, - { C(HARDWARE, BRANCH_INSTRUCTIONS), "branches", }, - { C(HARDWARE, BRANCH_MISSES), "branch-misses", }, - { C(HARDWARE, BUS_CYCLES), "bus-cycles", }, - - { C(SOFTWARE, CPU_CLOCK), "cpu-clock", }, - { C(SOFTWARE, TASK_CLOCK), "task-clock", }, - { C(SOFTWARE, PAGE_FAULTS), "page-faults", }, - { 
C(SOFTWARE, PAGE_FAULTS), "faults", }, - { C(SOFTWARE, PAGE_FAULTS_MIN), "minor-faults", }, - { C(SOFTWARE, PAGE_FAULTS_MAJ), "major-faults", }, - { C(SOFTWARE, CONTEXT_SWITCHES), "context-switches", }, - { C(SOFTWARE, CONTEXT_SWITCHES), "cs", }, - { C(SOFTWARE, CPU_MIGRATIONS), "cpu-migrations", }, - { C(SOFTWARE, CPU_MIGRATIONS), "migrations", }, + { C(HARDWARE, HW_CPU_CYCLES), "cpu-cycles", }, + { C(HARDWARE, HW_CPU_CYCLES), "cycles", }, + { C(HARDWARE, HW_INSTRUCTIONS), "instructions", }, + { C(HARDWARE, HW_CACHE_REFERENCES), "cache-references", }, + { C(HARDWARE, HW_CACHE_MISSES), "cache-misses", }, + { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branch-instructions", }, + { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branches", }, + { C(HARDWARE, HW_BRANCH_MISSES), "branch-misses", }, + { C(HARDWARE, HW_BUS_CYCLES), "bus-cycles", }, + + { C(SOFTWARE, SW_CPU_CLOCK), "cpu-clock", }, + { C(SOFTWARE, SW_TASK_CLOCK), "task-clock", }, + { C(SOFTWARE, SW_PAGE_FAULTS), "page-faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS), "faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS_MIN), "minor-faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS_MAJ), "major-faults", }, + { C(SOFTWARE, SW_CONTEXT_SWITCHES), "context-switches", }, + { C(SOFTWARE, SW_CONTEXT_SWITCHES), "cs", }, + { C(SOFTWARE, SW_CPU_MIGRATIONS), "cpu-migrations", }, + { C(SOFTWARE, SW_CPU_MIGRATIONS), "migrations", }, }; #define __PERF_COUNTER_FIELD(config, name) \ @@ -107,7 +107,7 @@ char *event_name(int counter) switch (type) { case PERF_TYPE_HARDWARE: - if (config < PERF_HW_EVENTS_MAX) + if (config < PERF_COUNT_HW_MAX) return hw_event_names[config]; return "unknown-hardware"; @@ -136,7 +136,7 @@ char *event_name(int counter) } case PERF_TYPE_SOFTWARE: - if (config < PERF_SW_EVENTS_MAX) + if (config < PERF_COUNT_SW_MAX) return sw_event_names[config]; return "unknown-software"; -- cgit v1.2.3-70-g09d2 From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:19:11 +0200 Subject: perf_counter: Rename L2 to LL cache The top (fastest) and last level (biggest) caches are the most interesting ones, performance wise. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: [ Fixed the Nehalem LL table to LLC Reference/Miss events ] Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/power4-pmu.c | 2 +- arch/powerpc/kernel/power5+-pmu.c | 2 +- arch/powerpc/kernel/power5-pmu.c | 2 +- arch/powerpc/kernel/power6-pmu.c | 2 +- arch/powerpc/kernel/power7-pmu.c | 2 +- arch/powerpc/kernel/ppc970-pmu.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ include/linux/perf_counter.h | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 73956f084b2..07bd308a5fa 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -561,7 +561,7 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc34, 0 }, diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 5f8b7741e97..41e5d2d958d 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -632,7 +632,7 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc50c3, 0 }, diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index d54723ab627..05600b66221 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -574,7 +574,7 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0x3c309b }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc50c3, 0 }, diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 0cd406ee765..46f74bebcfd 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -493,7 +493,7 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0x4008c, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0x150730, 0x250532 }, [C(OP_WRITE)] = { 0x250432, 0x150432 }, [C(OP_PREFETCH)] = { 0x810a6, 0 }, diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index 060e0deb399..b3f7d1216ba 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -320,7 +320,7 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0x408a, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0x6080, 0x6084 }, [C(OP_WRITE)] = { 0x6082, 0x6086 }, [C(OP_PREFETCH)] = { 0, 0 }, diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 46a20640942..ba0a357a89f 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -445,7 +445,7 @@ static int 
ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0x733, 0 }, diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 572fb434a66..895c82e7845 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ @@ -141,8 +141,8 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ - [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ }, }, [ C(DTLB) ] = { @@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0, [ C(RESULT_MISS) ] = 0, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 887df88a9c2..20cf5af27ad 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -56,14 +56,14 @@ enum perf_hw_id { /* * Generalized hardware cache counters: * - * { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x * { read, write, prefetch } x * { accesses, misses } */ enum perf_hw_cache_id { PERF_COUNT_HW_CACHE_L1D = 0, PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_L2 = 2, + PERF_COUNT_HW_CACHE_LL = 2, PERF_COUNT_HW_CACHE_DTLB = 3, PERF_COUNT_HW_CACHE_ITLB = 4, PERF_COUNT_HW_CACHE_BPU = 5, -- cgit v1.2.3-70-g09d2
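For reference, a minimal user-space sketch of how the renamed identifiers end up being used, mirroring the attr setup in tools/perf/builtin-stat.c and the software fallback in builtin-record.c/builtin-top.c shown above. The first two initializers follow the patches directly; the PERF_TYPE_HW_CACHE type and the (id | op << 8 | result << 16) config packing for the last one are assumptions taken from the generalized cache event encoding, not something spelled out in these two diffs:

	#include <linux/perf_counter.h>

	/* Hardware event, using the standardized PERF_COUNT_HW_* name: */
	static struct perf_counter_attr cycles_attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,	/* was PERF_COUNT_CPU_CYCLES */
	};

	/* Software fallback, as builtin-record.c does when no PMU support is found: */
	static struct perf_counter_attr cpu_clock_attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_CPU_CLOCK,	/* was PERF_COUNT_CPU_CLOCK */
	};

	/*
	 * Generalized cache event after the L2 -> LL rename: last-level
	 * cache read misses.  The config packing below is an assumption
	 * based on the generalized cache event encoding and is not part
	 * of these patches:
	 */
	static struct perf_counter_attr llc_miss_attr = {
		.type	= PERF_TYPE_HW_CACHE,
		.config	= PERF_COUNT_HW_CACHE_LL |
			  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
	};

The op and result identifiers expand exactly as the C(x) macro in the PMU tables does (C(OP_READ) becomes PERF_COUNT_HW_CACHE_OP_READ, and so on), so tools that already use the macro pick up the LL rename with a single-token change.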