diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
39 files changed, 3063 insertions, 2450 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1d2cb383410..c202b62f367 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o -obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 468489b57aa..97ad79cdf68 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, { 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c deleted file mode 100644 index b368cd86299..00000000000 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CPU x86 architecture debug code - * - * Copyright(C) 2009 Jaswinder Singh Rajput - * - * For licencing details see kernel-base/COPYING - */ - -#include <linux/interrupt.h> -#include <linux/compiler.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/smp.h> - -#include <asm/cpu_debug.h> -#include <asm/paravirt.h> -#include <asm/system.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/desc.h> - -static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr); -static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr); -static DEFINE_PER_CPU(int, cpud_priv_count); - -static DEFINE_MUTEX(cpu_debug_lock); - -static struct dentry *cpu_debugfs_dir; - -static struct cpu_debug_base cpu_base[] = { - { "mc", CPU_MC, 0 }, - { "monitor", CPU_MONITOR, 0 }, - { "time", CPU_TIME, 0 }, - { "pmc", CPU_PMC, 1 }, - { "platform", CPU_PLATFORM, 0 }, - { "apic", CPU_APIC, 0 }, - { "poweron", CPU_POWERON, 0 }, - { "control", CPU_CONTROL, 0 }, - { "features", CPU_FEATURES, 0 }, - { "lastbranch", CPU_LBRANCH, 0 }, - { "bios", CPU_BIOS, 0 }, - { "freq", CPU_FREQ, 0 }, - { "mtrr", CPU_MTRR, 0 }, - { "perf", CPU_PERF, 0 }, - { "cache", CPU_CACHE, 0 }, - { "sysenter", CPU_SYSENTER, 0 }, - { "therm", CPU_THERM, 0 }, - { "misc", CPU_MISC, 0 }, - { "debug", CPU_DEBUG, 0 }, - { "pat", CPU_PAT, 0 }, - { "vmx", CPU_VMX, 0 }, - { "call", CPU_CALL, 0 }, - { "base", CPU_BASE, 0 }, - { "ver", CPU_VER, 0 }, - { "conf", CPU_CONF, 0 }, - { "smm", CPU_SMM, 0 }, - { "svm", CPU_SVM, 0 }, - { "osvm", CPU_OSVM, 0 }, - { "tss", CPU_TSS, 0 }, - { "cr", CPU_CR, 0 }, - { "dt", CPU_DT, 0 }, - { "registers", CPU_REG_ALL, 0 }, -}; - -static struct cpu_file_base cpu_file[] = { - { "index", CPU_REG_ALL, 0 }, - { "value", CPU_REG_ALL, 1 }, -}; - -/* CPU Registers Range */ -static struct cpu_debug_range cpu_reg_range[] = { - { 0x00000000, 0x00000001, CPU_MC, }, - { 0x00000006, 0x00000007, CPU_MONITOR, }, - { 0x00000010, 0x00000010, CPU_TIME, }, - { 0x00000011, 0x00000013, CPU_PMC, }, - { 0x00000017, 0x00000017, CPU_PLATFORM, }, - { 0x0000001B, 0x0000001B, CPU_APIC, }, - { 0x0000002A, 0x0000002B, CPU_POWERON, }, - { 0x0000002C, 0x0000002C, CPU_FREQ, }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, }, - { 0x00000040, 0x00000047, CPU_LBRANCH, }, - { 0x00000060, 0x00000067, CPU_LBRANCH, }, - { 0x00000079, 0x00000079, CPU_BIOS, }, - { 0x00000088, 0x0000008A, CPU_CACHE, }, - { 0x0000008B, 0x0000008B, CPU_BIOS, }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, }, - { 0x000000C1, 0x000000C4, CPU_PMC, }, - { 0x000000CD, 0x000000CD, CPU_FREQ, }, - { 0x000000E7, 0x000000E8, CPU_PERF, }, - { 0x000000FE, 0x000000FE, CPU_MTRR, }, - - { 0x00000116, 0x0000011E, CPU_CACHE, }, - { 0x00000174, 0x00000176, CPU_SYSENTER, }, - { 0x00000179, 0x0000017B, CPU_MC, }, - { 0x00000186, 0x00000189, CPU_PMC, }, - { 0x00000198, 0x00000199, CPU_PERF, }, - { 0x0000019A, 0x0000019A, CPU_TIME, }, - { 0x0000019B, 0x0000019D, CPU_THERM, }, - { 0x000001A0, 0x000001A0, CPU_MISC, }, - { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, }, - { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, }, - { 0x00000250, 0x00000250, CPU_MTRR, }, - { 0x00000258, 0x00000259, CPU_MTRR, }, - { 0x00000268, 0x0000026F, CPU_MTRR, }, - { 0x00000277, 0x00000277, CPU_PAT, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, }, - - { 0x00000300, 0x00000311, CPU_PMC, }, - { 0x00000345, 0x00000345, CPU_PMC, }, - { 0x00000360, 0x00000371, CPU_PMC, }, - { 0x0000038D, 0x00000390, CPU_PMC, }, - { 0x000003A0, 0x000003BE, CPU_PMC, }, - { 0x000003C0, 0x000003CD, CPU_PMC, }, - { 0x000003E0, 0x000003E1, CPU_PMC, }, - { 0x000003F0, 0x000003F2, CPU_PMC, }, - - { 0x00000400, 0x00000417, CPU_MC, }, - { 0x00000480, 0x0000048B, CPU_VMX, }, - - { 0x00000600, 0x00000600, CPU_DEBUG, }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, - - { 0x000107CC, 0x000107D3, CPU_PMC, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, }, - { 0xC0000081, 0xC0000084, CPU_CALL, }, - { 0xC0000100, 0xC0000102, CPU_BASE, }, - { 0xC0000103, 0xC0000103, CPU_TIME, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, }, - { 0xC0010010, 0xC0010010, CPU_CONF, }, - { 0xC0010015, 0xC0010015, CPU_CONF, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, }, - { 0xC001001F, 0xC001001F, CPU_CONF, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, }, - { 0xC0010044, 0xC0010048, CPU_MC, }, - { 0xC0010050, 0xC0010056, CPU_SMM, }, - { 0xC0010058, 0xC0010058, CPU_CONF, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, }, - { 0xC0010061, 0xC0010068, CPU_SMM, }, - { 0xC0010069, 0xC001006B, CPU_SMM, }, - { 0xC0010070, 0xC0010071, CPU_SMM, }, - { 0xC0010111, 0xC0010113, CPU_SMM, }, - { 0xC0010114, 0xC0010118, CPU_SVM, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, }, - { 0xC0011022, 0xC0011023, CPU_CONF, }, -}; - -static int is_typeflag_valid(unsigned cpu, unsigned flag) -{ - int i; - - /* Standard Registers should be always valid */ - if (flag >= CPU_TSS) - return 1; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (cpu_reg_range[i].flag == flag) - return 1; - } - - /* Invalid */ - return 0; -} - -static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, - int index, unsigned flag) -{ - if (cpu_reg_range[index].flag == flag) { - *min = cpu_reg_range[index].min; - *max = cpu_reg_range[index].max; - } else - *max = 0; - - return *max; -} - -/* This function can also be called with seq = NULL for printk */ -static void print_cpu_data(struct seq_file *seq, unsigned type, - u32 low, u32 high) -{ - struct cpu_private *priv; - u64 val = high; - - if (seq) { - priv = seq->private; - if (priv->file) { - val = (val << 32) | low; - seq_printf(seq, "0x%llx\n", val); - } else - seq_printf(seq, " %08x: %08x_%08x\n", - type, high, low); - } else - printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); -} - -/* This function can also be called with seq = NULL for printk */ -static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) -{ - unsigned msr, msr_min, msr_max; - struct cpu_private *priv; - u32 low, high; - int i; - - if (seq) { - priv = seq->private; - if (priv->file) { - if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, - &low, &high)) - print_cpu_data(seq, priv->reg, low, high); - return; - } - } - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) - continue; - - for (msr = msr_min; msr <= msr_max; msr++) { - if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) - continue; - print_cpu_data(seq, msr, low, high); - } - } -} - -static void print_tss(void *arg) -{ - struct pt_regs *regs = task_pt_regs(current); - struct seq_file *seq = arg; - unsigned int seg; - - seq_printf(seq, " RAX\t: %016lx\n", regs->ax); - seq_printf(seq, " RBX\t: %016lx\n", regs->bx); - seq_printf(seq, " RCX\t: %016lx\n", regs->cx); - seq_printf(seq, " RDX\t: %016lx\n", regs->dx); - - seq_printf(seq, " RSI\t: %016lx\n", regs->si); - seq_printf(seq, " RDI\t: %016lx\n", regs->di); - seq_printf(seq, " RBP\t: %016lx\n", regs->bp); - seq_printf(seq, " ESP\t: %016lx\n", regs->sp); - -#ifdef CONFIG_X86_64 - seq_printf(seq, " R08\t: %016lx\n", regs->r8); - seq_printf(seq, " R09\t: %016lx\n", regs->r9); - seq_printf(seq, " R10\t: %016lx\n", regs->r10); - seq_printf(seq, " R11\t: %016lx\n", regs->r11); - seq_printf(seq, " R12\t: %016lx\n", regs->r12); - seq_printf(seq, " R13\t: %016lx\n", regs->r13); - seq_printf(seq, " R14\t: %016lx\n", regs->r14); - seq_printf(seq, " R15\t: %016lx\n", regs->r15); -#endif - - asm("movl %%cs,%0" : "=r" (seg)); - seq_printf(seq, " CS\t: %04x\n", seg); - asm("movl %%ds,%0" : "=r" (seg)); - seq_printf(seq, " DS\t: %04x\n", seg); - seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); - asm("movl %%es,%0" : "=r" (seg)); - seq_printf(seq, " ES\t: %04x\n", seg); - asm("movl %%fs,%0" : "=r" (seg)); - seq_printf(seq, " FS\t: %04x\n", seg); - asm("movl %%gs,%0" : "=r" (seg)); - seq_printf(seq, " GS\t: %04x\n", seg); - - seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); - - seq_printf(seq, " EIP\t: %016lx\n", regs->ip); -} - -static void print_cr(void *arg) -{ - struct seq_file *seq = arg; - - seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); - seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); - seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); - seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); -#ifdef CONFIG_X86_64 - seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); -#endif -} - -static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) -{ - seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); -} - -static void print_dt(void *seq) -{ - struct desc_ptr dt; - unsigned long ldt; - - /* IDT */ - store_idt((struct desc_ptr *)&dt); - print_desc_ptr("IDT", seq, dt); - - /* GDT */ - store_gdt((struct desc_ptr *)&dt); - print_desc_ptr("GDT", seq, dt); - - /* LDT */ - store_ldt(ldt); - seq_printf(seq, " LDT\t: %016lx\n", ldt); - - /* TR */ - store_tr(ldt); - seq_printf(seq, " TR\t: %016lx\n", ldt); -} - -static void print_dr(void *arg) -{ - struct seq_file *seq = arg; - unsigned long dr; - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - get_debugreg(dr, i); - seq_printf(seq, " dr%d\t: %016lx\n", i, dr); - } - - seq_printf(seq, "\n MSR\t:\n"); -} - -static void print_apic(void *arg) -{ - struct seq_file *seq = arg; - -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(seq, " LAPIC\t:\n"); - seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); - seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR)); - seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); - seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); - seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); - seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); - seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); - seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); - seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); - seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); - seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); - seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); - seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); - seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); - seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); - seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); - seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); - seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); - seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); - seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); - seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); - if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { - unsigned int i, v, maxeilvt; - - v = apic_read(APIC_EFEAT); - maxeilvt = (v >> 16) & 0xff; - seq_printf(seq, " EFEAT\t\t: %08x\n", v); - seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); - - for (i = 0; i < maxeilvt; i++) { - v = apic_read(APIC_EILVTn(i)); - seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); - } - } -#endif /* CONFIG_X86_LOCAL_APIC */ - seq_printf(seq, "\n MSR\t:\n"); -} - -static int cpu_seq_show(struct seq_file *seq, void *v) -{ - struct cpu_private *priv = seq->private; - - if (priv == NULL) - return -EINVAL; - - switch (cpu_base[priv->type].flag) { - case CPU_TSS: - smp_call_function_single(priv->cpu, print_tss, seq, 1); - break; - case CPU_CR: - smp_call_function_single(priv->cpu, print_cr, seq, 1); - break; - case CPU_DT: - smp_call_function_single(priv->cpu, print_dt, seq, 1); - break; - case CPU_DEBUG: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_dr, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - case CPU_APIC: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_apic, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - - default: - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - } - seq_printf(seq, "\n"); - - return 0; -} - -static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos == 0) /* One time is enough ;-) */ - return seq; - - return NULL; -} - -static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - (*pos)++; - - return cpu_seq_start(seq, pos); -} - -static void cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations cpu_seq_ops = { - .start = cpu_seq_start, - .next = cpu_seq_next, - .stop = cpu_seq_stop, - .show = cpu_seq_show, -}; - -static int cpu_seq_open(struct inode *inode, struct file *file) -{ - struct cpu_private *priv = inode->i_private; - struct seq_file *seq; - int err; - - err = seq_open(file, &cpu_seq_ops); - if (!err) { - seq = file->private_data; - seq->private = priv; - } - - return err; -} - -static int write_msr(struct cpu_private *priv, u64 val) -{ - u32 low, high; - - high = (val >> 32) & 0xffffffff; - low = val & 0xffffffff; - - if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) - return 0; - - return -EPERM; -} - -static int write_cpu_register(struct cpu_private *priv, const char *buf) -{ - int ret = -EPERM; - u64 val; - - ret = strict_strtoull(buf, 0, &val); - if (ret < 0) - return ret; - - /* Supporting only MSRs */ - if (priv->type < CPU_TSS_BIT) - return write_msr(priv, val); - - return ret; -} - -static ssize_t cpu_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct cpu_private *priv = seq->private; - char buf[19]; - - if ((priv == NULL) || (count >= sizeof(buf))) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) - if (!write_cpu_register(priv, buf)) - return count; - - return -EACCES; -} - -static const struct file_operations cpu_fops = { - .owner = THIS_MODULE, - .open = cpu_seq_open, - .read = seq_read, - .write = cpu_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, - unsigned file, struct dentry *dentry) -{ - struct cpu_private *priv = NULL; - - /* Already intialized */ - if (file == CPU_INDEX_BIT) - if (per_cpu(cpud_arr[type].init, cpu)) - return 0; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv == NULL) - return -ENOMEM; - - priv->cpu = cpu; - priv->type = type; - priv->reg = reg; - priv->file = file; - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_priv_arr[type], cpu) = priv; - per_cpu(cpud_priv_count, cpu)++; - mutex_unlock(&cpu_debug_lock); - - if (file) - debugfs_create_file(cpu_file[file].name, S_IRUGO, - dentry, (void *)priv, &cpu_fops); - else { - debugfs_create_file(cpu_base[type].name, S_IRUGO, - per_cpu(cpud_arr[type].dentry, cpu), - (void *)priv, &cpu_fops); - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_arr[type].init, cpu) = 1; - mutex_unlock(&cpu_debug_lock); - } - - return 0; -} - -static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, - struct dentry *dentry) -{ - unsigned file; - int err = 0; - - for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { - err = cpu_create_file(cpu, type, reg, file, dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned reg, reg_min, reg_max; - int i, err = 0; - char reg_dir[12]; - u32 low, high; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, ®_min, ®_max, i, - cpu_base[type].flag)) - continue; - - for (reg = reg_min; reg <= reg_max; reg++) { - if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) - continue; - - sprintf(reg_dir, "0x%x", reg); - cpu_dentry = debugfs_create_dir(reg_dir, dentry); - err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); - if (err) - return err; - } - } - - return err; -} - -static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned type; - int err = 0; - - for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { - if (!is_typeflag_valid(cpu, cpu_base[type].flag)) - continue; - cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); - per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry; - - if (type < CPU_TSS_BIT) - err = cpu_init_msr(cpu, type, cpu_dentry); - else - err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, - cpu_dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_cpu(void) -{ - struct dentry *cpu_dentry = NULL; - struct cpuinfo_x86 *cpui; - char cpu_dir[12]; - unsigned cpu; - int err = 0; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - cpui = &cpu_data(cpu); - if (!cpu_has(cpui, X86_FEATURE_MSR)) - continue; - - sprintf(cpu_dir, "cpu%d", cpu); - cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); - err = cpu_init_allreg(cpu, cpu_dentry); - - pr_info("cpu%d(%d) debug files %d\n", - cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu)); - if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) { - pr_err("Register files count %d exceeds limit %d\n", - per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES); - per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES; - err = -ENFILE; - } - if (err) - return err; - } - - return err; -} - -static int __init cpu_debug_init(void) -{ - cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); - - return cpu_init_cpu(); -} - -static void __exit cpu_debug_exit(void) -{ - int i, cpu; - - if (cpu_debugfs_dir) - debugfs_remove_recursive(cpu_debugfs_dir); - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++) - kfree(per_cpu(cpud_priv_arr[i], cpu)); -} - -module_init(cpu_debug_init); -module_exit(cpu_debug_exit); - -MODULE_AUTHOR("Jaswinder Singh Rajput"); -MODULE_DESCRIPTION("CPU Debug module"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index f138c6c389b..870e6cc6ad2 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -10,6 +10,20 @@ if CPU_FREQ comment "CPUFreq processor drivers" +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + <file:Documentation/cpu-freq/pcc-cpufreq.txt>. + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 509296df294..1840c0a5170 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1b1920fa7c8..459168083b7 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <trace/events/power.h> #include <linux/acpi.h> diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 006b278b0d5..c587db472a7 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -20,7 +20,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/cpufreq.h> diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index ac27ec2264d..16e3483be9e 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -80,6 +80,7 @@ #include <linux/cpufreq.h> #include <linux/pci.h> #include <linux/errno.h> +#include <linux/slab.h> #include <asm/processor-cyrix.h> diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index da5f70fcb76..e7b559d74c5 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/cpufreq.h> #include <linux/timex.h> diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 86961519372..7b8a8ba67b0 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -25,7 +25,6 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/cpumask.h> #include <linux/timex.h> diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c new file mode 100644 index 00000000000..ce7cde713e7 --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -0,0 +1,621 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <linux/cpufreq.h> +#include <linux/compiler.h> +#include <linux/slab.h> + +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> + +#include <acpi/processor.h> + +#define PCC_VERSION "1.00.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ + "pcc-cpufreq", msg) + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + dprintk("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + dprintk("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + dprintk("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%x\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + dprintk("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + dprintk("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == NULL) { + dprintk("probe: could not map shared mem region\n"); + goto out_free; + } + pcch_hdr = pcch_virt_addr; + + dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); + dprintk("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + dprintk("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + dprintk("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto pcch_null; + } + + result = pcc_get_offset(cpu); + if (result) { + dprintk("init: PCCP evaluation failed\n"); + goto free; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + dprintk("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); + + return 0; +free: + pcc_clear_mapping(); + free_percpu(pcc_cpu_info); +pcch_null: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + .init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index cb01dac267d..b3379d6a5c5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -13,7 +13,6 @@ #include <linux/init.h> #include <linux/cpufreq.h> #include <linux/ioport.h> -#include <linux/slab.h> #include <linux/timex.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f125e5c551c..d360b56e982 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data) static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - acpi_integer control; + u64 control; if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; @@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val = -ENODEV; - acpi_integer control, status; + u64 control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); @@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, u32 fid; u32 vid; u32 freq, index; - acpi_integer status, control; + u64 status, control; if (data->exttype) { status = data->acpi_data.states[i].status; @@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) kfree(data->powernow_table); kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; return 0; } @@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu) int err; if (!data) - return -EINVAL; + return 0; smp_call_function_single(cpu, query_values_on_cpu, &err, true); if (err) diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 8d672ef162c..9b1ff37de46 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -20,6 +20,7 @@ #include <linux/sched.h> /* current */ #include <linux/delay.h> #include <linux/compiler.h> +#include <linux/gfp.h> #include <asm/msr.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 2ce8e0b5cc5..561758e9518 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -23,7 +23,6 @@ #include <linux/init.h> #include <linux/cpufreq.h> #include <linux/pci.h> -#include <linux/slab.h> #include <linux/sched.h> #include "speedstep-lib.h" diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index ad0083abfa2..a94ec6be69f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -13,7 +13,6 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <asm/msr.h> #include <asm/tsc.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index 04d73c114e4..8abd869baab 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -17,7 +17,6 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/io.h> #include <asm/ist.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 879666f4d87..7e1cca13af3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - sched_clock_stable = 1; + if (!check_tsc_unstable()) + sched_clock_stable = 1; } /* diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c2b722d5a72..b3eeb66c0a5 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -18,6 +18,7 @@ #include <asm/processor.h> #include <linux/smp.h> #include <asm/k8.h> +#include <asm/smp.h> #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -152,7 +153,8 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; @@ -162,7 +164,8 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; }; unsigned short num_cache_leaves; @@ -292,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); +}; + +#ifdef CONFIG_CPU_SUP_AMD +static unsigned int __cpuinit amd_calc_l3_indices(void) +{ + /* + * We're called over smp_call_function_single() and therefore + * are on the correct cpu. + */ + int cpu = smp_processor_id(); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(dev, 0x1C4, &val); + + /* calculate subcache sizes */ + sc0 = !(val & BIT(0)); + sc1 = !(val & BIT(4)); + sc2 = !(val & BIT(8)) + !(val & BIT(9)); + sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; +} + static void __cpuinit amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { @@ -301,12 +334,103 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) if (boot_cpu_data.x86 == 0x11) return; - /* see erratum #382 */ - if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + /* see errata #382 and #388 */ + if ((boot_cpu_data.x86 == 0x10) && + ((boot_cpu_data.x86_model < 0x8) || + (boot_cpu_data.x86_mask < 0x1))) return; - this_leaf->can_disable = 1; + this_leaf->can_disable = true; + this_leaf->l3_indices = amd_calc_l3_indices(); +} + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!dev) + return -EINVAL; + + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "0x%08x\n", reg); +} + +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) + return -EINVAL; + + if (strict_strtoul(buf, 10, &val) < 0) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) + return -EINVAL; + + val |= BIT(30); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. + */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); + return count; +} + +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); + +#else /* CONFIG_CPU_SUP_AMD */ +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +{ +}; +#endif /* CONFIG_CPU_SUP_AMD */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -713,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, - unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned int reg = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!dev) - return -EINVAL; - - pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "%x\n", reg); -} - -#define SHOW_CACHE_DISABLE(index) \ -static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ -{ \ - return show_cache_disable(this_leaf, buf, index); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned long val = 0; - unsigned int scrubber = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; - - val |= 0xc0000000; - - pci_read_config_dword(dev, 0x58, &scrubber); - scrubber &= ~0x1f000000; - pci_write_config_dword(dev, 0x58, scrubber); - - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); - wbinvd(); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - return count; -} - -#define STORE_CACHE_DISABLE(index) \ -static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ -{ \ - return store_cache_disable(this_leaf, buf, count, index); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); -}; - #define define_one_ro(_name) \ static struct _cache_attr _name = \ __ATTR(_name, 0444, show_##_name, NULL) @@ -803,23 +851,28 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); +#define DEFAULT_SYSFS_CACHE_ATTRS \ + &type.attr, \ + &level.attr, \ + &coherency_line_size.attr, \ + &physical_line_partition.attr, \ + &ways_of_associativity.attr, \ + &number_of_sets.attr, \ + &size.attr, \ + &shared_cpu_map.attr, \ + &shared_cpu_list.attr static struct attribute *default_attrs[] = { - &type.attr, - &level.attr, - &coherency_line_size.attr, - &physical_line_partition.attr, - &ways_of_associativity.attr, - &number_of_sets.attr, - &size.attr, - &shared_cpu_map.attr, - &shared_cpu_list.attr, + DEFAULT_SYSFS_CACHE_ATTRS, + NULL +}; + +static struct attribute *default_l3_attrs[] = { + DEFAULT_SYSFS_CACHE_ATTRS, +#ifdef CONFIG_CPU_SUP_AMD &cache_disable_0.attr, &cache_disable_1.attr, +#endif NULL }; @@ -850,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = { .show = show, .store = store, }; @@ -910,6 +963,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; + struct _cpuid4_info *this_leaf; int retval; retval = cpuid4_cache_sysfs_init(cpu); @@ -928,6 +982,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; + + this_leaf = CPUID4_INFO_IDX(cpu, i); + + if (this_leaf->can_disable) + ktype_cache.default_attrs = default_l3_attrs; + else + ktype_cache.default_attrs = default_attrs; + retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 73734baa50f..e7dbde7bfed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -22,6 +22,7 @@ #include <linux/kdebug.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/gfp.h> #include <asm/mce.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a8aacd4b513..7a355ddcc64 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -26,6 +26,7 @@ #include <linux/sched.h> #include <linux/sysfs.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/poll.h> @@ -46,6 +47,13 @@ #include "mce-internal.h" +static DEFINE_MUTEX(mce_read_mutex); + +#define rcu_dereference_check_mce(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&mce_read_mutex)) + #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -158,7 +166,7 @@ void mce_log(struct mce *mce) mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference(mcelog.next); + entry = rcu_dereference_check_mce(mcelog.next); for (;;) { /* * When the buffer fills up discard new entries. @@ -531,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - __get_cpu_var(mce_poll_count)++; + percpu_inc(mce_poll_count); mce_setup(&m); @@ -926,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); - __get_cpu_var(mce_exception_count)++; + percpu_inc(mce_exception_count); if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) @@ -1485,8 +1493,6 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } -static DEFINE_MUTEX(mce_read_mutex); - static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { @@ -1500,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, return -ENOMEM; mutex_lock(&mce_read_mutex); - next = rcu_dereference(mcelog.next); + next = rcu_dereference_check_mce(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { @@ -1565,7 +1571,7 @@ timeout: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) + if (rcu_dereference_check_mce(mcelog.next)) return POLLIN | POLLRDNORM; return 0; } @@ -2044,6 +2050,7 @@ static __init void mce_init_banks(void) struct mce_bank *b = &mce_banks[i]; struct sysdev_attribute *a = &b->attr; + sysfs_attr_init(&a->attr); a->attr.name = b->attrname; snprintf(b->attrname, ATTR_LEN, "bank%d", i); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 83a3d1f4efc..224392d8fe8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -21,6 +21,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/sysfs.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/smp.h> @@ -388,7 +389,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops threshold_ops = { +static const struct sysfs_ops threshold_ops = { .show = show, .store = store, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 7c785634af2..62b48e40920 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -5,6 +5,7 @@ * Author: Andi Kleen */ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> @@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot) /* Already owned by someone else? */ if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) + if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; @@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot) /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) + if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); } else { diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index f4361b56f8e..ad9e5ed8118 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o state.o cleanup.o +obj-y := main.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 33af14110df..92ba9cd31c9 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) return 0; } -static struct mtrr_ops amd_mtrr_ops = { +static const struct mtrr_ops amd_mtrr_ops = { .vendor = X86_VENDOR_AMD, .set = amd_set_mtrr, .get = amd_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index de89f14eff3..316fe3e60a9 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t return 0; } -static struct mtrr_ops centaur_mtrr_ops = { +static const struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, .set = centaur_set_mcr, .get = centaur_get_mcr, diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 09b1698e046..06130b52f01 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -22,10 +22,10 @@ #include <linux/pci.h> #include <linux/smp.h> #include <linux/cpu.h> -#include <linux/sort.h> #include <linux/mutex.h> #include <linux/uaccess.h> #include <linux/kvm_para.h> +#include <linux/range.h> #include <asm/processor.h> #include <asm/e820.h> @@ -34,11 +34,6 @@ #include "mtrr.h" -struct res_range { - unsigned long start; - unsigned long end; -}; - struct var_mtrr_range_state { unsigned long base_pfn; unsigned long size_pfn; @@ -56,7 +51,7 @@ struct var_mtrr_state { /* Should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 -static struct res_range __initdata range[RANGE_NUM]; +static struct range __initdata range[RANGE_NUM]; static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; @@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; #define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) - -static int __init -add_range(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - /* Out of slots: */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -static int __init -add_range_with_merge(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - int i; - - /* Try to merge it with old one: */ - for (i = 0; i < nr_range; i++) { - unsigned long final_start, final_end; - unsigned long common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end + 1) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* Need to add it: */ - return add_range(range, nr_range, start, end); -} - -static void __init -subtract_range(struct res_range *range, unsigned long start, unsigned long end) -{ - int i, j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* Find the new spare: */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static int __init cmp_range(const void *x1, const void *x2) -{ - const struct res_range *r1 = x1; - const struct res_range *r2 = x2; - long start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - -static int __init clean_sort_range(struct res_range *range, int az) -{ - int i, j, k = az - 1, nr_range = 0; - - for (i = 0; i < k; i++) { - if (range[i].end) - continue; - for (j = k; j > i; j--) { - if (range[j].end) { - k = j; - break; - } - } - if (j == i) - break; - range[i].start = range[k].start; - range[i].end = range[k].end; - range[k].start = 0; - range[k].end = 0; - k--; - } - /* count it */ - for (i = 0; i < az; i++) { - if (!range[i].end) { - nr_range = i; - break; - } - } - - /* sort them */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - - return nr_range; -} - #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, +x86_get_mtrr_mem_range(struct range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) { @@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, continue; base = range_state[i].base_pfn; size = range_state[i].size_pfn; - nr_range = add_range_with_merge(range, nr_range, base, - base + size - 1); + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); } /* Take out UC ranges: */ @@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size); } if (extra_remove_size) - subtract_range(range, extra_remove_base, - extra_remove_base + extra_remove_size - 1); + subtract_range(range, RANGE_NUM, extra_remove_base, + extra_remove_base + extra_remove_size); if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); } } @@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); } - /* clear those is not used */ - for (i = nr_range; i < RANGE_NUM; i++) - memset(&range[i], 0, sizeof(range[i])); - return nr_range; } #ifdef CONFIG_MTRR_SANITIZER -static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +static unsigned long __init sum_ranges(struct range *range, int nr_range) { unsigned long sum = 0; int i; for (i = 0; i < nr_range; i++) - sum += range[i].end + 1 - range[i].start; + sum += range[i].end - range[i].start; return sum; } @@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg) early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init -x86_setup_var_mtrrs(struct res_range *range, int nr_range, +x86_setup_var_mtrrs(struct range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; @@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, /* Write the range: */ for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, - range[i].end - range[i].start + 1); + range[i].end - range[i].start); } /* Write the last range: */ @@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size, unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct res_range range_new[RANGE_NUM]; + static struct range range_new[RANGE_NUM]; unsigned long range_sums_new; static int nr_range_new; int num_reg; @@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits) * [0, 1M) should always be covered by var mtrr with WB * and fixed mtrrs should take effect before var mtrr for it: */ - nr_range = add_range_with_merge(range, nr_range, 0, - (1ULL<<(20 - PAGE_SHIFT)) - 1); + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, + 1ULL<<(20 - PAGE_SHIFT)); /* Sort the ranges: */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + sort_range(range, nr_range); range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM covered: %ldM\n", @@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) nr_range = 0; if (mtrr_tom2) { range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); - range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; - if (highest_pfn < range[nr_range].end + 1) - highest_pfn = range[nr_range].end + 1; + range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT; + if (highest_pfn < range[nr_range].end) + highest_pfn = range[nr_range].end; nr_range++; } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); @@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { - if (range[i].end + 1 < range[i+1].start) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end, range[i+1].start); } /* Check the top: */ i = nr_range - 1; - if (range[i].end + 1 < end_pfn) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < end_pfn) + total_trim_size += real_trim_memory(range[i].end, end_pfn); if (total_trim_size) { diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 228d982ce09..68a3343e579 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -265,7 +265,7 @@ static void cyrix_set_all(void) post_set(); } -static struct mtrr_ops cyrix_mtrr_ops = { +static const struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, .set_all = cyrix_set_all, .set = cyrix_set_arr, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 55da0c5f68d..fd31a441c61 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -6,7 +6,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/io.h> #include <linux/mm.h> @@ -464,7 +463,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); mask_lo = tmp; } } @@ -570,7 +569,7 @@ static unsigned long set_mtrr_state(void) static unsigned long cr4; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts, @@ -590,7 +589,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) * changes to the way the kernel boots */ - spin_lock(&set_atomicity_lock); + raw_spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; @@ -627,7 +626,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(cr4); - spin_unlock(&set_atomicity_lock); + raw_spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) @@ -752,7 +751,7 @@ int positive_have_wrcomb(void) /* * Generic structure... */ -struct mtrr_ops generic_mtrr_ops = { +const struct mtrr_ops generic_mtrr_ops = { .use_intel_if = 1, .set_all = generic_set_all, .get = generic_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index e006e56f699..79289632cb2 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/string.h> +#include <linux/slab.h> #include <linux/init.h> #define LINE_SIZE 80 diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 84e83de5457..79556bd9b60 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops *mtrr_if; +const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops *ops) +void set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; @@ -145,6 +145,7 @@ struct set_mtrr_data { /** * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * @info: pointer to mtrr configuration data * * Returns nothing. */ diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index a501dee9a87..df5e41f31a2 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size, extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); -extern struct mtrr_ops generic_mtrr_ops; +extern const struct mtrr_ops generic_mtrr_ops; extern int positive_have_wrcomb(void); @@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops *ops); +extern void set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops *mtrr_if; +extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c deleted file mode 100644 index dfc80b4e6b0..00000000000 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ /dev/null @@ -1,94 +0,0 @@ -#include <linux/init.h> -#include <linux/io.h> -#include <linux/mm.h> - -#include <asm/processor-cyrix.h> -#include <asm/processor-flags.h> -#include <asm/mtrr.h> -#include <asm/msr.h> - -#include "mtrr.h" - -/* Put the processor into a state where MTRRs can be safely set */ -void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) -{ - unsigned int cr0; - - /* Disable interrupts locally */ - local_irq_save(ctxt->flags); - - if (use_intel() || is_cpu(CYRIX)) { - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { - ctxt->cr4val = read_cr4(); - write_cr4(ctxt->cr4val & ~X86_CR4_PGE); - } - - /* - * Disable and flush caches. Note that wbinvd flushes the TLBs - * as a side-effect - */ - cr0 = read_cr0() | X86_CR0_CD; - wbinvd(); - write_cr0(cr0); - wbinvd(); - - if (use_intel()) { - /* Save MTRR state */ - rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else were excluded at the top - */ - ctxt->ccr3 = getCx86(CX86_CCR3); - } - } -} - -void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) -{ - if (use_intel()) { - /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, - ctxt->deftype_hi); - } else { - if (is_cpu(CYRIX)) { - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); - } - } -} - -/* Restore the processor after a set_mtrr_prepare */ -void set_mtrr_done(struct set_mtrr_context *ctxt) -{ - if (use_intel() || is_cpu(CYRIX)) { - - /* Flush caches and TLBs */ - wbinvd(); - - /* Restore MTRRdefType */ - if (use_intel()) { - /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, - ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else was excluded at the top - */ - setCx86(CX86_CCR3, ctxt->ccr3); - } - - /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); - - /* Restore value of CR4 */ - if (cpu_has_pge) - write_cr4(ctxt->cr4val); - } - /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); -} diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c223b7e895d..db5bdc8addf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -7,6 +7,7 @@ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ @@ -20,12 +21,15 @@ #include <linux/kdebug.h> #include <linux/sched.h> #include <linux/uaccess.h> +#include <linux/slab.h> #include <linux/highmem.h> #include <linux/cpu.h> +#include <linux/bitops.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> +#include <asm/compat.h> static u64 perf_event_mask __read_mostly; @@ -68,26 +72,59 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; +}; + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + struct cpu_hw_events { - struct perf_event *events[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; struct debug_store *ds; -}; -struct event_constraint { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int code; + int n_events; + int n_added; + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct amd_nb *amd_nb; }; -#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } -#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } +#define __EVENT_CONSTRAINT(c, n, m, w) {\ + { .idxmsk64 = (n) }, \ + .code = (c), \ + .cmask = (m), \ + .weight = (w), \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->idxmsk[0]; (e)++) +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) + +#define EVENT_CONSTRAINT_END \ + EVENT_CONSTRAINT(0, 0, 0) + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -98,8 +135,8 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_event *, int); - void (*disable)(struct hw_perf_event *, int); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -114,121 +151,28 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - int (*get_event_idx)(struct cpu_hw_events *cpuc, - struct hw_perf_event *hwc); -}; -static struct x86_pmu x86_pmu __read_mostly; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); -static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { - .enabled = 1, -}; + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + struct event_constraint *event_constraints; -static const struct event_constraint *event_constraints; - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_EVENT 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_REG_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_REG_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - -static const struct event_constraint intel_p6_event_constraints[] = -{ - EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT_END -}; - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); }; -static const struct event_constraint intel_core_event_constraints[] = -{ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ - EVENT_CONSTRAINT_END -}; +static struct x86_pmu x86_pmu __read_mostly; -static const struct event_constraint intel_nehalem_event_constraints[] = -{ - EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ - EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .enabled = 1, }; -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} +static int x86_perf_event_set_period(struct perf_event *event); /* * Generalized hw caching related hw_event table, filled @@ -245,435 +189,18 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static __initconst u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_REG_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static __initconst u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_REG_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ static u64 -x86_perf_event_update(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_update(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; s64 delta; if (idx == X86_PMC_IDX_FIXED_BTS) @@ -773,7 +300,7 @@ static inline bool bts_available(void) return x86_pmu.enable_bts != NULL; } -static inline void init_debug_store_on_cpu(int cpu) +static void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -785,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu) (u32)((u64)(unsigned long)ds >> 32)); } -static inline void fini_debug_store_on_cpu(int cpu) +static void fini_debug_store_on_cpu(int cpu) { if (!per_cpu(cpu_hw_events, cpu).ds) return; @@ -914,42 +441,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return 0; } -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - /* * Setup the hardware configuration for a given attr_type */ @@ -988,6 +479,8 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->config = ARCH_PERFMON_EVENTSEL_INT; hwc->idx = -1; + hwc->last_cpu = -1; + hwc->last_tag = ~0ULL; /* * Count user and OS events unless requested not to. @@ -1017,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); + if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && + perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; return 0; } @@ -1056,216 +552,314 @@ static int __hw_perf_event_init(struct perf_event *event) return 0; } -static void p6_pmu_disable_all(void) +static void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - if (!cpuc->enabled) - return; + int idx; - cpuc->enabled = 0; - barrier(); + for (idx = 0; idx < x86_pmu.num_events; idx++) { + u64 val; - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(x86_pmu.eventsel + idx, val); + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(x86_pmu.eventsel + idx, val); + } } -static void intel_pmu_disable_all(void) +void hw_perf_disable(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + if (!x86_pmu_initialized()) + return; + if (!cpuc->enabled) return; + cpuc->n_added = 0; cpuc->enabled = 0; barrier(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); + x86_pmu.disable_all(); } -static void amd_pmu_disable_all(void) +static void x86_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - /* - * ensure we write the disable before we start disabling the - * events proper, so that amd_pmu_enable_event() does the - * right thing. - */ - barrier(); - for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + + val = event->hw.config; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(x86_pmu.eventsel + idx, val); } } -void hw_perf_disable(void) +static const struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) { - if (!x86_pmu_initialized()) - return; - return x86_pmu.disable_all(); + return event->pmu == &pmu; } -static void p6_pmu_enable_all(void) +static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long val; + struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int i, j, w, wmax, num = 0; + struct hw_perf_event *hwc; - if (cpuc->enabled) - return; + bitmap_zero(used_mask, X86_PMC_IDX_MAX); - cpuc->enabled = 1; - barrier(); + for (i = 0; i < n; i++) { + c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + constraints[i] = c; + } - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} + /* + * fastpath, try to reuse previous register + */ + for (i = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; + c = constraints[i]; -static void intel_pmu_enable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + /* never assigned */ + if (hwc->idx == -1) + break; - if (cpuc->enabled) - return; + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; - cpuc->enabled = 1; - barrier(); + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + if (i == n) + goto done; - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + /* + * begin slow path + */ - if (WARN_ON_ONCE(!event)) - return; + bitmap_zero(used_mask, X86_PMC_IDX_MAX); - intel_pmu_enable_bts(event->hw.config); - } -} + /* + * weight = number of possible counters + * + * 1 = most constrained, only works on one counter + * wmax = least constrained, works on any counter + * + * assign events to counters starting with most + * constrained events. + */ + wmax = x86_pmu.num_events; -static void amd_pmu_enable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int idx; + /* + * when fixed event counters are present, + * wmax is incremented by 1 to account + * for one more choice + */ + if (x86_pmu.num_events_fixed) + wmax++; - if (cpuc->enabled) - return; + for (w = 1, num = n; num && w <= wmax; w++) { + /* for each event */ + for (i = 0; num && i < n; i++) { + c = constraints[i]; + hwc = &cpuc->event_list[i]->hw; - cpuc->enabled = 1; - barrier(); + if (c->weight != w) + continue; - for (idx = 0; idx < x86_pmu.num_events; idx++) { - struct perf_event *event = cpuc->events[idx]; - u64 val; + for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + if (!test_bit(j, used_mask)) + break; + } - if (!test_bit(idx, cpuc->active_mask)) - continue; + if (j == X86_PMC_IDX_MAX) + break; - val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + __set_bit(j, used_mask); + + if (assign) + assign[i] = j; + num--; + } } +done: + /* + * scheduling failed or is just a simulation, + * free resources if necessary + */ + if (!assign || num) { + for (i = 0; i < n; i++) { + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + } + } + return num ? -ENOSPC : 0; } -void hw_perf_enable(void) +/* + * dogrp: true if must collect siblings events (group) + * returns total number of events and error code + */ +static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { - if (!x86_pmu_initialized()) - return; - x86_pmu.enable_all(); -} + struct perf_event *event; + int n, max_count; -static inline u64 intel_pmu_get_status(void) -{ - u64 status; + max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + /* current number of events already accepted */ + n = cpuc->n_events; - return status; -} + if (is_x86_event(leader)) { + if (n >= max_count) + return -ENOSPC; + cpuc->event_list[n] = leader; + n++; + } + if (!dogrp) + return n; -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_x86_event(event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; -static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} + if (n >= max_count) + return -ENOSPC; -static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); + cpuc->event_list[n] = event; + n++; + } + return n; } -static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +static inline void x86_assign_hw_event(struct perf_event *event, + struct cpu_hw_events *cpuc, int i) { - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; + struct hw_perf_event *hwc = &event->hw; - mask = 0xfULL << (idx * 4); + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); + if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + hwc->config_base = 0; + hwc->event_base = 0; + } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that event_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->event_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + } else { + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; + } } -static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val = P6_NOP_EVENT; - - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; } -static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static int x86_pmu_start(struct perf_event *event); +static void x86_pmu_stop(struct perf_event *event); + +void hw_perf_enable(void) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + int i; + + if (!x86_pmu_initialized()) return; - } - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); + if (cpuc->enabled) return; + + if (cpuc->n_added) { + int n_running = cpuc->n_events - cpuc->n_added; + /* + * apply assignment obtained either from + * hw_perf_group_sched_in() or x86_pmu_enable() + * + * step1: save events moving to new counters + * step2: reprogram moved events into new counters + */ + for (i = 0; i < n_running; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) + continue; + + x86_pmu_stop(event); + } + + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (!match_prev_assignment(hwc, cpuc, i)) + x86_assign_hw_event(event, cpuc, i); + else if (i < n_running) + continue; + + x86_pmu_start(event); + } + cpuc->n_added = 0; + perf_events_lapic_init(); } - x86_pmu_disable_event(hwc, idx); + cpuc->enabled = 1; + barrier(); + + x86_pmu.enable_all(); } -static inline void -amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) { - x86_pmu_disable_event(hwc, idx); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, + hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -1275,12 +869,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); * To be called with the event disabled in hw: */ static int -x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; - int err, ret = 0; + int err, ret = 0, idx = hwc->idx; if (idx == X86_PMC_IDX_FIXED_BTS) return 0; @@ -1326,212 +920,63 @@ x86_perf_event_set_period(struct perf_event *event, return ret; } -static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - val = hwc->config; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); + __x86_pmu_enable_event(&event->hw); } - -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; - } - - x86_pmu_enable_event(hwc, idx); -} - -static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +/* + * activate a single event + * + * The event is added to the group of enabled events + * but only if it can be scehduled with existing events. + * + * Called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() + */ +static int x86_pmu_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc; + int assign[X86_PMC_IDX_MAX]; + int n, n0, ret; - if (cpuc->enabled) - x86_pmu_enable_event(hwc, idx); -} - -static int fixed_mode_idx(struct hw_perf_event *hwc) -{ - unsigned int hw_event; - - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; + hwc = &event->hw; - if (!x86_pmu.num_events_fixed) - return -1; + n0 = cpuc->n_events; + n = collect_events(cpuc, event, false); + if (n < 0) + return n; + ret = x86_schedule_events(cpuc, n, assign); + if (ret) + return ret; /* - * fixed counters do not take all possible filters + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() */ - if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) - return -1; - - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; - - return -1; -} - -/* - * generic counter allocator: get next free counter - */ -static int -gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); - return idx == x86_pmu.num_events ? -1 : idx; -} + memcpy(cpuc->assign, assign, n*sizeof(int)); -/* - * intel-specific counter allocator: check event constraints - */ -static int -intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - const struct event_constraint *event_constraint; - int i, code; + cpuc->n_events = n; + cpuc->n_added += n - n0; - if (!event_constraints) - goto skip; - - code = hwc->config & CORE_EVNTSEL_EVENT_MASK; - - for_each_event_constraint(event_constraint, event_constraints) { - if (code == event_constraint->code) { - for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { - if (!test_and_set_bit(i, cpuc->used_mask)) - return i; - } - return -1; - } - } -skip: - return gen_get_event_idx(cpuc, hwc); -} - -static int -x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = fixed_mode_idx(hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->event_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed event, if that is already taken - * then try to get a generic event: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; - - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic event again */ - if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = x86_pmu.get_event_idx(cpuc, hwc); - if (idx == -1) - return -EAGAIN; - - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; - } - - return idx; + return 0; } -/* - * Find a PMC slot for the freshly enabled / scheduled in event: - */ -static int x86_pmu_enable(struct perf_event *event) +static int x86_pmu_start(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int idx; - - idx = x86_schedule_event(cpuc, hwc); - if (idx < 0) - return idx; - - perf_events_lapic_init(); + int idx = event->hw.idx; - x86_pmu.disable(hwc, idx); + if (idx == -1) + return -EAGAIN; + x86_perf_event_set_period(event); cpuc->events[idx] = event; - set_bit(idx, cpuc->active_mask); - - x86_perf_event_set_period(event, hwc, idx); - x86_pmu.enable(hwc, idx); - + __set_bit(idx, cpuc->active_mask); + x86_pmu.enable(event); perf_event_update_userpage(event); return 0; @@ -1539,14 +984,8 @@ static int x86_pmu_enable(struct perf_event *event) static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->events[hwc->idx] != event)) - return; - - x86_pmu.enable(hwc, hwc->idx); + int ret = x86_pmu_start(event); + WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1576,7 +1015,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -1600,257 +1039,50 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) -{ - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!event) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. */ - event->hw.interrupts++; - event->pending_kill = POLL_IN; -} - -static void x86_pmu_disable(struct perf_event *event) +static void x86_pmu_stop(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); + if (!__test_and_clear_bit(idx, cpuc->active_mask)) + return; - /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the event: - */ - barrier(); + x86_pmu.disable(event); /* * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event, hwc, idx); - - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); + x86_perf_event_update(event); cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_event_update_userpage(event); -} - -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -static int intel_pmu_save_and_restart(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); - - if (event->state == PERF_EVENT_STATE_ACTIVE) - intel_pmu_enable_event(hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; - unsigned long flags; - int idx; - - if (!x86_pmu.num_events) - return; - - local_irq_save(flags); - - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); -} - -static int p6_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - data.raw = NULL; - - cpuc = &__get_cpu_var(cpu_hw_events); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - event = cpuc->events[idx]; - hwc = &event->hw; - - val = x86_perf_event_update(event, hwc, idx); - if (val & (1ULL << (x86_pmu.event_bits - 1))) - continue; - - /* - * event overflow - */ - handled = 1; - data.period = event->hw.last_period; - - if (!x86_perf_event_set_period(event, hwc, idx)) - continue; - - if (perf_event_overflow(event, 1, &data, regs)) - p6_pmu_disable_event(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; } -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) +static void x86_pmu_disable(struct perf_event *event) { - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - data.raw = NULL; - - cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } + x86_pmu_stop(event); - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } - - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; + for (i = 0; i < cpuc->n_events; i++) { + if (event == cpuc->event_list[i]) { - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); - if (!intel_pmu_save_and_restart(event)) - continue; + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; - data.period = event->hw.last_period; - - if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); + --cpuc->n_events; + break; + } } - - intel_pmu_ack_status(ack); - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; + perf_event_update_userpage(event); } -static int amd_pmu_handle_irq(struct pt_regs *regs) +static int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; @@ -1859,8 +1091,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); @@ -1871,7 +1102,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; hwc = &event->hw; - val = x86_perf_event_update(event, hwc, idx); + val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; @@ -1881,11 +1112,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) handled = 1; data.period = event->hw.last_period; - if (!x86_perf_event_set_period(event, hwc, idx)) + if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - amd_pmu_disable_event(hwc, idx); + x86_pmu_stop(event); } if (handled) @@ -1968,193 +1199,171 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { .priority = 1 }; -static __initconst struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = p6_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_events = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .event_bits = 32, - .event_mask = (1ULL << 32) - 1, - .get_event_idx = intel_get_event_idx, -}; +static struct event_constraint unconstrained; +static struct event_constraint emptyconstraint; -static __initconst struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, - .get_event_idx = intel_get_event_idx, -}; +static struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; -static __initconst struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_event, - .disable = amd_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_idx = gen_get_event_idx, -}; + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} -static __init int p6_pmu_init(void) +static int x86_event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx) { - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - event_constraints = intel_p6_event_constraints; - break; - case 9: - case 13: - /* Pentium M */ - event_constraints = intel_p6_event_constraints; - break; - default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } + int ret = 0; - x86_pmu = p6_pmu; + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = smp_processor_id(); + event->tstamp_running += event->ctx->time - event->tstamp_stopped; - return 0; + if (!is_x86_event(event)) + ret = event->pmu->enable(event); + + if (!ret && !is_software_event(event)) + cpuctx->active_oncpu++; + + if (!ret && event->attr.exclusive) + cpuctx->exclusive = 1; + + return ret; } -static __init int intel_pmu_init(void) +static void x86_event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx) { - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } - } + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; + if (!is_x86_event(event)) + event->pmu->disable(event); - version = eax.split.version_id; - if (version < 2) - return -ENODEV; + event->tstamp_running -= event->ctx->time - event->tstamp_stopped; - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_events = eax.split.num_events; - x86_pmu.event_bits = eax.split.bit_width; - x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + if (!is_software_event(event)) + cpuctx->active_oncpu--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +/* + * Called to enable a whole group of events. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + * + * called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() + */ +int hw_perf_group_sched_in(struct perf_event *leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *sub; + int assign[X86_PMC_IDX_MAX]; + int n0, n1, ret; + + /* n0 = total number of events */ + n0 = collect_events(cpuc, leader, true); + if (n0 < 0) + return n0; + + ret = x86_schedule_events(cpuc, n0, assign); + if (ret) + return ret; + + ret = x86_event_sched_in(leader, cpuctx); + if (ret) + return ret; + + n1 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state > PERF_EVENT_STATE_OFF) { + ret = x86_event_sched_in(sub, cpuctx); + if (ret) + goto undo; + ++n1; + } + } /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() */ - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + memcpy(cpuc->assign, assign, n0*sizeof(int)); + + cpuc->n_events = n0; + cpuc->n_added += n1; + ctx->nr_active += n1; /* - * Install the hw-cache-events table: + * 1 means successful and events are active + * This is not quite true because we defer + * actual activation until hw_perf_enable() but + * this way we* ensure caller won't try to enable + * individual events */ - switch (boot_cpu_data.x86_model) { - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Core2 events, "); - event_constraints = intel_core_event_constraints; - break; - default: - case 26: - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + return 1; +undo: + x86_event_sched_out(leader, cpuctx); + n0 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + x86_event_sched_out(sub, cpuctx); + if (++n0 == n1) + break; + } + } + return ret; +} + +#include "perf_event_amd.c" +#include "perf_event_p6.c" +#include "perf_event_intel.c" - event_constraints = intel_nehalem_event_constraints; - pr_cont("Nehalem/Corei7 events, "); +static int __cpuinit +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + if (x86_pmu.cpu_prepare) + ret = x86_pmu.cpu_prepare(cpu); break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - pr_cont("Atom events, "); + case CPU_STARTING: + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); break; - } - return 0; -} -static __init int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; + case CPU_DYING: + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); + break; - x86_pmu = amd_pmu; + case CPU_UP_CANCELED: + case CPU_DEAD: + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); + break; - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + default: + break; + } - return 0; + return ret; } static void __init pmu_check_apic(void) @@ -2169,6 +1378,7 @@ static void __init pmu_check_apic(void) void __init init_hw_perf_events(void) { + struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -2213,6 +1423,20 @@ void __init init_hw_perf_events(void) perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); + unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, + 0, x86_pmu.num_events); + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != INTEL_ARCH_FIXED_MASK) + continue; + + c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; + c->weight += x86_pmu.num_events; + } + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); @@ -2220,60 +1444,91 @@ void __init init_hw_perf_events(void) pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); pr_info("... event mask: %016Lx\n", perf_event_mask); + + perf_cpu_notifier(x86_pmu_notifier); } static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event, &event->hw, event->hw.idx); + x86_perf_event_update(event); } static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, + .start = x86_pmu_start, + .stop = x86_pmu_stop, .read = x86_pmu_read, .unthrottle = x86_pmu_unthrottle, }; -static int -validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct hw_perf_event fake_event = event->hw; - - if (event->pmu && event->pmu != &pmu) - return 0; - - return x86_schedule_event(cpuc, &fake_event) >= 0; -} - +/* + * validate a single event group + * + * validation include: + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters + * + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ static int validate_group(struct perf_event *event) { - struct perf_event *sibling, *leader = event->group_leader; - struct cpu_hw_events fake_pmu; + struct perf_event *leader = event->group_leader; + struct cpu_hw_events *fake_cpuc; + int ret, n; - memset(&fake_pmu, 0, sizeof(fake_pmu)); + ret = -ENOMEM; + fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); + if (!fake_cpuc) + goto out; - if (!validate_event(&fake_pmu, leader)) - return -ENOSPC; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + ret = -ENOSPC; + n = collect_events(fake_cpuc, leader, true); + if (n < 0) + goto out_free; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { - if (!validate_event(&fake_pmu, sibling)) - return -ENOSPC; - } + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); + if (n < 0) + goto out_free; - if (!validate_event(&fake_pmu, event)) - return -ENOSPC; + fake_cpuc->n_events = n; - return 0; + ret = x86_schedule_events(fake_cpuc, n, NULL); + +out_free: + kfree(fake_cpuc); +out: + return ret; } const struct pmu *hw_perf_event_init(struct perf_event *event) { + const struct pmu *tmp; int err; err = __hw_perf_event_init(event); if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + if (event->group_leader != event) err = validate_group(event); + + event->pmu = tmp; } if (err) { if (event->destroy) @@ -2297,7 +1552,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_ignored_frame); static void @@ -2313,10 +1567,6 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_ignored_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name) || - x86_is_stack_id(DEBUG_STACK, name); - return 0; } @@ -2324,9 +1574,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_ignored_frame, smp_processor_id())) - return; - if (reliable) callchain_store(entry, addr); } @@ -2347,7 +1594,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } /* @@ -2385,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +#ifdef CONFIG_COMPAT +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bytes; + /* 32-bit process in 64-bit kernel. */ + struct stack_frame_ia32 frame; + const void __user *fp; + + if (!test_thread_flag(TIF_IA32)) + return 0; + + fp = compat_ptr(regs->bp); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) + break; + + if (fp < compat_ptr(regs->sp)) + break; - return bytes == sizeof(*frame); + callchain_store(entry, frame.return_address); + fp = compat_ptr(frame.next_frame); + } + return 1; +} +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; } +#endif static void perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) @@ -2408,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); + if (perf_callchain_user32(regs, entry)) + return; + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) break; if ((unsigned long)fp < regs->sp) @@ -2433,9 +1713,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) is_user = user_mode(regs); - if (!current || current->pid == 0) - return; - if (is_user && current->state != TASK_RUNNING) return; @@ -2462,7 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void hw_perf_event_setup_online(int cpu) +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { - init_debug_store_on_cpu(cpu); + regs->ip = ip; + /* + * perf_arch_fetch_caller_regs adds another call, we need to increment + * the skip level + */ + regs->bp = rewind_frame_pointer(skip + 1); + regs->cs = __KERNEL_CS; + local_save_flags(regs->flags); } diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c new file mode 100644 index 00000000000..db6f7d4056e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -0,0 +1,422 @@ +#ifdef CONFIG_CPU_SUP_AMD + +static DEFINE_RAW_SPINLOCK(amd_nb_lock); + +static __initconst u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +static u64 amd_pmu_raw_event(u64 hw_event) +{ +#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_REG_MASK) + + return hw_event & K7_EVNTSEL_MASK; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * only care about NB events + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) + return; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_events; i++) { + if (nb->owners[i] == event) { + cmpxchg(nb->owners+i, event, NULL); + break; + } + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling amd_put_event_constraints(). + * + * Non NB events are not impacted by this restriction. + */ +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old = NULL; + int max = x86_pmu.num_events; + int i, j, k = -1; + + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) + return &unconstrained; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for (i = 0; i < max; i++) { + /* + * keep track of first free slot + */ + if (k == -1 && !nb->owners[i]) + k = i; + + /* already present, reuse */ + if (nb->owners[i] == event) + goto done; + } + /* + * not present, so grab a new slot + * starting either at: + */ + if (hwc->idx != -1) { + /* previous assignment */ + i = hwc->idx; + } else if (k != -1) { + /* start from free slot found */ + i = k; + } else { + /* + * event not found, no slot found in + * first pass, try again from the + * beginning + */ + i = 0; + } + j = i; + do { + old = cmpxchg(nb->owners+i, NULL, event); + if (!old) + break; + if (++i == max) + i = 0; + } while (i != j); +done: + if (!old) + return &nb->event_constraints[i]; + + return &emptyconstraint; +} + +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +{ + struct amd_nb *nb; + int i; + + nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + if (!nb) + return NULL; + + memset(nb, 0, sizeof(*nb)); + nb->nb_id = nb_id; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_events; i++) { + __set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu, -1); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct amd_nb *nb; + int i, nb_id; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + nb_id = amd_get_nb_id(cpu); + WARN_ON_ONCE(nb_id == BAD_APICID); + + raw_spin_lock(&amd_nb_lock); + + for_each_online_cpu(i) { + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) + continue; + + if (nb->nb_id == nb_id) { + kfree(cpuc->amd_nb); + cpuc->amd_nb = nb; + break; + } + } + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; + + raw_spin_unlock(&amd_nb_lock); +} + +static void amd_pmu_cpu_dead(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock(&amd_nb_lock); + + if (cpuhw->amd_nb) { + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); + + cpuhw->amd_nb = NULL; + } + + raw_spin_unlock(&amd_nb_lock); +} + +static __initconst struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, +}; + +static __init int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +#else /* CONFIG_CPU_SUP_AMD */ + +static int amd_pmu_init(void) +{ + return 0; +} + +#endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c new file mode 100644 index 00000000000..9c794ac8783 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -0,0 +1,980 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Intel PerfMon, used on Core and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static struct event_constraint intel_core_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* + * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event + * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed + * ratio between these counters. + */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +static __initconst u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 hw_event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (INTEL_ARCH_EVTSEL_MASK | \ + INTEL_ARCH_UNIT_MASK | \ + INTEL_ARCH_EDGE_MASK | \ + INTEL_ARCH_INV_MASK | \ + INTEL_ARCH_CNT_MASK) + + return hw_event & CORE_EVNTSEL_MASK; +} + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); +} + +static void intel_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + perf_sample_data_init(&data, 0); + + data.period = event->hw.last_period; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static inline void +intel_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + intel_pmu_drain_bts_buffer(); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc); + return; + } + + __x86_pmu_enable_event(hwc); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +static int intel_pmu_save_and_restart(struct perf_event *event) +{ + x86_perf_event_update(event); + return x86_perf_event_set_period(event); +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + unsigned long flags; + int idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 ack, status; + + perf_sample_data_init(&data, 0); + + cpuc = &__get_cpu_var(cpu_hw_events); + + intel_pmu_disable_all(); + intel_pmu_drain_bts_buffer(); + status = intel_pmu_get_status(); + if (!status) { + intel_pmu_enable_all(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + data.period = event->hw.last_period; + + if (perf_event_overflow(event, 1, &data, regs)) + x86_pmu_stop(event); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + +done: + intel_pmu_enable_all(); + return 1; +} + +static struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + +static struct event_constraint * +intel_special_constraints(struct perf_event *event) +{ + unsigned int hw_event; + + hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (event->hw.sample_period == 1))) { + + return &bts_constraint; + } + return NULL; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_special_constraints(event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, event); +} + +static __initconst struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_core_event_constraints, +}; + +static __initconst struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, + .get_event_constraints = intel_get_event_constraints, + + .cpu_starting = init_debug_store_on_cpu, + .cpu_dying = fini_debug_store_on_cpu, +}; + +static __init int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_core2_event_constraints; + pr_cont("Core2 events, "); + break; + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ + case 46: /* 45 nm nehalem-ex, "Beckton" */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: /* Atom */ + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("Atom events, "); + break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + pr_cont("Westmere events, "); + break; + + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + } + return 0; +} + +#else /* CONFIG_CPU_SUP_INTEL */ + +static int intel_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c new file mode 100644 index 00000000000..a330485d14d --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -0,0 +1,159 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 hw_event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_REG_MASK) + + return hw_event & P6_EVNTSEL_MASK; +} + +static struct event_constraint p6_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; + +static void p6_pmu_disable_all(void) +{ + u64 val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void p6_pmu_enable_all(void) +{ + unsigned long val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static inline void +p6_pmu_disable_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + u64 val = P6_NOP_EVENT; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); +} + +static void p6_pmu_enable_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); +} + +static __initconst struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = x86_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_events = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, +}; + +static __init int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 898df9719af..fb329e9f849 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) return !test_bit(counter, perfctr_nmi_owner); } - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); int reserve_perfctr_nmi(unsigned int msr) @@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) cpu_nmi_set_wd_enabled(); apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 1cbed97b59c..dfdb4dba232 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -22,6 +22,7 @@ */ #include <linux/dmi.h> +#include <linux/module.h> #include <asm/div64.h> #include <asm/vmware.h> #include <asm/x86_init.h> @@ -101,6 +102,7 @@ int vmware_platform(void) return 0; } +EXPORT_SYMBOL(vmware_platform); /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. |