diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
30 files changed, 1891 insertions, 958 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6042981d030..25f24dccdcf 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -15,6 +15,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o +obj-y += rdrand.o obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o @@ -28,10 +29,15 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +ifdef CONFIG_PERF_EVENTS +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +endif + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b13ed393dfc..f4773f4aae3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,5 +1,7 @@ +#include <linux/export.h> #include <linux/init.h> #include <linux/bitops.h> +#include <linux/elf.h> #include <linux/mm.h> #include <linux/io.h> @@ -146,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -190,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) valid_k7: ; -#endif } static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) @@ -351,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) if (node == NUMA_NO_NODE) node = per_cpu(cpu_llc_id, cpu); + /* + * If core numbers are inconsistent, it's likely a multi-fabric platform, + * so invoke platform-specific handler + */ + if (c->phys_proc_id != node) + x86_cpuinit.fixup_cpu_id(c, node); + if (!node_online(node)) { /* * Two possibilities here: @@ -410,6 +417,34 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } +static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } + + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + } +} + static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -441,27 +476,12 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); } #endif - - /* We need to do the following only once */ - if (c != &boot_cpu_data) - return; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { - - if (c->x86 > 0x10 || - (c->x86 == 0x10 && c->x86_model >= 0x2)) { - u64 val; - - rdmsrl(MSR_K7_HWCR, val); - if (!(val & BIT(24))) - printk(KERN_WARNING FW_BUG "TSC doesn't count " - "with P0 frequency!\n"); - } - } } static void __cpuinit init_amd(struct cpuinfo_x86 *c) { + u32 dummy; + #ifdef CONFIG_SMP unsigned long long value; @@ -640,6 +660,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); } } + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); } #ifdef CONFIG_X86_32 @@ -679,6 +701,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_size_cache = amd_size_cache, #endif .c_early_init = early_init_amd, + .c_bsp_init = bsp_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, }; diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e58d978e075..159103c0b1f 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ - if (c->x86_model >= 6 && c->x86_model <= 9) { + if (c->x86_model >= 6 && c->x86_model <= 13) { rdmsr(MSR_VIA_FCR, lo, hi); lo |= (1<<1 | 1<<7); wrmsr(MSR_VIA_FCR, lo, hi); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 62184390a60..d43cad74f16 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -15,6 +15,7 @@ #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> +#include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/sections.h> @@ -675,12 +676,13 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_early_init) this_cpu->c_early_init(c); -#ifdef CONFIG_SMP c->cpu_index = 0; -#endif filter_cpuid_features(c, false); setup_smep(c); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); } void __init early_cpu_init(void) @@ -760,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->apicid = c->initial_apicid; # endif #endif - -#ifdef CONFIG_X86_HT c->phys_proc_id = c->initial_apicid; -#endif } setup_smep(c); @@ -857,6 +856,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + x86_init_rdrand(c); /* * Clear/Set all flags overriden by options, need do it @@ -1021,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); @@ -1085,6 +1087,26 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; @@ -1136,6 +1158,15 @@ static void dbg_restore_debug_regs(void) #endif /* ! CONFIG_KGDB */ /* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} + +/* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a @@ -1203,6 +1234,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index e765633f210..8bacc7826fb 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,5 +1,4 @@ #ifndef ARCH_X86_CPU_H - #define ARCH_X86_CPU_H struct cpu_model_info { @@ -18,6 +17,7 @@ struct cpu_dev { struct cpu_model_info c_models[4]; void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_bsp_init)(struct cpuinfo_x86 *); void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); @@ -34,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); -extern void get_cpu_cap(struct cpuinfo_x86 *c); - -#endif +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ed6086eedf1..3e6ff6cbf42 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -47,6 +47,15 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { + unsigned lower_word; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* Required by the SDM */ + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + /* * Atom erratum AAE44/AAF40/AAG38/AAH41: * @@ -55,17 +64,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) { - u32 ucode, junk; - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - sync_core(); - rdmsr(MSR_IA32_UCODE_REV, junk, ucode); - - if (ucode < 0x20e) { - printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); - clear_cpu_cap(c, X86_FEATURE_PSE); - } + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && + c->microcode < 0x20e) { + printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); } #ifdef CONFIG_X86_64 @@ -179,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void) static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -196,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: SMP operation may be unreliable" "with B stepping processors.\n"); } -#endif } static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c105c533ed9..6b45e5e7a90 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -151,28 +151,17 @@ union _cpuid4_leaf_ecx { u32 full; }; -struct amd_l3_cache { - struct amd_northbridge *nb; - unsigned indices; - u8 subcaches[4]; -}; - -struct _cpuid4_info { +struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - struct amd_l3_cache *l3; - DECLARE_BITMAP(shared_cpu_map, NR_CPUS); + struct amd_northbridge *nb; }; -/* subset of above _cpuid4_info w/o shared_cpu_map */ -struct _cpuid4_info_regs { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned long size; - struct amd_l3_cache *l3; +struct _cpuid4_info { + struct _cpuid4_info_regs base; + DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; unsigned short num_cache_leaves; @@ -314,16 +303,23 @@ struct _cache_attr { /* * L3 cache descriptors */ -static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) +static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) { + struct amd_l3_cache *l3 = &nb->l3_cache; unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->nb->misc, 0x1C4, &val); + pci_read_config_dword(nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); @@ -333,33 +329,16 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { - static struct amd_l3_cache *__cpuinitdata l3_caches; int node; /* only for L3, and not in virtualized environments */ - if (index < 3 || amd_nb_num() == 0) + if (index < 3) return; - /* - * Strictly speaking, the amount in @size below is leaked since it is - * never freed but this is done only on shutdown so it doesn't matter. - */ - if (!l3_caches) { - int size = amd_nb_num() * sizeof(struct amd_l3_cache); - - l3_caches = kzalloc(size, GFP_ATOMIC); - if (!l3_caches) - return; - } - node = amd_get_nb_id(smp_processor_id()); - - if (!l3_caches[node].nb) { - l3_caches[node].nb = node_to_amd_nb(node); - amd_calc_l3_indices(&l3_caches[node]); - } - - this_leaf->l3 = &l3_caches[node]; + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); } /* @@ -369,11 +348,11 @@ static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, * * @returns: the disabled index if used or negative value if slot free. */ -int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -387,11 +366,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || - !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; - index = amd_get_l3_disable_slot(this_leaf->l3, slot); + index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); if (index >= 0) return sprintf(buf, "%d\n", index); @@ -408,7 +386,7 @@ show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ SHOW_CACHE_DISABLE(0) SHOW_CACHE_DISABLE(1) -static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long idx) { int i; @@ -421,10 +399,10 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, for (i = 0; i < 4; i++) { u32 reg = idx | (i << 20); - if (!l3->subcaches[i]) + if (!nb->l3_cache.subcaches[i]) continue; - pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -434,7 +412,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); } } @@ -448,24 +426,24 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, +int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long index) { int ret = 0; /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(l3, slot); + ret = amd_get_l3_disable_slot(nb, slot); if (ret >= 0) return -EINVAL; - if (index > l3->indices) + if (index > nb->l3_cache.indices) return -EINVAL; /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index == amd_get_l3_disable_slot(nb, !slot)) return -EINVAL; - amd_l3_disable_index(l3, cpu, slot, index); + amd_l3_disable_index(nb, cpu, slot, index); return 0; } @@ -480,8 +458,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || - !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -489,7 +466,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); if (err) { if (err == -EEXIST) printk(KERN_WARNING "L3 disable slot %d in use!\n", @@ -518,7 +495,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, static ssize_t show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); @@ -533,7 +510,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; if (strict_strtoul(buf, 16, &val) < 0) @@ -769,7 +746,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) return; } this_leaf = CPUID4_INFO_IDX(cpu, index); - num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; + num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; if (num_threads_sharing == 1) cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); @@ -820,29 +797,19 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); - kfree(per_cpu(ici_cpuid4_info, cpu)->l3); kfree(per_cpu(ici_cpuid4_info, cpu)); per_cpu(ici_cpuid4_info, cpu) = NULL; } -static int -__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) -{ - struct _cpuid4_info_regs *leaf_regs = - (struct _cpuid4_info_regs *)this_leaf; - - return cpuid4_cache_lookup_regs(index, leaf_regs); -} - static void __cpuinit get_cpu_leaves(void *_retval) { int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { - struct _cpuid4_info *this_leaf; - this_leaf = CPUID4_INFO_IDX(cpu, j); - *retval = cpuid4_cache_lookup(j, this_leaf); + struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); + + *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); if (unlikely(*retval < 0)) { int i; @@ -877,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) #include <linux/kobject.h> #include <linux/sysfs.h> - -extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ +#include <linux/cpu.h> /* pointer to kobject for cpuX/cache */ static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); @@ -900,16 +866,16 @@ static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } -show_one_plus(level, eax.split.level, 0); -show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1); -show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); -show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); -show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); +show_one_plus(level, base.eax.split.level, 0); +show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); +show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); +show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); +show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - return sprintf(buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -946,7 +912,7 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - switch (this_leaf->eax.split.type) { + switch (this_leaf->base.eax.split.type) { case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); case CACHE_TYPE_INST: @@ -1106,9 +1072,9 @@ err_out: static DECLARE_BITMAP(cache_dev_map, NR_CPUS); /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +static int __cpuinit cache_add_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i, j; struct _index_kobject *this_object; struct _cpuid4_info *this_leaf; @@ -1120,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, - &sys_dev->kobj, "%s", "cache"); + &dev->kobj, "%s", "cache"); if (retval < 0) { cpuid4_cache_sysfs_exit(cpu); return retval; @@ -1135,7 +1101,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) ktype_cache.default_attrs = default_attrs; #ifdef CONFIG_AMD_NB - if (this_leaf->l3) + if (this_leaf->base.nb) ktype_cache.default_attrs = amd_l3_attrs(); #endif retval = kobject_init_and_add(&(this_object->kobj), @@ -1157,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return 0; } -static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i; if (per_cpu(ici_cpuid4_info, cpu) == NULL) @@ -1178,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cache_add_dev(sys_dev); + cache_add_dev(dev); break; case CPU_DEAD: case CPU_DEAD_FROZEN: - cache_remove_dev(sys_dev); + cache_remove_dev(dev); break; } return NOTIFY_OK; @@ -1207,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void) for_each_online_cpu(i) { int err; - struct sys_device *sys_dev = get_cpu_sysdev(i); + struct device *dev = get_cpu_device(i); - err = cache_add_dev(sys_dev); + err = cache_add_dev(dev); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 83930deec3c..507ea58688e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -28,6 +28,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/export.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/cper.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 0ed633c5048..fc4beb39357 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/fs.h> +#include <linux/preempt.h> #include <linux/smp.h> #include <linux/notifier.h> #include <linux/kdebug.h> @@ -78,26 +79,31 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) static cpumask_var_t mce_inject_cpumask; -static int mce_raise_notify(struct notifier_block *self, - unsigned long val, void *data) +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) - return NOTIFY_DONE; + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) - raise_exception(m, args->regs); + raise_exception(m, regs); else if (m->status) raise_poll(m); - return NOTIFY_STOP; + return NMI_HANDLED; } -static struct notifier_block mce_raise_nb = { - .notifier_call = mce_raise_notify, - .priority = NMI_LOCAL_NORMAL_PRIOR, -}; +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} /* Inject mce on current CPU */ static int raise_local(void) @@ -146,9 +152,10 @@ static void raise_mce(struct mce *m) return; #ifdef CONFIG_X86_LOCAL_APIC - if (m->inject_flags & MCJ_NMI_BROADCAST) { + if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; + get_online_cpus(); cpumask_copy(mce_inject_cpumask, cpu_online_mask); cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -158,13 +165,25 @@ static void raise_mce(struct mce *m) MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpumask_empty(mce_inject_cpumask)) - apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BRAODCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR - "Timeout waiting for mce inject NMI %lx\n", + "Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } @@ -215,8 +234,9 @@ static int inject_init(void) if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); - mce_chrdev_ops.write = mce_write; - register_die_notifier(&mce_raise_nb); + register_mce_write_callback(mce_write); + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, + "mce_notify"); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69ee8b..ed44c8a6585 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include <linux/sysdev.h> +#include <linux/device.h> #include <asm/mce.h> enum severity_level { @@ -17,7 +17,7 @@ enum severity_level { struct mce_bank { u64 ctl; /* subevents to enable */ unsigned char init; /* initialise bank? */ - struct sysdev_attribute attr; /* sysdev attribute */ + struct device_attribute attr; /* device attribute */ char attrname[ATTR_LEN]; /* attribute name */ }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 08363b04212..29ba3297e48 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -19,7 +19,7 @@ #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/string.h> -#include <linux/sysdev.h> +#include <linux/device.h> #include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/ctype.h> @@ -36,8 +36,8 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/debugfs.h> -#include <linux/edac_mce.h> #include <linux/irq_work.h> +#include <linux/export.h> #include <asm/processor.h> #include <asm/mce.h> @@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -/* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. - */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); -EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { static DEFINE_PER_CPU(struct work_struct, mce_work); +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -119,9 +118,7 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -144,23 +141,20 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (ret == NOTIFY_STOP) + return; + mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference_check_mce(mcelog.next); for (;;) { - /* - * If edac_mce is enabled, it will check the error type - * and will process it, if it is a known error. - * Otherwise, the error will be sent through mcelog - * interface - */ - if (edac_mce_parse(mce)) - return; /* * When the buffer fills up discard new entries. @@ -193,6 +187,57 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +static void drain_mcelog_buffer(void) +{ + unsigned int next, i, prev = 0; + + next = rcu_dereference_check_mce(mcelog.next); + + do { + struct mce *m; + + /* drain what was logged during boot */ + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + unsigned retries = 1; + + m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2*retries)) + retries++; + + cpu_relax(); + + if (!m->finished && retries >= 4) { + pr_err("MCE: skipping error being logged currently!\n"); + break; + } + } + smp_rmb(); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + } + + memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + static void print_mce(struct mce *m) { int ret = 0; @@ -217,8 +262,13 @@ static void print_mce(struct mce *m) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + cpu_data(m->extcpu).microcode); /* * Print out human-readable details about the MCE error, @@ -551,10 +601,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { + if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) mce_log(&m); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - } /* * Clear state for this bank. @@ -908,9 +956,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) percpu_inc(mce_exception_count); - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out; if (!banks) goto out; @@ -1140,6 +1185,15 @@ static void mce_start_timer(unsigned long data) add_timer_on(t, smp_processor_id()); } +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); +} + static void mce_do_trigger(struct work_struct *work) { call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); @@ -1628,16 +1682,35 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, } } -/* Modified in mce-inject.c, so not static or const */ -struct file_operations mce_chrdev_ops = { +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { .open = mce_chrdev_open, .release = mce_chrdev_release, .read = mce_chrdev_read, + .write = mce_chrdev_write, .poll = mce_chrdev_poll, .unlocked_ioctl = mce_chrdev_ioctl, .llseek = no_llseek, }; -EXPORT_SYMBOL_GPL(mce_chrdev_ops); static struct miscdevice mce_chrdev_device = { MISC_MCELOG_MINOR, @@ -1745,12 +1818,11 @@ static struct syscore_ops mce_syscore_ops = { }; /* - * mce_sysdev: Sysfs support + * mce_device: Sysfs support */ static void mce_cpu_restart(void *data) { - del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(__this_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); @@ -1760,16 +1832,15 @@ static void mce_cpu_restart(void *data) /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { + mce_timer_delete_all(); on_each_cpu(mce_cpu_restart, NULL, 1); } /* Toggle features for corrected errors */ -static void mce_disable_ce(void *all) +static void mce_disable_cmci(void *data) { if (!mce_available(__this_cpu_ptr(&cpu_info))) return; - if (all) - del_timer_sync(&__get_cpu_var(mce_timer)); cmci_clear(); } @@ -1783,27 +1854,28 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct sysdev_class mce_sysdev_class = { +static struct bus_type mce_subsys = { .name = "machinecheck", + .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, mce_sysdev); +DEFINE_PER_CPU(struct device, mce_device); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1818,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf) { strcpy(buf, mce_helper); strcat(buf, "\n"); return strlen(mce_helper) + 1; } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, const char *buf, size_t siz) { char *p; @@ -1840,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return strlen(mce_helper) + !!p; } -static ssize_t set_ignore_ce(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1852,7 +1924,8 @@ static ssize_t set_ignore_ce(struct sys_device *s, if (mce_ignore_ce ^ !!new) { if (new) { /* disable ce features */ - on_each_cpu(mce_disable_ce, (void *)1, 1); + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); mce_ignore_ce = 1; } else { /* enable ce features */ @@ -1863,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s, return size; } -static ssize_t set_cmci_disabled(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1875,7 +1948,7 @@ static ssize_t set_cmci_disabled(struct sys_device *s, if (mce_cmci_disabled ^ !!new) { if (new) { /* disable cmci */ - on_each_cpu(mce_disable_ce, NULL, 1); + on_each_cpu(mce_disable_cmci, NULL, 1); mce_cmci_disabled = 1; } else { /* enable cmci */ @@ -1886,108 +1959,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s, return size; } -static ssize_t store_int_with_restart(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = sysdev_store_int(s, attr, buf, size); + ssize_t ret = device_store_int(s, attr, buf, size); mce_restart(); return ret; } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { - _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, - store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), &check_interval }; -static struct sysdev_ext_attribute attr_ignore_ce = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), &mce_ignore_ce }; -static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), &mce_cmci_disabled }; -static struct sysdev_attribute *mce_sysdev_attrs[] = { - &attr_tolerant.attr, - &attr_check_interval.attr, - &attr_trigger, - &attr_monarch_timeout.attr, - &attr_dont_log_ce.attr, - &attr_ignore_ce.attr, - &attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, + &dev_attr_trigger, + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, NULL }; -static cpumask_var_t mce_sysdev_initialized; +static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_sysdev_create(unsigned int cpu) +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_device_create(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = &per_cpu(mce_device, cpu); int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&sysdev->kobj, 0, sizeof(struct kobject)); - sysdev->id = cpu; - sysdev->cls = &mce_sysdev_class; + memset(dev, 0, sizeof(struct device)); + dev->id = cpu; + dev->bus = &mce_subsys; - err = sysdev_register(sysdev); + err = device_register(dev); if (err) return err; - for (i = 0; mce_sysdev_attrs[i]; i++) { - err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); if (err) goto error; } for (j = 0; j < banks; j++) { - err = sysdev_create_file(sysdev, &mce_banks[j].attr); + err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_sysdev_initialized); + cpumask_set_cpu(cpu, mce_device_initialized); return 0; error2: while (--j >= 0) - sysdev_remove_file(sysdev, &mce_banks[j].attr); + device_remove_file(dev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + device_remove_file(dev, mce_device_attrs[i]); - sysdev_unregister(sysdev); + device_unregister(dev); return err; } -static __cpuinit void mce_sysdev_remove(unsigned int cpu) +static __cpuinit void mce_device_remove(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = &per_cpu(mce_device, cpu); int i; - if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) + if (!cpumask_test_cpu(cpu, mce_device_initialized)) return; - for (i = 0; mce_sysdev_attrs[i]; i++) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(sysdev, &mce_banks[i].attr); + device_remove_file(dev, &mce_banks[i].attr); - sysdev_unregister(sysdev); - cpumask_clear_cpu(cpu, mce_sysdev_initialized); + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ @@ -2037,7 +2109,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - mce_sysdev_create(cpu); + mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; @@ -2045,7 +2117,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); - mce_sysdev_remove(cpu); + mce_device_remove(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -2079,7 +2151,7 @@ static __init void mce_init_banks(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; - struct sysdev_attribute *a = &b->attr; + struct device_attribute *a = &b->attr; sysfs_attr_init(&a->attr); a->attr.name = b->attrname; @@ -2099,16 +2171,16 @@ static __init int mcheck_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); mce_init_banks(); - err = sysdev_class_register(&mce_sysdev_class); + err = subsys_system_register(&mce_subsys, NULL); if (err) return err; for_each_online_cpu(i) { - err = mce_sysdev_create(i); + err = mce_device_create(i); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f5474218cff..ba0b94a7e20 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -17,7 +17,6 @@ #include <linux/notifier.h> #include <linux/kobject.h> #include <linux/percpu.h> -#include <linux/sysdev.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sysfs.h> @@ -64,11 +63,9 @@ struct threshold_bank { }; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); -#ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 }; -#endif static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -202,10 +199,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP if (shared_bank[bank] && c->cpu_core_id) break; -#endif + offset = setup_APIC_mce(offset, (high & MASK_LVTOFF_HI) >> 20); @@ -531,7 +527,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); -#ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -548,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj, b->kobj, name); if (err) goto out; @@ -558,7 +553,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { @@ -571,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj); if (!b->kobj) goto out_free; @@ -591,7 +585,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, + err = sysfs_create_link(&per_cpu(mce_device, i).kobj, b->kobj, name); if (err) goto out; @@ -669,7 +663,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -681,7 +675,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_device, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 8694ef56459..38e49bc95ff 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -28,7 +28,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_SPINLOCK(cmci_discover_lock); +static DEFINE_RAW_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 @@ -85,7 +85,7 @@ static void cmci_discover(int banks, int boot) int hdr = 0; int i; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; @@ -116,7 +116,7 @@ static void cmci_discover(int banks, int boot) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); if (hdr) printk(KERN_CONT "\n"); } @@ -150,7 +150,7 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; @@ -160,7 +160,7 @@ void cmci_clear(void) wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } /* diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 27c625178bf..67bb17a37a0 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -18,7 +18,7 @@ #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/percpu.h> -#include <linux/sysdev.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/init.h> #include <linux/smp.h> @@ -68,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0); static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, \ - therm_throt_sysdev_show_##_name, \ +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(event, name) \ +#define define_therm_throt_device_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##event##_##name( \ - struct sys_device *dev, \ - struct sysdev_attribute *attr, \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ @@ -94,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core_throttle, count); -define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_core_throttle_count.attr, + &dev_attr_core_throttle_count.attr, NULL }; @@ -222,36 +222,36 @@ static int thresh_event_valid(int event) #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, +static __cpuinit int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); - err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); if (err) return err; if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_core_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_throttle_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); } return err; } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static __cpuinit void thermal_throttle_remove_dev(struct device *dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); + sysfs_remove_group(&dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -264,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; int err = 0; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev, cpu); + err = thermal_throttle_add_dev(dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -282,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: mutex_lock(&therm_cpu_lock); - thermal_throttle_remove_dev(sys_dev); + thermal_throttle_remove_dev(dev); mutex_unlock(&therm_cpu_lock); break; } @@ -309,7 +309,7 @@ static __init int thermal_throttle_init_device(void) #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); + err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU @@ -322,17 +322,6 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ -/* - * Set up the most two significant bit to notify mce log that this thermal - * event type. - * This is a temp solution. May be changed in the future with mce log - * infrasture. - */ -#define CORE_THROTTLED (0) -#define CORE_POWER_LIMIT ((__u64)1 << 62) -#define PACKAGE_THROTTLED ((__u64)2 << 62) -#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) - static void notify_thresholds(__u64 msr_val) { /* check whether the interrupt handler is defined; @@ -362,27 +351,23 @@ static void intel_thermal_interrupt(void) if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + mce_log_therm_throt_event(msr_val); if (this_cpu_has(X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + PACKAGE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_POWER_LIMIT - | msr_val); + PACKAGE_LEVEL); } } @@ -396,8 +381,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); irq_exit(); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index d746df2909c..aa578cadb94 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt; asmlinkage void smp_threshold_interrupt(void) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d944bf6c50e..0a630dd4b62 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -11,6 +11,8 @@ */ #include <linux/types.h> +#include <linux/time.h> +#include <linux/clocksource.h> #include <linux/module.h> #include <asm/processor.h> #include <asm/hypervisor.h> @@ -36,6 +38,25 @@ static bool __init ms_hyperv_platform(void) !memcmp("Microsoft Hv", hyp_signature, 12); } +static cycle_t read_hv_clock(struct clocksource *arg) +{ + cycle_t current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. + */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs = { + .name = "hyperv_clocksource", + .rating = 400, /* use this when running on Hyperv*/ + .read = read_hv_clock, + .mask = CLOCKSOURCE_MASK(64), +}; + static void __init ms_hyperv_init_platform(void) { /* @@ -46,6 +67,8 @@ static void __init ms_hyperv_init_platform(void) printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + + clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index a71efcdbb09..97b26356e9e 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -547,6 +547,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, if (tmp != mask_lo) { printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND); mask_lo = tmp; } } @@ -693,6 +694,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); + wbinvd(); } static void post_set(void) __releases(set_atomicity_lock) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 08119a37e53..6b96110bb0c 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -149,7 +149,6 @@ struct set_mtrr_data { */ static int mtrr_rendezvous_handler(void *info) { -#ifdef CONFIG_SMP struct set_mtrr_data *data = info; /* @@ -171,7 +170,6 @@ static int mtrr_rendezvous_handler(void *info) } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } -#endif return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4ee3abf20ed..5adce1040b1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,6 +32,8 @@ #include <asm/smp.h> #include <asm/alternative.h> +#include "perf_event.h" + #if 0 #undef wrmsrl #define wrmsrl(msr, val) \ @@ -43,283 +45,17 @@ do { \ } while (0) #endif -/* - * | NHM/WSM | SNB | - * register ------------------------------- - * | HT | no HT | HT | no HT | - *----------------------------------------- - * offcore | core | core | cpu | core | - * lbr_sel | core | core | cpu | core | - * ld_lat | cpu | core | cpu | core | - *----------------------------------------- - * - * Given that there is a small number of shared regs, - * we can pre-allocate their slot in the per-cpu - * per-core reg tables. - */ -enum extra_reg_type { - EXTRA_REG_NONE = -1, /* not used */ - - EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ - EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ - - EXTRA_REG_MAX /* number of entries needed */ -}; - -struct event_constraint { - union { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64; - }; - u64 code; - u64 cmask; - int weight; -}; - -struct amd_nb { - int nb_id; /* NorthBridge id */ - int refcnt; /* reference count */ - struct perf_event *owners[X86_PMC_IDX_MAX]; - struct event_constraint event_constraints[X86_PMC_IDX_MAX]; -}; - -struct intel_percore; - -#define MAX_LBR_ENTRIES 16 - -struct cpu_hw_events { - /* - * Generic x86 PMC bits - */ - struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int enabled; - - int n_events; - int n_added; - int n_txn; - int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ - u64 tags[X86_PMC_IDX_MAX]; - struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ - - unsigned int group_flag; - - /* - * Intel DebugStore bits - */ - struct debug_store *ds; - u64 pebs_enabled; - - /* - * Intel LBR bits - */ - int lbr_users; - void *lbr_context; - struct perf_branch_stack lbr_stack; - struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - - /* - * manage shared (per-core, per-cpu) registers - * used on Intel NHM/WSM/SNB - */ - struct intel_shared_regs *shared_regs; - - /* - * AMD specific bits - */ - struct amd_nb *amd_nb; -}; - -#define __EVENT_CONSTRAINT(c, n, m, w) {\ - { .idxmsk64 = (n) }, \ - .code = (c), \ - .cmask = (m), \ - .weight = (w), \ -} - -#define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) - -/* - * Constraint on the Event code. - */ -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) - -/* - * Constraint on the Event code + UMask + fixed-mask - * - * filter mask to validate fixed counter events. - * the following filters disqualify for fixed counters: - * - inv - * - edge - * - cnt-mask - * The other filters are supported by fixed counters. - * The any-thread option is supported starting with v3. - */ -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK) - -/* - * Constraint on the Event code + UMask - */ -#define INTEL_UEVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) - -#define EVENT_CONSTRAINT_END \ - EVENT_CONSTRAINT(0, 0, 0) - -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->weight; (e)++) - -/* - * Per register state. - */ -struct er_account { - raw_spinlock_t lock; /* per-core: protect structure */ - u64 config; /* extra MSR config */ - u64 reg; /* extra MSR number */ - atomic_t ref; /* reference count */ -}; - -/* - * Extra registers for specific events. - * - * Some events need large masks and require external MSRs. - * Those extra MSRs end up being shared for all events on - * a PMU and sometimes between PMU of sibling HT threads. - * In either case, the kernel needs to handle conflicting - * accesses to those extra, shared, regs. The data structure - * to manage those registers is stored in cpu_hw_event. - */ -struct extra_reg { - unsigned int event; - unsigned int msr; - u64 config_mask; - u64 valid_mask; - int idx; /* per_xxx->regs[] reg index */ -}; +struct x86_pmu x86_pmu __read_mostly; -#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ - .event = (e), \ - .msr = (ms), \ - .config_mask = (m), \ - .valid_mask = (vm), \ - .idx = EXTRA_REG_##i \ - } - -#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) - -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) - -union perf_capabilities { - struct { - u64 lbr_format : 6; - u64 pebs_trap : 1; - u64 pebs_arch_reg : 1; - u64 pebs_format : 4; - u64 smm_freeze : 1; - }; - u64 capabilities; -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - /* - * Generic x86 PMC bits - */ - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(int added); - void (*enable)(struct perf_event *); - void (*disable)(struct perf_event *); - int (*hw_config)(struct perf_event *event); - int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); - unsigned eventsel; - unsigned perfctr; - u64 (*event_map)(int); - int max_events; - int num_counters; - int num_counters_fixed; - int cntval_bits; - u64 cntval_mask; - int apic; - u64 max_period; - struct event_constraint * - (*get_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - - void (*put_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - struct event_constraint *event_constraints; - void (*quirks)(void); - int perfctr_second_write; - - int (*cpu_prepare)(int cpu); - void (*cpu_starting)(int cpu); - void (*cpu_dying)(int cpu); - void (*cpu_dead)(int cpu); - - /* - * Intel Arch Perfmon v2+ - */ - u64 intel_ctrl; - union perf_capabilities intel_cap; - - /* - * Intel DebugStore bits - */ - int bts, pebs; - int bts_active, pebs_active; - int pebs_record_size; - void (*drain_pebs)(struct pt_regs *regs); - struct event_constraint *pebs_constraints; - - /* - * Intel LBR - */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ - - /* - * Extra registers for events - */ - struct extra_reg *extra_regs; - unsigned int er_flags; -}; - -#define ERF_NO_HT_SHARING 1 -#define ERF_HAS_RSP_1 2 - -static struct x86_pmu x86_pmu __read_mostly; - -static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static int x86_perf_event_set_period(struct perf_event *event); - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -static u64 __read_mostly hw_cache_event_ids +u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static u64 __read_mostly hw_cache_extra_regs +u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; @@ -329,8 +65,7 @@ static u64 __read_mostly hw_cache_extra_regs * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ -static u64 -x86_perf_event_update(struct perf_event *event) +u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; @@ -373,30 +108,6 @@ again: return new_raw_count; } -static inline int x86_pmu_addr_offset(int index) -{ - int offset; - - /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ - alternative_io(ASM_NOP2, - "shll $1, %%eax", - X86_FEATURE_PERFCTR_CORE, - "=a" (offset), - "a" (index)); - - return offset; -} - -static inline unsigned int x86_pmu_config_addr(int index) -{ - return x86_pmu.eventsel + x86_pmu_addr_offset(index); -} - -static inline unsigned int x86_pmu_event_addr(int index) -{ - return x86_pmu.perfctr + x86_pmu_addr_offset(index); -} - /* * Find and validate any extra registers to set up. */ @@ -532,9 +243,6 @@ msr_fail: return false; } -static void reserve_ds_buffers(void); -static void release_ds_buffers(void); - static void hw_perf_event_destroy(struct perf_event *event) { if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { @@ -583,7 +291,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return x86_pmu_extra_regs(val, event); } -static int x86_setup_perfctr(struct perf_event *event) +int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; @@ -604,12 +312,8 @@ static int x86_setup_perfctr(struct perf_event *event) return -EOPNOTSUPP; } - /* - * Do not allow config1 (extended registers) to propagate, - * there's no sane user-space generalization yet: - */ if (attr->type == PERF_TYPE_RAW) - return 0; + return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); @@ -647,7 +351,7 @@ static int x86_setup_perfctr(struct perf_event *event) return 0; } -static int x86_pmu_hw_config(struct perf_event *event) +int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = 0; @@ -723,7 +427,7 @@ static int __x86_pmu_event_init(struct perf_event *event) return x86_pmu.hw_config(event); } -static void x86_pmu_disable_all(void) +void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -758,15 +462,7 @@ static void x86_pmu_disable(struct pmu *pmu) x86_pmu.disable_all(); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - if (hwc->extra_reg.reg) - wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); - wrmsrl(hwc->config_base, hwc->config | enable_mask); -} - -static void x86_pmu_enable_all(int added) +void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -788,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } -static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events to be assigned left */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define SCHED_STATES_MAX 2 + +struct perf_sched { + int max_weight; + int max_events; + struct event_constraint **constraints; + struct sched_state state; + int saved_states; + struct sched_state saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize interator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, + int num, int wmin, int wmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->constraints = c; + + for (idx = 0; idx < num; idx++) { + if (c[idx]->weight == wmin) + break; + } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) + return; + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ + if (!sched->saved_states) + return false; + + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; + + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool __perf_sched_find_counter(struct perf_sched *sched) +{ + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->constraints[sched->state.event]; + + /* Prefer fixed purpose counters */ + if (x86_pmu.num_counters_fixed) { + idx = X86_PMC_IDX_FIXED; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + + return false; + +done: + sched->state.counter = idx; + + if (c->overlap) + perf_sched_save_state(sched); + + return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + + return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->constraints[sched->state.event]; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; +} + +/* + * Assign a counter for each event. + */ +static int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) +{ + struct perf_sched sched; + + perf_sched_init(&sched, constraints, n, wmin, wmax); + + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; +} + +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int i, j, w, wmax, num = 0; + int i, wmin, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); - for (i = 0; i < n; i++) { + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); } /* @@ -825,60 +698,12 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (assign) assign[i] = hwc->idx; } - if (i == n) - goto done; - - /* - * begin slow path - */ - bitmap_zero(used_mask, X86_PMC_IDX_MAX); + /* slow path */ + if (i != n) + num = perf_assign_events(constraints, n, wmin, wmax, assign); /* - * weight = number of possible counters - * - * 1 = most constrained, only works on one counter - * wmax = least constrained, works on any counter - * - * assign events to counters starting with most - * constrained events. - */ - wmax = x86_pmu.num_counters; - - /* - * when fixed event counters are present, - * wmax is incremented by 1 to account - * for one more choice - */ - if (x86_pmu.num_counters_fixed) - wmax++; - - for (w = 1, num = n; num && w <= wmax; w++) { - /* for each event */ - for (i = 0; num && i < n; i++) { - c = constraints[i]; - hwc = &cpuc->event_list[i]->hw; - - if (c->weight != w) - continue; - - for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { - if (!test_bit(j, used_mask)) - break; - } - - if (j == X86_PMC_IDX_MAX) - break; - - __set_bit(j, used_mask); - - if (assign) - assign[i] = j; - num--; - } - } -done: - /* * scheduling failed or is just a simulation, * free resources if necessary */ @@ -888,7 +713,7 @@ done: x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); } } - return num ? -ENOSPC : 0; + return num ? -EINVAL : 0; } /* @@ -907,7 +732,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, if (is_x86_event(leader)) { if (n >= max_count) - return -ENOSPC; + return -EINVAL; cpuc->event_list[n] = leader; n++; } @@ -920,7 +745,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, continue; if (n >= max_count) - return -ENOSPC; + return -EINVAL; cpuc->event_list[n] = event; n++; @@ -959,7 +784,6 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, } static void x86_pmu_start(struct perf_event *event, int flags); -static void x86_pmu_stop(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { @@ -1031,21 +855,13 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu.enable_all(added); } -static inline void x86_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. * To be called with the event disabled in hw: */ -static int -x86_perf_event_set_period(struct perf_event *event) +int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); @@ -1105,7 +921,7 @@ x86_perf_event_set_period(struct perf_event *event) return ret; } -static void x86_pmu_enable_event(struct perf_event *event) +void x86_pmu_enable_event(struct perf_event *event) { if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, @@ -1244,7 +1060,7 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void x86_pmu_stop(struct perf_event *event, int flags) +void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -1297,7 +1113,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) perf_event_update_userpage(event); } -static int x86_pmu_handle_irq(struct pt_regs *regs) +int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; @@ -1367,109 +1183,28 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } -struct pmu_nmi_state { - unsigned int marked; - int handled; -}; - -static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); - static int __kprobes -perf_event_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) +perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = __args; - unsigned int this_nmi; - int handled; - if (!atomic_read(&active_events)) - return NOTIFY_DONE; + return NMI_DONE; - switch (cmd) { - case DIE_NMI: - break; - case DIE_NMIUNKNOWN: - this_nmi = percpu_read(irq_stat.__nmi_count); - if (this_nmi != __this_cpu_read(pmu_nmi.marked)) - /* let the kernel handle the unknown nmi */ - return NOTIFY_DONE; - /* - * This one is a PMU back-to-back nmi. Two events - * trigger 'simultaneously' raising two back-to-back - * NMIs. If the first NMI handles both, the latter - * will be empty and daze the CPU. So, we drop it to - * avoid false-positive 'unknown nmi' messages. - */ - return NOTIFY_STOP; - default: - return NOTIFY_DONE; - } - - handled = x86_pmu.handle_irq(args->regs); - if (!handled) - return NOTIFY_DONE; - - this_nmi = percpu_read(irq_stat.__nmi_count); - if ((handled > 1) || - /* the next nmi could be a back-to-back nmi */ - ((__this_cpu_read(pmu_nmi.marked) == this_nmi) && - (__this_cpu_read(pmu_nmi.handled) > 1))) { - /* - * We could have two subsequent back-to-back nmis: The - * first handles more than one counter, the 2nd - * handles only one counter and the 3rd handles no - * counter. - * - * This is the 2nd nmi because the previous was - * handling more than one counter. We will mark the - * next (3rd) and then drop it if unhandled. - */ - __this_cpu_write(pmu_nmi.marked, this_nmi + 1); - __this_cpu_write(pmu_nmi.handled, handled); - } - - return NOTIFY_STOP; + return x86_pmu.handle_irq(regs); } -static __read_mostly struct notifier_block perf_event_nmi_notifier = { - .notifier_call = perf_event_nmi_handler, - .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, -}; - -static struct event_constraint unconstrained; -static struct event_constraint emptyconstraint; - -static struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct event_constraint *c; - - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) - return c; - } - } - - return &unconstrained; -} - -#include "perf_event_amd.c" -#include "perf_event_p6.c" -#include "perf_event_p4.c" -#include "perf_event_intel_lbr.c" -#include "perf_event_intel_ds.c" -#include "perf_event_intel.c" +struct event_constraint emptyconstraint; +struct event_constraint unconstrained; static int __cpuinit x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + cpuc->kfree_on_online = NULL; if (x86_pmu.cpu_prepare) ret = x86_pmu.cpu_prepare(cpu); break; @@ -1479,6 +1214,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) x86_pmu.cpu_starting(cpu); break; + case CPU_ONLINE: + kfree(cpuc->kfree_on_online); + break; + case CPU_DYING: if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); @@ -1509,6 +1248,7 @@ static void __init pmu_check_apic(void) static int __init init_hw_perf_events(void) { + struct x86_pmu_quirk *quirk; struct event_constraint *c; int err; @@ -1537,8 +1277,8 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.quirks) - x86_pmu.quirks(); + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", @@ -1557,16 +1297,22 @@ static int __init init_hw_perf_events(void) ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; perf_events_lapic_init(); - register_die_notifier(&perf_event_nmi_notifier); + register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters); + 0, x86_pmu.num_counters, 0); if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK) + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { continue; + } c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; c->weight += x86_pmu.num_counters; @@ -1702,7 +1448,7 @@ static int validate_event(struct perf_event *event) c = x86_pmu.get_event_constraints(fake_cpuc, event); if (!c || !c->weight) - ret = -ENOSPC; + ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); @@ -1727,7 +1473,7 @@ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; - int ret = -ENOSPC, n; + int ret = -EINVAL, n; fake_cpuc = allocate_fake_cpuc(); if (IS_ERR(fake_cpuc)) @@ -1900,6 +1646,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); + if (!current->mm) + return; + if (perf_callchain_user32(regs, entry)) return; @@ -1953,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs) return misc; } + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h new file mode 100644 index 00000000000..8944062f46e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event.h @@ -0,0 +1,550 @@ +/* + * Performance events x86 architecture header + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> + +/* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; + int overlap; +}; + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +/* + * Per register state. + */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + +#define MAX_LBR_ENTRIES 16 + +struct cpu_hw_events { + /* + * Generic x86 PMC bits + */ + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int enabled; + + int n_events; + int n_added; + int n_txn; + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + + unsigned int group_flag; + + /* + * Intel DebugStore bits + */ + struct debug_store *ds; + u64 pebs_enabled; + + /* + * Intel LBR bits + */ + int lbr_users; + void *lbr_context; + struct perf_branch_stack lbr_stack; + struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + + /* + * Intel host/guest exclude bits + */ + u64 intel_ctrl_guest_mask; + u64 intel_ctrl_host_mask; + struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; + + /* + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB + */ + struct intel_shared_regs *shared_regs; + + /* + * AMD specific bits + */ + struct amd_nb *amd_nb; + + void *kfree_on_online; +}; + +#define __EVENT_CONSTRAINT(c, n, m, w, o) {\ + { .idxmsk64 = (n) }, \ + .code = (c), \ + .cmask = (m), \ + .weight = (w), \ + .overlap = (o), \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1) + +/* + * Constraint on the Event code. + */ +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) + +/* + * Constraint on the Event code + UMask + fixed-mask + * + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK) + +/* + * Constraint on the Event code + UMask + */ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +#define EVENT_CONSTRAINT_END \ + EVENT_CONSTRAINT(0, 0, 0) + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->weight; (e)++) + +/* + * Extra registers for specific events. + * + * Some events need large masks and require external MSRs. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. + */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i \ + } + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) + +union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + }; + u64 capabilities; +}; + +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + /* + * Generic x86 PMC bits + */ + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(int added); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); + int (*hw_config)(struct perf_event *event); + int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + unsigned eventsel; + unsigned perfctr; + u64 (*event_map)(int); + int max_events; + int num_counters; + int num_counters_fixed; + int cntval_bits; + u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; + int apic; + u64 max_period; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + struct event_constraint *event_constraints; + struct x86_pmu_quirk *quirks; + int perfctr_second_write; + + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); + + /* + * Intel Arch Perfmon v2+ + */ + u64 intel_ctrl; + union perf_capabilities intel_cap; + + /* + * Intel DebugStore bits + */ + int bts, pebs; + int bts_active, pebs_active; + int pebs_record_size; + void (*drain_pebs)(struct pt_regs *regs); + struct event_constraint *pebs_constraints; + + /* + * Intel LBR + */ + unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ + int lbr_nr; /* hardware stack size */ + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; + unsigned int er_flags; + + /* + * Intel host/guest support (KVM) + */ + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); +}; + +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + +#define ERF_NO_HT_SHARING 1 +#define ERF_HAS_RSP_1 2 + +extern struct x86_pmu x86_pmu __read_mostly; + +DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +int x86_perf_event_set_period(struct perf_event *event); + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +extern u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +extern u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +u64 x86_perf_event_update(struct perf_event *event); + +static inline int x86_pmu_addr_offset(int index) +{ + int offset; + + /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ + alternative_io(ASM_NOP2, + "shll $1, %%eax", + X86_FEATURE_PERFCTR_CORE, + "=a" (offset), + "a" (index)); + + return offset; +} + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + x86_pmu_addr_offset(index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + x86_pmu_addr_offset(index); +} + +int x86_setup_perfctr(struct perf_event *event); + +int x86_pmu_hw_config(struct perf_event *event); + +void x86_pmu_disable_all(void); + +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + wrmsrl(hwc->config_base, hwc->config | enable_mask); +} + +void x86_pmu_enable_all(int added); + +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); + +void x86_pmu_stop(struct perf_event *event, int flags); + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +void x86_pmu_enable_event(struct perf_event *event); + +int x86_pmu_handle_irq(struct pt_regs *regs); + +extern struct event_constraint emptyconstraint; + +extern struct event_constraint unconstrained; + +#ifdef CONFIG_CPU_SUP_AMD + +int amd_pmu_init(void); + +#else /* CONFIG_CPU_SUP_AMD */ + +static inline int amd_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL + +int intel_pmu_save_and_restart(struct perf_event *event); + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); + +struct intel_shared_regs *allocate_shared_regs(int cpu); + +int intel_pmu_init(void); + +void init_debug_store_on_cpu(int cpu); + +void fini_debug_store_on_cpu(int cpu); + +void release_ds_buffers(void); + +void reserve_ds_buffers(void); + +extern struct event_constraint bts_constraint; + +void intel_pmu_enable_bts(u64 config); + +void intel_pmu_disable_bts(void); + +int intel_pmu_drain_bts_buffer(void); + +extern struct event_constraint intel_core2_pebs_event_constraints[]; + +extern struct event_constraint intel_atom_pebs_event_constraints[]; + +extern struct event_constraint intel_nehalem_pebs_event_constraints[]; + +extern struct event_constraint intel_westmere_pebs_event_constraints[]; + +extern struct event_constraint intel_snb_pebs_event_constraints[]; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event); + +void intel_pmu_pebs_enable(struct perf_event *event); + +void intel_pmu_pebs_disable(struct perf_event *event); + +void intel_pmu_pebs_enable_all(void); + +void intel_pmu_pebs_disable_all(void); + +void intel_ds_init(void); + +void intel_pmu_lbr_reset(void); + +void intel_pmu_lbr_enable(struct perf_event *event); + +void intel_pmu_lbr_disable(struct perf_event *event); + +void intel_pmu_lbr_enable_all(void); + +void intel_pmu_lbr_disable_all(void); + +void intel_pmu_lbr_read(void); + +void intel_pmu_lbr_init_core(void); + +void intel_pmu_lbr_init_nhm(void); + +void intel_pmu_lbr_init_atom(void); + +int p4_pmu_init(void); + +int p6_pmu_init(void); + +#else /* CONFIG_CPU_SUP_INTEL */ + +static inline void reserve_ds_buffers(void) +{ +} + +static inline void release_ds_buffers(void) +{ +} + +static inline int intel_pmu_init(void) +{ + return 0; +} + +static inline struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 941caa2e449..0397b23be8e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,4 +1,10 @@ -#ifdef CONFIG_CPU_SUP_AMD +#include <linux/perf_event.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <asm/apicdef.h> + +#include "perf_event.h" static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -132,6 +138,19 @@ static int amd_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY; + if (event->attr.type != PERF_TYPE_RAW) return 0; @@ -350,7 +369,7 @@ static void amd_pmu_cpu_starting(int cpu) continue; if (nb->nb_id == nb_id) { - kfree(cpuc->amd_nb); + cpuc->kfree_on_online = cpuc->amd_nb; cpuc->amd_nb = nb; break; } @@ -392,7 +411,7 @@ static __initconst const struct x86_pmu amd_pmu = { .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -473,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = { static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); @@ -556,7 +575,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .perfctr = MSR_F15H_PERF_CTR, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 6, + .num_counters = AMD64_NUM_COUNTERS_F15H, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -573,7 +592,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { #endif }; -static __init int amd_pmu_init(void) +__init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) @@ -602,12 +621,3 @@ static __init int amd_pmu_init(void) return 0; } - -#else /* CONFIG_CPU_SUP_AMD */ - -static int amd_pmu_init(void) -{ - return 0; -} - -#endif diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c new file mode 100644 index 00000000000..3b8a2d30d14 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -0,0 +1,301 @@ +/* + * Performance events - AMD IBS + * + * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <asm/apic.h> + +static u32 ibs_caps; + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + +static struct pmu perf_ibs; + +static int perf_ibs_init(struct perf_event *event) +{ + if (perf_ibs.type != event->attr.type) + return -ENOENT; + return 0; +} + +static int perf_ibs_add(struct perf_event *event, int flags) +{ + return 0; +} + +static void perf_ibs_del(struct perf_event *event, int flags) +{ +} + +static struct pmu perf_ibs = { + .event_init= perf_ibs_init, + .add= perf_ibs_add, + .del= perf_ibs_del, +}; + +static __init int perf_event_ibs_init(void) +{ + if (!ibs_caps) + return -ENODEV; /* ibs not supported by the cpu */ + + perf_pmu_register(&perf_ibs, "ibs", -1); + printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); + + return 0; +} + +#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ + +static __init int perf_event_ibs_init(void) { return 0; } + +#endif + +/* IBS - apic initialization, for perf and oprofile */ + +static __init u32 __get_ibs_caps(void) +{ + u32 caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_DEFAULT; + + caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_DEFAULT; + + return caps; +} + +u32 get_ibs_caps(void) +{ + return ibs_caps; +} + +EXPORT_SYMBOL(get_ibs_caps); + +static inline int get_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int put_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, 0, 1); +} + +/* + * Check and reserve APIC extended interrupt LVT offset for IBS if available. + */ +static inline int ibs_eilvt_valid(void) +{ + int offset; + u64 val; + int valid = 0; + + preempt_disable(); + + rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + if (!get_eilvt(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + valid = 1; +out: + preempt_enable(); + + return valid; +} + +static int setup_ibs_ctl(int ibs_eilvt_off) +{ + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVT_OFFSET_VALID); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { + pci_dev_put(cpu_cfg); + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x\n", value); + return -EINVAL; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +/* + * This runs only on the current cpu. We try to find an LVT offset and + * setup the local APIC. For this we must disable preemption. On + * success we initialize all nodes with this offset. This updates then + * the offset in the IBS_CTL per-node msr. The per-core APIC setup of + * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that + * is using the new offset. + */ +static int force_ibs_eilvt_setup(void) +{ + int offset; + int ret; + + preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; + } + preempt_enable(); + + if (offset == APIC_EILVT_NR_MAX) { + printk(KERN_DEBUG "No EILVT entry available\n"); + return -EBUSY; + } + + ret = setup_ibs_ctl(offset); + if (ret) + goto out; + + if (!ibs_eilvt_valid()) { + ret = -EFAULT; + goto out; + } + + pr_info("IBS: LVT offset %d assigned\n", offset); + + return 0; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return ret; +} + +static inline int get_ibs_lvt_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("perf: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +static int __cpuinit +perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + setup_APIC_ibs(NULL); + break; + case CPU_DYING: + clear_APIC_ibs(NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static __init int amd_ibs_init(void) +{ + u32 caps; + int ret = -EINVAL; + + caps = __get_ibs_caps(); + if (!caps) + return -ENODEV; /* ibs not supported by the cpu */ + + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); + + if (!ibs_eilvt_valid()) + goto out; + + get_online_cpus(); + ibs_caps = caps; + /* make ibs_caps visible to other cpus: */ + smp_mb(); + perf_cpu_notifier(perf_ibs_cpu_notifier); + smp_call_function(setup_APIC_ibs, NULL, 1); + put_online_cpus(); + + ret = perf_event_ibs_init(); +out: + if (ret) + pr_err("Failed to setup IBS, %d\n", ret); + return ret; +} + +/* Since we need the pci subsystem to init ibs we can't do this earlier: */ +device_initcall(amd_ibs_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 45fbb8f7f54..3bd37bdf1b8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,16 +1,20 @@ -#ifdef CONFIG_CPU_SUP_INTEL - /* * Per core/cpu state * * Used to coordinate shared registers between HT threads or * among events on a single PMU. */ -struct intel_shared_regs { - struct er_account regs[EXTRA_REG_MAX]; - int refcnt; /* per-core: #HT threads */ - unsigned core_id; /* per-core: core id */ -}; + +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "perf_event.h" /* * Intel PerfMon, used on Core and later. @@ -24,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -41,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* - * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event - * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed - * ratio between these counters. - */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -64,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -86,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -98,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -121,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; @@ -746,7 +746,8 @@ static void intel_pmu_enable_all(int added) intel_pmu_pebs_enable_all(); intel_pmu_lbr_enable_all(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, + x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = @@ -869,6 +870,7 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); @@ -876,6 +878,9 @@ static void intel_pmu_disable_event(struct perf_event *event) return; } + cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); + cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; @@ -921,6 +926,7 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) static void intel_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) @@ -930,6 +936,11 @@ static void intel_pmu_enable_event(struct perf_event *event) return; } + if (event->attr.exclude_host) + cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); + if (event->attr.exclude_guest) + cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc); return; @@ -945,7 +956,7 @@ static void intel_pmu_enable_event(struct perf_event *event) * Save and restart an expired event. Called by NMI contexts, * so it has to be careful about preempting normal event ops: */ -static int intel_pmu_save_and_restart(struct perf_event *event) +int intel_pmu_save_and_restart(struct perf_event *event) { x86_perf_event_update(event); return x86_perf_event_set_period(event); @@ -1154,7 +1165,7 @@ again: */ c = &unconstrained; } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock(&era->lock); + raw_spin_unlock_irqrestore(&era->lock, flags); goto again; } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1197,6 +1208,21 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, return c; } +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} + static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -1284,12 +1310,84 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + if (x86_pmu.guest_get_msrs) + return x86_pmu.guest_get_msrs(nr); + *nr = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + + arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; + arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + + *nr = 1; + return arr; +} + +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + + arr[idx].msr = x86_pmu_config_addr(idx); + arr[idx].host = arr[idx].guest = 0; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + arr[idx].host = arr[idx].guest = + event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; + + if (event->attr.exclude_host) + arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + else if (event->attr.exclude_guest) + arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + } + + *nr = x86_pmu.num_counters; + return arr; +} + +static void core_pmu_enable_event(struct perf_event *event) +{ + if (!event->attr.exclude_host) + x86_pmu_enable_event(event); +} + +static void core_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask) || + cpuc->events[idx]->attr.exclude_host) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, .disable = x86_pmu_disable_event, .hw_config = x86_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -1307,9 +1405,10 @@ static __initconst const struct x86_pmu core_pmu = { .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, }; -static struct intel_shared_regs *allocate_shared_regs(int cpu) +struct intel_shared_regs *allocate_shared_regs(int cpu) { struct intel_shared_regs *regs; int i; @@ -1362,7 +1461,7 @@ static void intel_pmu_cpu_starting(int cpu) pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - kfree(cpuc->shared_regs); + cpuc->kfree_on_online = cpuc->shared_regs; cpuc->shared_regs = pc; break; } @@ -1413,9 +1512,10 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, + .guest_get_msrs = intel_guest_get_msrs, }; -static void intel_clovertown_quirks(void) +static __init void intel_clovertown_quirk(void) { /* * PEBS is unreliable due to: @@ -1441,12 +1541,60 @@ static void intel_clovertown_quirks(void) x86_pmu.pebs_constraints = NULL; } -static __init int intel_pmu_init(void) +static __init void intel_sandybridge_quirk(void) +{ + printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + x86_pmu.pebs = 0; + x86_pmu.pebs_constraints = NULL; +} + +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { + intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; + printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); + } +} + +static __init void intel_nehalem_quirk(void) +{ + union cpuid10_ebx ebx; + + ebx.full = x86_pmu.events_maskl; + if (ebx.split.no_branch_misses_retired) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; + x86_pmu.events_maskl = ebx.full; + printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + } +} + +__init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; + union cpuid10_ebx ebx; unsigned int unused; - unsigned int ebx; int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -1463,8 +1611,8 @@ static __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; version = eax.split.version_id; @@ -1478,6 +1626,9 @@ static __init int intel_pmu_init(void) x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -1497,6 +1648,8 @@ static __init int intel_pmu_init(void) intel_ds_init(); + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + /* * Install the hw-cache-events table: */ @@ -1506,7 +1659,7 @@ static __init int intel_pmu_init(void) break; case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - x86_pmu.quirks = intel_clovertown_quirks; + x86_add_quirk(intel_clovertown_quirk); case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ case 29: /* six-core 45 nm xeon "Dunnington" */ @@ -1540,17 +1693,8 @@ static __init int intel_pmu_init(void) /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; - if (ebx & 0x40) { - /* - * Erratum AAJ80 detected, we work it around by using - * the BR_MISP_EXEC.ANY event. This will over-count - * branch-misses, but it's still much better than the - * architectural event which is often completely bogus: - */ - intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + x86_add_quirk(intel_nehalem_quirk); - pr_cont("erratum AAJ80 worked around, "); - } pr_cont("Nehalem events, "); break; @@ -1590,13 +1734,15 @@ static __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ + x86_add_quirk(intel_sandybridge_quirk); + case 45: /* SandyBridge, "Romely-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_snb_event_constraints; - x86_pmu.pebs_constraints = intel_snb_pebs_events; + x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; @@ -1625,18 +1771,6 @@ static __init int intel_pmu_init(void) break; } } - return 0; -} - -#else /* CONFIG_CPU_SUP_INTEL */ -static int intel_pmu_init(void) -{ return 0; } - -static struct intel_shared_regs *allocate_shared_regs(int cpu) -{ - return NULL; -} -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 1b1ef3addcf..73da6b64f5b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1,7 +1,10 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/slab.h> -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +#include <asm/perf_event.h> + +#include "perf_event.h" /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -37,24 +40,7 @@ struct pebs_record_nhm { u64 status, dla, dse, lat; }; -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - -static void init_debug_store_on_cpu(int cpu) +void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -66,7 +52,7 @@ static void init_debug_store_on_cpu(int cpu) (u32)((u64)(unsigned long)ds >> 32)); } -static void fini_debug_store_on_cpu(int cpu) +void fini_debug_store_on_cpu(int cpu) { if (!per_cpu(cpu_hw_events, cpu).ds) return; @@ -175,7 +161,7 @@ static void release_ds_buffer(int cpu) kfree(ds); } -static void release_ds_buffers(void) +void release_ds_buffers(void) { int cpu; @@ -194,7 +180,7 @@ static void release_ds_buffers(void) put_online_cpus(); } -static void reserve_ds_buffers(void) +void reserve_ds_buffers(void) { int bts_err = 0, pebs_err = 0; int cpu; @@ -260,10 +246,10 @@ static void reserve_ds_buffers(void) * BTS */ -static struct event_constraint bts_constraint = +struct event_constraint bts_constraint = EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); -static void intel_pmu_enable_bts(u64 config) +void intel_pmu_enable_bts(u64 config) { unsigned long debugctlmsr; @@ -282,7 +268,7 @@ static void intel_pmu_enable_bts(u64 config) update_debugctlmsr(debugctlmsr); } -static void intel_pmu_disable_bts(void) +void intel_pmu_disable_bts(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long debugctlmsr; @@ -299,7 +285,7 @@ static void intel_pmu_disable_bts(void) update_debugctlmsr(debugctlmsr); } -static int intel_pmu_drain_bts_buffer(void) +int intel_pmu_drain_bts_buffer(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -361,7 +347,7 @@ static int intel_pmu_drain_bts_buffer(void) /* * PEBS */ -static struct event_constraint intel_core2_pebs_event_constraints[] = { +struct event_constraint intel_core2_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ @@ -370,14 +356,14 @@ static struct event_constraint intel_core2_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint intel_atom_pebs_event_constraints[] = { +struct event_constraint intel_atom_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_pebs_event_constraints[] = { +struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ @@ -392,7 +378,7 @@ static struct event_constraint intel_nehalem_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_pebs_event_constraints[] = { +struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ @@ -407,7 +393,7 @@ static struct event_constraint intel_westmere_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint intel_snb_pebs_events[] = { +struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ @@ -428,8 +414,7 @@ static struct event_constraint intel_snb_pebs_events[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint * -intel_pebs_constraints(struct perf_event *event) +struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -446,7 +431,7 @@ intel_pebs_constraints(struct perf_event *event) return &emptyconstraint; } -static void intel_pmu_pebs_enable(struct perf_event *event) +void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -460,7 +445,7 @@ static void intel_pmu_pebs_enable(struct perf_event *event) intel_pmu_lbr_enable(event); } -static void intel_pmu_pebs_disable(struct perf_event *event) +void intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -475,7 +460,7 @@ static void intel_pmu_pebs_disable(struct perf_event *event) intel_pmu_lbr_disable(event); } -static void intel_pmu_pebs_enable_all(void) +void intel_pmu_pebs_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -483,7 +468,7 @@ static void intel_pmu_pebs_enable_all(void) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); } -static void intel_pmu_pebs_disable_all(void) +void intel_pmu_pebs_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -508,6 +493,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) unsigned long from = cpuc->lbr_entries[0].from; unsigned long old_to, to = cpuc->lbr_entries[0].to; unsigned long ip = regs->ip; + int is_64bit = 0; /* * We don't need to fixup if the PEBS assist is fault like @@ -559,7 +545,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) } else kaddr = (void *)to; - kernel_insn_init(&insn, kaddr); +#ifdef CONFIG_X86_64 + is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, kaddr, is_64bit); insn_get_length(&insn); to += insn.length; } while (to < ip); @@ -576,8 +565,6 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static int intel_pmu_save_and_restart(struct perf_event *event); - static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *__pebs) { @@ -716,7 +703,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * BTS, PEBS probe and setup */ -static void intel_ds_init(void) +void intel_ds_init(void) { /* * No support for 32bit formats @@ -749,15 +736,3 @@ static void intel_ds_init(void) } } } - -#else /* CONFIG_CPU_SUP_INTEL */ - -static void reserve_ds_buffers(void) -{ -} - -static void release_ds_buffers(void) -{ -} - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d202c1bece1..3fab3de3ce9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -1,4 +1,10 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include <linux/perf_event.h> +#include <linux/types.h> + +#include <asm/perf_event.h> +#include <asm/msr.h> + +#include "perf_event.h" enum { LBR_FORMAT_32 = 0x00, @@ -48,7 +54,7 @@ static void intel_pmu_lbr_reset_64(void) } } -static void intel_pmu_lbr_reset(void) +void intel_pmu_lbr_reset(void) { if (!x86_pmu.lbr_nr) return; @@ -59,7 +65,7 @@ static void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } -static void intel_pmu_lbr_enable(struct perf_event *event) +void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -81,7 +87,7 @@ static void intel_pmu_lbr_enable(struct perf_event *event) cpuc->lbr_users++; } -static void intel_pmu_lbr_disable(struct perf_event *event) +void intel_pmu_lbr_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -95,7 +101,7 @@ static void intel_pmu_lbr_disable(struct perf_event *event) __intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_enable_all(void) +void intel_pmu_lbr_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -103,7 +109,7 @@ static void intel_pmu_lbr_enable_all(void) __intel_pmu_lbr_enable(); } -static void intel_pmu_lbr_disable_all(void) +void intel_pmu_lbr_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -178,7 +184,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.nr = i; } -static void intel_pmu_lbr_read(void) +void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -191,7 +197,7 @@ static void intel_pmu_lbr_read(void) intel_pmu_lbr_read_64(cpuc); } -static void intel_pmu_lbr_init_core(void) +void intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; x86_pmu.lbr_tos = 0x01c9; @@ -199,7 +205,7 @@ static void intel_pmu_lbr_init_core(void) x86_pmu.lbr_to = 0x60; } -static void intel_pmu_lbr_init_nhm(void) +void intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = 0x01c9; @@ -207,12 +213,10 @@ static void intel_pmu_lbr_init_nhm(void) x86_pmu.lbr_to = 0x6c0; } -static void intel_pmu_lbr_init_atom(void) +void intel_pmu_lbr_init_atom(void) { x86_pmu.lbr_nr = 8; x86_pmu.lbr_tos = 0x01c9; x86_pmu.lbr_from = 0x40; x86_pmu.lbr_to = 0x60; } - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 7809d2bcb20..ef484d9d0a2 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -7,9 +7,13 @@ * For licencing details see kernel-base/COPYING */ -#ifdef CONFIG_CPU_SUP_INTEL +#include <linux/perf_event.h> #include <asm/perf_event_p4.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "perf_event.h" #define P4_CNTR_LIMIT 3 /* @@ -1264,7 +1268,7 @@ reserve: } done: - return num ? -ENOSPC : 0; + return num ? -EINVAL : 0; } static __initconst const struct x86_pmu p4_pmu = { @@ -1303,7 +1307,7 @@ static __initconst const struct x86_pmu p4_pmu = { .perfctr_second_write = 1, }; -static __init int p4_pmu_init(void) +__init int p4_pmu_init(void) { unsigned int low, high; @@ -1326,5 +1330,3 @@ static __init int p4_pmu_init(void) return 0; } - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 20c097e3386..c7181befecd 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -1,4 +1,7 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include <linux/perf_event.h> +#include <linux/types.h> + +#include "perf_event.h" /* * Not sure about some of these @@ -114,7 +117,7 @@ static __initconst const struct x86_pmu p6_pmu = { .event_constraints = p6_event_constraints, }; -static __init int p6_pmu_init(void) +__init int p6_pmu_init(void) { switch (boot_cpu_data.x86_model) { case 1: @@ -138,5 +141,3 @@ static __init int p6_pmu_init(void) return 0; } - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 5abbea297e0..7b3fe56b1c2 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = { "100mhzsteps", "hwpstate", "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 62ac8cb6ba2..8022c668148 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - unsigned int cpu = 0; + unsigned int cpu; int i; -#ifdef CONFIG_SMP cpu = c->cpu_index; -#endif seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" @@ -85,6 +83,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); + if (c->microcode) + seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get(cpu); diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c new file mode 100644 index 00000000000..feca286c2bb --- /dev/null +++ b/arch/x86/kernel/cpu/rdrand.c @@ -0,0 +1,73 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu <fenghua.yu@intel.com>, + * H. Peter Anvin <hpa@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <asm/processor.h> +#include <asm/archrandom.h> +#include <asm/sections.h> + +static int __init x86_rdrand_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_RDRAND); + return 1; +} +__setup("nordrand", x86_rdrand_setup); + +/* We can't use arch_get_random_long() here since alternatives haven't run */ +static inline int rdrand_long(unsigned long *v) +{ + int ok; + asm volatile("1: " RDRAND_LONG "\n\t" + "jc 2f\n\t" + "decl %0\n\t" + "jnz 1b\n\t" + "2:" + : "=r" (ok), "=a" (*v) + : "0" (RDRAND_RETRY_LOOPS)); + return ok; +} + +/* + * Force a reseed cycle; we are architecturally guaranteed a reseed + * after no more than 512 128-bit chunks of random data. This also + * acts as a test of the CPU capability. + */ +#define RESEED_LOOP ((512*128)/sizeof(unsigned long)) + +void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_RANDOM + unsigned long tmp; + int i, count, ok; + + if (!cpu_has(c, X86_FEATURE_RDRAND)) + return; /* Nothing to do */ + + for (count = i = 0; i < RESEED_LOOP; i++) { + ok = rdrand_long(&tmp); + if (ok) + count++; + } + + if (count != RESEED_LOOP) + clear_cpu_cap(c, X86_FEATURE_RDRAND); +#endif +} |