diff options
29 files changed, 1056 insertions, 683 deletions
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 4886a68f267..fd3f9f18cf3 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -22,27 +22,26 @@ void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif -/* - * Define some priorities for the nmi notifier call chain. - * - * Create a local nmi bit that has a higher priority than - * external nmis, because the local ones are more frequent. - * - * Also setup some default high/normal/low settings for - * subsystems to registers with. Using 4 bits to separate - * the priorities. This can go a lot higher if needed be. - */ - -#define NMI_LOCAL_SHIFT 16 /* randomly picked */ -#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT) -#define NMI_HIGH_PRIOR (1ULL << 8) -#define NMI_NORMAL_PRIOR (1ULL << 4) -#define NMI_LOW_PRIOR (1ULL << 0) -#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR) -#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR) -#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR) +#define NMI_FLAG_FIRST 1 + +enum { + NMI_LOCAL=0, + NMI_UNKNOWN, + NMI_MAX +}; + +#define NMI_DONE 0 +#define NMI_HANDLED 1 + +typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *); + +int register_nmi_handler(unsigned int, nmi_handler_t, unsigned long, + const char *); + +void unregister_nmi_handler(unsigned int, const char *); void stop_nmi(void); void restart_nmi(void); +void local_touch_nmi(void); #endif /* _ASM_X86_NMI_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 094fb30817a..f61c62f7d5d 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -29,6 +29,9 @@ #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL +#define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40) +#define AMD_PERFMON_EVENTSEL_HOSTONLY (1ULL << 41) + #define AMD64_EVENTSEL_EVENT \ (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32)) #define INTEL_ARCH_EVENT_MASK \ @@ -43,14 +46,17 @@ #define AMD64_RAW_EVENT_MASK \ (X86_RAW_EVENT_MASK | \ AMD64_EVENTSEL_EVENT) +#define AMD64_NUM_COUNTERS 4 +#define AMD64_NUM_COUNTERS_F15H 6 +#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) -#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 /* * Intel "Architectural Performance Monitoring" CPUID @@ -110,6 +116,35 @@ union cpuid10_edx { */ #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +/* + * IBS cpuid feature detection + */ + +#define IBS_CPUID_FEATURES 0x8000001b + +/* + * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but + * bit 0 is used to indicate the existence of IBS. + */ +#define IBS_CAPS_AVAIL (1U<<0) +#define IBS_CAPS_FETCHSAM (1U<<1) +#define IBS_CAPS_OPSAM (1U<<2) +#define IBS_CAPS_RDWROPCNT (1U<<3) +#define IBS_CAPS_OPCNT (1U<<4) +#define IBS_CAPS_BRNTRGT (1U<<5) +#define IBS_CAPS_OPCNTEXT (1U<<6) + +#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ + | IBS_CAPS_FETCHSAM \ + | IBS_CAPS_OPSAM) + +/* + * IBS APIC setup + */ +#define IBSCTL 0x1cc +#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) +#define IBSCTL_LVT_OFFSET_MASK 0x0F + /* IbsFetchCtl bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) @@ -124,6 +159,8 @@ union cpuid10_edx { #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +extern u32 get_ibs_caps(void); + #ifdef CONFIG_PERF_EVENTS extern void perf_events_lapic_init(void); @@ -159,7 +196,19 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); ); \ } +struct perf_guest_switch_msr { + unsigned msr; + u64 host, guest; +}; + +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); #else +static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + *nr = 0; + return NULL; +} + static inline void perf_events_lapic_init(void) { } #endif diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 3250e3d605d..92f297069e8 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -23,7 +23,7 @@ void machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 -typedef void (*nmi_shootdown_cb)(int, struct die_args*); +typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 82f2912155a..8baca3c4871 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -19,7 +19,7 @@ endif obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time.o ioport.o ldt.o dumpstack.o +obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index d5e57db0f7b..31cb9ae992b 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -60,22 +60,10 @@ void arch_trigger_all_cpu_backtrace(void) } static int __kprobes -arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, - unsigned long cmd, void *__args) +arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = __args; - struct pt_regs *regs; int cpu; - switch (cmd) { - case DIE_NMI: - break; - - default: - return NOTIFY_DONE; - } - - regs = args->regs; cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { @@ -86,21 +74,16 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, show_regs(regs); arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - return NOTIFY_STOP; + return NMI_HANDLED; } - return NOTIFY_DONE; + return NMI_DONE; } -static __read_mostly struct notifier_block backtrace_notifier = { - .notifier_call = arch_trigger_all_cpu_backtrace_handler, - .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, -}; - static int __init register_trigger_all_cpu_backtrace(void) { - register_die_notifier(&backtrace_notifier); + register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler, + 0, "arch_bt"); return 0; } early_initcall(register_trigger_all_cpu_backtrace); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 34b18594e72..75be00ecfff 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -672,18 +672,11 @@ void __cpuinit uv_cpu_init(void) /* * When NMI is received, print a stack trace. */ -int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) +int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) { unsigned long real_uv_nmi; int bid; - if (reason != DIE_NMIUNKNOWN) - return NOTIFY_OK; - - if (in_crash_kexec) - /* do nothing if entering the crash kernel */ - return NOTIFY_OK; - /* * Each blade has an MMR that indicates when an NMI has been sent * to cpus on the blade. If an NMI is detected, atomically @@ -704,7 +697,7 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) } if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) - return NOTIFY_DONE; + return NMI_DONE; __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; @@ -717,17 +710,12 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) dump_stack(); spin_unlock(&uv_nmi_lock); - return NOTIFY_STOP; + return NMI_HANDLED; } -static struct notifier_block uv_dump_stack_nmi_nb = { - .notifier_call = uv_handle_nmi, - .priority = NMI_LOCAL_LOW_PRIOR - 1, -}; - void uv_register_nmi_notifier(void) { - if (register_die_notifier(&uv_dump_stack_nmi_nb)) + if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv")) printk(KERN_WARNING "UV NMI handler failed to register\n"); } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1044fd787db..fe6eb197f84 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -36,7 +36,7 @@ endif obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 0ed633c5048..6199232161c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -78,27 +78,20 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) static cpumask_var_t mce_inject_cpumask; -static int mce_raise_notify(struct notifier_block *self, - unsigned long val, void *data) +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) - return NOTIFY_DONE; + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) - raise_exception(m, args->regs); + raise_exception(m, regs); else if (m->status) raise_poll(m); - return NOTIFY_STOP; + return NMI_HANDLED; } -static struct notifier_block mce_raise_nb = { - .notifier_call = mce_raise_notify, - .priority = NMI_LOCAL_NORMAL_PRIOR, -}; - /* Inject mce on current CPU */ static int raise_local(void) { @@ -216,7 +209,8 @@ static int inject_init(void) return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; - register_die_notifier(&mce_raise_nb); + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, + "mce_notify"); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 08363b04212..fce51ad1f36 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -908,9 +908,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) percpu_inc(mce_exception_count); - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out; if (!banks) goto out; @@ -1140,6 +1137,15 @@ static void mce_start_timer(unsigned long data) add_timer_on(t, smp_processor_id()); } +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); +} + static void mce_do_trigger(struct work_struct *work) { call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); @@ -1750,7 +1756,6 @@ static struct syscore_ops mce_syscore_ops = { static void mce_cpu_restart(void *data) { - del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(__this_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); @@ -1760,16 +1765,15 @@ static void mce_cpu_restart(void *data) /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { + mce_timer_delete_all(); on_each_cpu(mce_cpu_restart, NULL, 1); } /* Toggle features for corrected errors */ -static void mce_disable_ce(void *all) +static void mce_disable_cmci(void *data) { if (!mce_available(__this_cpu_ptr(&cpu_info))) return; - if (all) - del_timer_sync(&__get_cpu_var(mce_timer)); cmci_clear(); } @@ -1852,7 +1856,8 @@ static ssize_t set_ignore_ce(struct sys_device *s, if (mce_ignore_ce ^ !!new) { if (new) { /* disable ce features */ - on_each_cpu(mce_disable_ce, (void *)1, 1); + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); mce_ignore_ce = 1; } else { /* enable ce features */ @@ -1875,7 +1880,7 @@ static ssize_t set_cmci_disabled(struct sys_device *s, if (mce_cmci_disabled ^ !!new) { if (new) { /* disable cmci */ - on_each_cpu(mce_disable_ce, NULL, 1); + on_each_cpu(mce_disable_cmci, NULL, 1); mce_cmci_disabled = 1; } else { /* enable cmci */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8ab89112f93..640891014b2 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1058,76 +1058,15 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } -struct pmu_nmi_state { - unsigned int marked; - int handled; -}; - -static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); - static int __kprobes -perf_event_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) +perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = __args; - unsigned int this_nmi; - int handled; - if (!atomic_read(&active_events)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - break; - case DIE_NMIUNKNOWN: - this_nmi = percpu_read(irq_stat.__nmi_count); - if (this_nmi != __this_cpu_read(pmu_nmi.marked)) - /* let the kernel handle the unknown nmi */ - return NOTIFY_DONE; - /* - * This one is a PMU back-to-back nmi. Two events - * trigger 'simultaneously' raising two back-to-back - * NMIs. If the first NMI handles both, the latter - * will be empty and daze the CPU. So, we drop it to - * avoid false-positive 'unknown nmi' messages. - */ - return NOTIFY_STOP; - default: - return NOTIFY_DONE; - } - - handled = x86_pmu.handle_irq(args->regs); - if (!handled) - return NOTIFY_DONE; + return NMI_DONE; - this_nmi = percpu_read(irq_stat.__nmi_count); - if ((handled > 1) || - /* the next nmi could be a back-to-back nmi */ - ((__this_cpu_read(pmu_nmi.marked) == this_nmi) && - (__this_cpu_read(pmu_nmi.handled) > 1))) { - /* - * We could have two subsequent back-to-back nmis: The - * first handles more than one counter, the 2nd - * handles only one counter and the 3rd handles no - * counter. - * - * This is the 2nd nmi because the previous was - * handling more than one counter. We will mark the - * next (3rd) and then drop it if unhandled. - */ - __this_cpu_write(pmu_nmi.marked, this_nmi + 1); - __this_cpu_write(pmu_nmi.handled, handled); - } - - return NOTIFY_STOP; + return x86_pmu.handle_irq(regs); } -static __read_mostly struct notifier_block perf_event_nmi_notifier = { - .notifier_call = perf_event_nmi_handler, - .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, -}; - struct event_constraint emptyconstraint; struct event_constraint unconstrained; @@ -1232,7 +1171,7 @@ static int __init init_hw_perf_events(void) ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; perf_events_lapic_init(); - register_die_notifier(&perf_event_nmi_notifier); + register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fb330b0a816..b9698d40ac4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -131,6 +131,13 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; /* + * Intel host/guest exclude bits + */ + u64 intel_ctrl_guest_mask; + u64 intel_ctrl_host_mask; + struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; + + /* * manage shared (per-core, per-cpu) registers * used on Intel NHM/WSM/SNB */ @@ -295,6 +302,11 @@ struct x86_pmu { */ struct extra_reg *extra_regs; unsigned int er_flags; + + /* + * Intel host/guest support (KVM) + */ + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; #define ERF_NO_HT_SHARING 1 diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 384450d6712..aeefd45697a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -138,6 +138,19 @@ static int amd_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY; + if (event->attr.type != PERF_TYPE_RAW) return 0; @@ -398,7 +411,7 @@ static __initconst const struct x86_pmu amd_pmu = { .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -562,7 +575,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .perfctr = MSR_F15H_PERF_CTR, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 6, + .num_counters = AMD64_NUM_COUNTERS_F15H, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c new file mode 100644 index 00000000000..ab6343d2182 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -0,0 +1,294 @@ +/* + * Performance events - AMD IBS + * + * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <asm/apic.h> + +static u32 ibs_caps; + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + +static struct pmu perf_ibs; + +static int perf_ibs_init(struct perf_event *event) +{ + if (perf_ibs.type != event->attr.type) + return -ENOENT; + return 0; +} + +static int perf_ibs_add(struct perf_event *event, int flags) +{ + return 0; +} + +static void perf_ibs_del(struct perf_event *event, int flags) +{ +} + +static struct pmu perf_ibs = { + .event_init= perf_ibs_init, + .add= perf_ibs_add, + .del= perf_ibs_del, +}; + +static __init int perf_event_ibs_init(void) +{ + if (!ibs_caps) + return -ENODEV; /* ibs not supported by the cpu */ + + perf_pmu_register(&perf_ibs, "ibs", -1); + printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); + + return 0; +} + +#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ + +static __init int perf_event_ibs_init(void) { return 0; } + +#endif + +/* IBS - apic initialization, for perf and oprofile */ + +static __init u32 __get_ibs_caps(void) +{ + u32 caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_DEFAULT; + + caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_DEFAULT; + + return caps; +} + +u32 get_ibs_caps(void) +{ + return ibs_caps; +} + +EXPORT_SYMBOL(get_ibs_caps); + +static inline int get_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int put_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, 0, 1); +} + +/* + * Check and reserve APIC extended interrupt LVT offset for IBS if available. + */ +static inline int ibs_eilvt_valid(void) +{ + int offset; + u64 val; + int valid = 0; + + preempt_disable(); + + rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + if (!get_eilvt(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + valid = 1; +out: + preempt_enable(); + + return valid; +} + +static int setup_ibs_ctl(int ibs_eilvt_off) +{ + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVT_OFFSET_VALID); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { + pci_dev_put(cpu_cfg); + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x\n", value); + return -EINVAL; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +/* + * This runs only on the current cpu. We try to find an LVT offset and + * setup the local APIC. For this we must disable preemption. On + * success we initialize all nodes with this offset. This updates then + * the offset in the IBS_CTL per-node msr. The per-core APIC setup of + * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that + * is using the new offset. + */ +static int force_ibs_eilvt_setup(void) +{ + int offset; + int ret; + + preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; + } + preempt_enable(); + + if (offset == APIC_EILVT_NR_MAX) { + printk(KERN_DEBUG "No EILVT entry available\n"); + return -EBUSY; + } + + ret = setup_ibs_ctl(offset); + if (ret) + goto out; + + if (!ibs_eilvt_valid()) { + ret = -EFAULT; + goto out; + } + + pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); + pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); + + return 0; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return ret; +} + +static inline int get_ibs_lvt_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("perf: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +static int __cpuinit +perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + setup_APIC_ibs(NULL); + break; + case CPU_DYING: + clear_APIC_ibs(NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static __init int amd_ibs_init(void) +{ + u32 caps; + int ret; + + caps = __get_ibs_caps(); + if (!caps) + return -ENODEV; /* ibs not supported by the cpu */ + + if (!ibs_eilvt_valid()) { + ret = force_ibs_eilvt_setup(); + if (ret) { + pr_err("Failed to setup IBS, %d\n", ret); + return ret; + } + } + + get_online_cpus(); + ibs_caps = caps; + /* make ibs_caps visible to other cpus: */ + smp_mb(); + perf_cpu_notifier(perf_ibs_cpu_notifier); + smp_call_function(setup_APIC_ibs, NULL, 1); + put_online_cpus(); + + return perf_event_ibs_init(); +} + +/* Since we need the pci subsystem to init ibs we can't do this earlier: */ +device_initcall(amd_ibs_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 61fa35750b9..e09ca20e86e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -749,7 +749,8 @@ static void intel_pmu_enable_all(int added) intel_pmu_pebs_enable_all(); intel_pmu_lbr_enable_all(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, + x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = @@ -872,6 +873,7 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); @@ -879,6 +881,9 @@ static void intel_pmu_disable_event(struct perf_event *event) return; } + cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); + cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; @@ -924,6 +929,7 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) static void intel_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) @@ -933,6 +939,11 @@ static void intel_pmu_enable_event(struct perf_event *event) return; } + if (event->attr.exclude_host) + cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); + if (event->attr.exclude_guest) + cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc); return; @@ -1302,12 +1313,84 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + if (x86_pmu.guest_get_msrs) + return x86_pmu.guest_get_msrs(nr); + *nr = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + + arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; + arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + + *nr = 1; + return arr; +} + +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + + arr[idx].msr = x86_pmu_config_addr(idx); + arr[idx].host = arr[idx].guest = 0; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + arr[idx].host = arr[idx].guest = + event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; + + if (event->attr.exclude_host) + arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + else if (event->attr.exclude_guest) + arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + } + + *nr = x86_pmu.num_counters; + return arr; +} + +static void core_pmu_enable_event(struct perf_event *event) +{ + if (!event->attr.exclude_host) + x86_pmu_enable_event(event); +} + +static void core_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask) || + cpuc->events[idx]->attr.exclude_host) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, .disable = x86_pmu_disable_event, .hw_config = x86_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -1325,6 +1408,7 @@ static __initconst const struct x86_pmu core_pmu = { .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1431,6 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, + .guest_get_msrs = intel_guest_get_msrs, }; static void intel_clovertown_quirks(void) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 764c7c2b181..13ad89971d4 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -32,15 +32,12 @@ int in_crash_kexec; #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) -static void kdump_nmi_callback(int cpu, struct die_args *args) +static void kdump_nmi_callback(int cpu, struct pt_regs *regs) { - struct pt_regs *regs; #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; #endif - regs = args->regs; - #ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { crash_fixup_ss_esp(&fixed_regs, regs); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 00354d4919a..faba5771aca 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -511,28 +511,37 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) static int was_in_debug_nmi[NR_CPUS]; -static int __kgdb_notify(struct die_args *args, unsigned long cmd) +static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) { - struct pt_regs *regs = args->regs; - switch (cmd) { - case DIE_NMI: + case NMI_LOCAL: if (atomic_read(&kgdb_active) != -1) { /* KGDB CPU roundup */ kgdb_nmicallback(raw_smp_processor_id(), regs); was_in_debug_nmi[raw_smp_processor_id()] = 1; touch_nmi_watchdog(); - return NOTIFY_STOP; + return NMI_HANDLED; } - return NOTIFY_DONE; + break; - case DIE_NMIUNKNOWN: + case NMI_UNKNOWN: if (was_in_debug_nmi[raw_smp_processor_id()]) { was_in_debug_nmi[raw_smp_processor_id()] = 0; - return NOTIFY_STOP; + return NMI_HANDLED; } - return NOTIFY_DONE; + break; + default: + /* do nothing */ + break; + } + return NMI_DONE; +} + +static int __kgdb_notify(struct die_args *args, unsigned long cmd) +{ + struct pt_regs *regs = args->regs; + switch (cmd) { case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) @@ -590,11 +599,6 @@ kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) static struct notifier_block kgdb_notifier = { .notifier_call = kgdb_notify, - - /* - * Lowest-prio notifier priority, we want to be notified last: - */ - .priority = NMI_LOCAL_LOW_PRIOR, }; /** @@ -605,7 +609,31 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - return register_die_notifier(&kgdb_notifier); + int retval; + + retval = register_die_notifier(&kgdb_notifier); + if (retval) + goto out; + + retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler, + 0, "kgdb"); + if (retval) + goto out1; + + retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler, + 0, "kgdb"); + + if (retval) + goto out2; + + return retval; + +out2: + unregister_nmi_handler(NMI_LOCAL, "kgdb"); +out1: + unregister_die_notifier(&kgdb_notifier); +out: + return retval; } static void kgdb_hw_overflow_handler(struct perf_event *event, @@ -673,6 +701,8 @@ void kgdb_arch_exit(void) breakinfo[i].pev = NULL; } } + unregister_nmi_handler(NMI_UNKNOWN, "kgdb"); + unregister_nmi_handler(NMI_LOCAL, "kgdb"); unregister_die_notifier(&kgdb_notifier); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c new file mode 100644 index 00000000000..7ec5bd140b8 --- /dev/null +++ b/arch/x86/kernel/nmi.c @@ -0,0 +1,433 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * Copyright (C) 2011 Don Zickus Red Hat, Inc. + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + * Handle hardware traps and faults. + */ +#include <linux/spinlock.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/nmi.h> +#include <linux/delay.h> +#include <linux/hardirq.h> +#include <linux/slab.h> + +#include <linux/mca.h> + +#if defined(CONFIG_EDAC) +#include <linux/edac.h> +#endif + +#include <linux/atomic.h> +#include <asm/traps.h> +#include <asm/mach_traps.h> +#include <asm/nmi.h> + +#define NMI_MAX_NAMELEN 16 +struct nmiaction { + struct list_head list; + nmi_handler_t handler; + unsigned int flags; + char *name; +}; + +struct nmi_desc { + spinlock_t lock; + struct list_head head; +}; + +static struct nmi_desc nmi_desc[NMI_MAX] = +{ + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .head = LIST_HEAD_INIT(nmi_desc[0].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .head = LIST_HEAD_INIT(nmi_desc[1].head), + }, + +}; + +struct nmi_stats { + unsigned int normal; + unsigned int unknown; + unsigned int external; + unsigned int swallow; +}; + +static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); + +static int ignore_nmis; + +int unknown_nmi_panic; +/* + * Prevent NMI reason port (0x61) being accessed simultaneously, can + * only be used in NMI handler. + */ +static DEFINE_RAW_SPINLOCK(nmi_reason_lock); + +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + +#define nmi_to_desc(type) (&nmi_desc[type]) + +static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *a; + int handled=0; + + rcu_read_lock(); + + /* + * NMIs are edge-triggered, which means if you have enough + * of them concurrently, you can lose some because only one + * can be latched at any given time. Walk the whole list + * to handle those situations. + */ + list_for_each_entry_rcu(a, &desc->head, list) + handled += a->handler(type, regs); + + rcu_read_unlock(); + + /* return total number of NMI events handled */ + return handled; +} + +static int __setup_nmi(unsigned int type, struct nmiaction *action) +{ + struct nmi_desc *desc = nmi_to_desc(type); + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + + /* + * most handlers of type NMI_UNKNOWN never return because + * they just assume the NMI is theirs. Just a sanity check + * to manage expectations + */ + WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); + + /* + * some handlers need to be executed first otherwise a fake + * event confuses some handlers (kdump uses this flag) + */ + if (action->flags & NMI_FLAG_FIRST) + list_add_rcu(&action->list, &desc->head); + else + list_add_tail_rcu(&action->list, &desc->head); + + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +static struct nmiaction *__free_nmi(unsigned int type, const char *name) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *n; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + + list_for_each_entry_rcu(n, &desc->head, list) { + /* + * the name passed in to describe the nmi handler + * is used as the lookup key + */ + if (!strcmp(n->name, name)) { + WARN(in_nmi(), + "Trying to free NMI (%s) from NMI context!\n", n->name); + list_del_rcu(&n->list); + break; + } + } + + spin_unlock_irqrestore(&desc->lock, flags); + synchronize_rcu(); + return (n); +} + +int register_nmi_handler(unsigned int type, nmi_handler_t handler, + unsigned long nmiflags, const char *devname) +{ + struct nmiaction *action; + int retval = -ENOMEM; + + if (!handler) + return -EINVAL; + + action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL); + if (!action) + goto fail_action; + + action->handler = handler; + action->flags = nmiflags; + action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL); + if (!action->name) + goto fail_action_name; + + retval = __setup_nmi(type, action); + + if (retval) + goto fail_setup_nmi; + + return retval; + +fail_setup_nmi: + kfree(action->name); +fail_action_name: + kfree(action); +fail_action: + + return retval; +} +EXPORT_SYMBOL_GPL(register_nmi_handler); + +void unregister_nmi_handler(unsigned int type, const char *name) +{ + struct nmiaction *a; + + a = __free_nmi(type, name); + if (a) { + kfree(a->name); + kfree(a); + } +} + +EXPORT_SYMBOL_GPL(unregister_nmi_handler); + +static notrace __kprobes void +pci_serr_error(unsigned char reason, struct pt_regs *regs) +{ + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + /* + * On some machines, PCI SERR line is used to report memory + * errors. EDAC makes use of it. + */ +#if defined(CONFIG_EDAC) + if (edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); + + /* Clear and disable the PCI SERR error line. */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; + outb(reason, NMI_REASON_PORT); +} + +static notrace __kprobes void +io_check_error(unsigned char reason, struct pt_regs *regs) +{ + unsigned long i; + + pr_emerg( + "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + show_registers(regs); + + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); + + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } + + reason &= ~NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); +} + +static notrace __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) +{ + int handled; + + /* + * Use 'false' as back-to-back NMIs are dealt with one level up. + * Of course this makes having multiple 'unknown' handlers useless + * as only the first one is ever run (unless it can actually determine + * if it caused the NMI) + */ + handled = nmi_handle(NMI_UNKNOWN, regs, false); + if (handled) { + __this_cpu_add(nmi_stats.unknown, handled); + return; + } + + __this_cpu_add(nmi_stats.unknown, 1); + +#ifdef CONFIG_MCA + /* + * Might actually be able to figure out what the guilty party + * is: + */ + if (MCA_bus) { + mca_handle_nmi(); + return; + } +#endif + pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + pr_emerg("Do you have a strange power saving mode enabled?\n"); + if (unknown_nmi_panic || panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); +} + +static DEFINE_PER_CPU(bool, swallow_nmi); +static DEFINE_PER_CPU(unsigned long, last_nmi_rip); + +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + int handled; + bool b2b = false; + + /* + * CPU-specific NMI must be processed before non-CPU-specific + * NMI, otherwise we may lose it, because the CPU-specific + * NMI can not be detected/processed on other CPUs. + */ + + /* + * Back-to-back NMIs are interesting because they can either + * be two NMI or more than two NMIs (any thing over two is dropped + * due to NMI being edge-triggered). If this is the second half + * of the back-to-back NMI, assume we dropped things and process + * more handlers. Otherwise reset the 'swallow' NMI behaviour + */ + if (regs->ip == __this_cpu_read(last_nmi_rip)) + b2b = true; + else + __this_cpu_write(swallow_nmi, false); + + __this_cpu_write(last_nmi_rip, regs->ip); + + handled = nmi_handle(NMI_LOCAL, regs, b2b); + __this_cpu_add(nmi_stats.normal, handled); + if (handled) { + /* + * There are cases when a NMI handler handles multiple + * events in the current NMI. One of these events may + * be queued for in the next NMI. Because the event is + * already handled, the next NMI will result in an unknown + * NMI. Instead lets flag this for a potential NMI to + * swallow. + */ + if (handled > 1) + __this_cpu_write(swallow_nmi, true); + return; + } + + /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ + raw_spin_lock(&nmi_reason_lock); + reason = get_nmi_reason(); + + if (reason & NMI_REASON_MASK) { + if (reason & NMI_REASON_SERR) + pci_serr_error(reason, regs); + else if (reason & NMI_REASON_IOCHK) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active + * meanwhile as it's edge-triggered: + */ + reassert_nmi(); +#endif + __this_cpu_add(nmi_stats.external, 1); + raw_spin_unlock(&nmi_reason_lock); + return; + } + raw_spin_unlock(&nmi_reason_lock); + + /* + * Only one NMI can be latched at a time. To handle + * this we may process multiple nmi handlers at once to + * cover the case where an NMI is dropped. The downside + * to this approach is we may process an NMI prematurely, + * while its real NMI is sitting latched. This will cause + * an unknown NMI on the next run of the NMI processing. + * + * We tried to flag that condition above, by setting the + * swallow_nmi flag when we process more than one event. + * This condition is also only present on the second half + * of a back-to-back NMI, so we flag that condition too. + * + * If both are true, we assume we already processed this + * NMI previously and we swallow it. Otherwise we reset + * the logic. + * + * There are scenarios where we may accidentally swallow + * a 'real' unknown NMI. For example, while processing + * a perf NMI another perf NMI comes in along with a + * 'real' unknown NMI. These two NMIs get combined into + * one (as descibed above). When the next NMI gets + * processed, it will be flagged by perf as handled, but + * noone will know that there was a 'real' unknown NMI sent + * also. As a result it gets swallowed. Or if the first + * perf NMI returns two events handled then the second + * NMI will get eaten by the logic below, again losing a + * 'real' unknown NMI. But this is the best we can do + * for now. + */ + if (b2b && __this_cpu_read(swallow_nmi)) + __this_cpu_add(nmi_stats.swallow, 1); + else + unknown_nmi_error(reason, regs); +} + +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_enter(); + + inc_irq_stat(__nmi_count); + + if (!ignore_nmis) + default_do_nmi(regs); + + nmi_exit(); +} + +void stop_nmi(void) +{ + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; +} + +/* reset the back-to-back NMI logic */ +void local_touch_nmi(void) +{ + __this_cpu_write(last_nmi_rip, 0); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7a3b65107a2..46ff054ebaa 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -57,6 +57,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> +#include <asm/nmi.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -107,6 +108,7 @@ void cpu_idle(void) if (cpu_is_offline(cpu)) play_dead(); + local_touch_nmi(); local_irq_disable(); /* Don't trace irqs off for idle */ stop_critical_timings(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f693e44e1bf..3bd7e6eebf3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,6 +51,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> +#include <asm/nmi.h> asmlinkage extern void ret_from_fork(void); @@ -133,6 +134,7 @@ void cpu_idle(void) * from here on, until they go to idle. * Otherwise, idle callbacks can misfire. */ + local_touch_nmi(); local_irq_disable(); enter_idle(); /* Don't trace irqs off for idle */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9242436e993..e334be1182b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -464,7 +464,7 @@ static inline void kb_wait(void) } } -static void vmxoff_nmi(int cpu, struct die_args *args) +static void vmxoff_nmi(int cpu, struct pt_regs *regs) { cpu_emergency_vmxoff(); } @@ -736,14 +736,10 @@ static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct notifier_block *self, - unsigned long val, void *data) +static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) { int cpu; - if (val != DIE_NMI) - return NOTIFY_OK; - cpu = raw_smp_processor_id(); /* Don't do anything if this handler is invoked on crashing cpu. @@ -751,10 +747,10 @@ static int crash_nmi_callback(struct notifier_block *self, * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return NOTIFY_STOP; + return NMI_HANDLED; local_irq_disable(); - shootdown_callback(cpu, (struct die_args *)data); + shootdown_callback(cpu, regs); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -762,7 +758,7 @@ static int crash_nmi_callback(struct notifier_block *self, for (;;) cpu_relax(); - return 1; + return NMI_HANDLED; } static void smp_send_nmi_allbutself(void) @@ -770,12 +766,6 @@ static void smp_send_nmi_allbutself(void) apic->send_IPI_allbutself(NMI_VECTOR); } -static struct notifier_block crash_nmi_nb = { - .notifier_call = crash_nmi_callback, - /* we want to be the first one called */ - .priority = NMI_LOCAL_HIGH_PRIOR+1, -}; - /* Halt all other CPUs, calling the specified function on each of them * * This function can be used to halt all other CPUs on crash @@ -794,7 +784,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); /* Would it be better to replace the trap vector here? */ - if (register_die_notifier(&crash_nmi_nb)) + if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, + NMI_FLAG_FIRST, "crash")) return; /* return what? */ /* Ensure the new callback function is set before sending * out the NMI diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6913369c234..a8e3eb83466 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -81,15 +81,6 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -static int ignore_nmis; - -int unknown_nmi_panic; -/* - * Prevent NMI reason port (0x61) being accessed simultaneously, can - * only be used in NMI handler. - */ -static DEFINE_RAW_SPINLOCK(nmi_reason_lock); - static inline void conditional_sti(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -307,152 +298,6 @@ gp_in_kernel: die("general protection fault", regs, error_code); } -static int __init setup_unknown_nmi_panic(char *str) -{ - unknown_nmi_panic = 1; - return 1; -} -__setup("unknown_nmi_panic", setup_unknown_nmi_panic); - -static notrace __kprobes void -pci_serr_error(unsigned char reason, struct pt_regs *regs) -{ - pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - pr_emerg("Dazed and confused, but trying to continue\n"); - - /* Clear and disable the PCI SERR error line. */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; - outb(reason, NMI_REASON_PORT); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - unsigned long i; - - pr_emerg( - "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", - reason, smp_processor_id()); - show_registers(regs); - - if (panic_on_io_nmi) - panic("NMI IOCK error: Not continuing"); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); - - i = 20000; - while (--i) { - touch_nmi_watchdog(); - udelay(100); - } - - reason &= ~NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs *regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == - NOTIFY_STOP) - return; -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif - pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - pr_emerg("Do you have a strange power saving mode enabled?\n"); - if (unknown_nmi_panic || panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - pr_emerg("Dazed and confused, but trying to continue\n"); -} - -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - - /* - * CPU-specific NMI must be processed before non-CPU-specific - * NMI, otherwise we may lose it, because the CPU-specific - * NMI can not be detected/processed on other CPUs. - */ - if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ - raw_spin_lock(&nmi_reason_lock); - reason = get_nmi_reason(); - - if (reason & NMI_REASON_MASK) { - if (reason & NMI_REASON_SERR) - pci_serr_error(reason, regs); - else if (reason & NMI_REASON_IOCHK) - io_check_error(reason, regs); -#ifdef CONFIG_X86_32 - /* - * Reassert NMI in case it became active - * meanwhile as it's edge-triggered: - */ - reassert_nmi(); -#endif - raw_spin_unlock(&nmi_reason_lock); - return; - } - raw_spin_unlock(&nmi_reason_lock); - - unknown_nmi_error(reason, regs); -} - -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - - inc_irq_stat(__nmi_count); - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; -} - /* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 68894fdc034..c04dc145a4b 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -61,26 +61,15 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, } -static int profile_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - switch (val) { - case DIE_NMI: - if (ctr_running) - model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); - else if (!nmi_enabled) - break; - else - model->stop(&__get_cpu_var(cpu_msrs)); - ret = NOTIFY_STOP; - break; - default: - break; - } - return ret; + if (ctr_running) + model->check_ctrs(regs, &__get_cpu_var(cpu_msrs)); + else if (!nmi_enabled) + return NMI_DONE; + else + model->stop(&__get_cpu_var(cpu_msrs)); + return NMI_HANDLED; } static void nmi_cpu_save_registers(struct op_msrs *msrs) @@ -363,12 +352,6 @@ static void nmi_cpu_setup(void *dummy) apic_write(APIC_LVTPC, APIC_DM_NMI); } -static struct notifier_block profile_exceptions_nb = { - .notifier_call = profile_exceptions_notify, - .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, -}; - static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; @@ -402,8 +385,6 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); - if (model->cpu_down) - model->cpu_down(); } static void nmi_cpu_up(void *dummy) @@ -508,7 +489,8 @@ static int nmi_setup(void) ctr_running = 0; /* make variables visible to the nmi handler: */ smp_mb(); - err = register_die_notifier(&profile_exceptions_nb); + err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify, + 0, "oprofile"); if (err) goto fail; @@ -538,7 +520,7 @@ static void nmi_shutdown(void) put_online_cpus(); /* make variables visible to the nmi handler: */ smp_mb(); - unregister_die_notifier(&profile_exceptions_nb); + unregister_nmi_handler(NMI_LOCAL, "oprofile"); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c index 720bf5a53c5..7f8052cd662 100644 --- a/arch/x86/oprofile/nmi_timer_int.c +++ b/arch/x86/oprofile/nmi_timer_int.c @@ -18,32 +18,16 @@ #include <asm/apic.h> #include <asm/ptrace.h> -static int profile_timer_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - switch (val) { - case DIE_NMI: - oprofile_add_sample(args->regs, 0); - ret = NOTIFY_STOP; - break; - default: - break; - } - return ret; + oprofile_add_sample(regs, 0); + return NMI_HANDLED; } -static struct notifier_block profile_timer_exceptions_nb = { - .notifier_call = profile_timer_exceptions_notify, - .next = NULL, - .priority = NMI_LOW_PRIOR, -}; - static int timer_start(void) { - if (register_die_notifier(&profile_timer_exceptions_nb)) + if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify, + 0, "oprofile-timer")) return 1; return 0; } @@ -51,7 +35,7 @@ static int timer_start(void) static void timer_stop(void) { - unregister_die_notifier(&profile_timer_exceptions_nb); + unregister_nmi_handler(NMI_LOCAL, "oprofile-timer"); synchronize_sched(); /* Allow already-started NMIs to complete. */ } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 9cbb710dc94..303f0863782 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -29,8 +29,6 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 4 -#define NUM_COUNTERS_F15H 6 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX #define NUM_VIRT_COUNTERS 32 #else @@ -70,62 +68,12 @@ static struct ibs_config ibs_config; static struct ibs_state ibs_state; /* - * IBS cpuid feature detection - */ - -#define IBS_CPUID_FEATURES 0x8000001b - -/* - * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but - * bit 0 is used to indicate the existence of IBS. - */ -#define IBS_CAPS_AVAIL (1U<<0) -#define IBS_CAPS_FETCHSAM (1U<<1) -#define IBS_CAPS_OPSAM (1U<<2) -#define IBS_CAPS_RDWROPCNT (1U<<3) -#define IBS_CAPS_OPCNT (1U<<4) -#define IBS_CAPS_BRNTRGT (1U<<5) -#define IBS_CAPS_OPCNTEXT (1U<<6) - -#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ - | IBS_CAPS_FETCHSAM \ - | IBS_CAPS_OPSAM) - -/* - * IBS APIC setup - */ -#define IBSCTL 0x1cc -#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) -#define IBSCTL_LVT_OFFSET_MASK 0x0F - -/* * IBS randomization macros */ #define IBS_RANDOM_BITS 12 #define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) #define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) -static u32 get_ibs_caps(void) -{ - u32 ibs_caps; - unsigned int max_level; - - if (!boot_cpu_has(X86_FEATURE_IBS)) - return 0; - - /* check IBS cpuid feature flags */ - max_level = cpuid_eax(0x80000000); - if (max_level < IBS_CPUID_FEATURES) - return IBS_CAPS_DEFAULT; - - ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); - if (!(ibs_caps & IBS_CAPS_AVAIL)) - /* cpuid flags not valid */ - return IBS_CAPS_DEFAULT; - - return ibs_caps; -} - /* * 16-bit Linear Feedback Shift Register (LFSR) * @@ -316,81 +264,6 @@ static void op_amd_stop_ibs(void) wrmsrl(MSR_AMD64_IBSOPCTL, 0); } -static inline int get_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); -} - -static inline int put_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, 0, 1); -} - -static inline int ibs_eilvt_valid(void) -{ - int offset; - u64 val; - int valid = 0; - - preempt_disable(); - - rdmsrl(MSR_AMD64_IBSCTL, val); - offset = val & IBSCTL_LVT_OFFSET_MASK; - - if (!(val & IBSCTL_LVT_OFFSET_VALID)) { - pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - if (!get_eilvt(offset)) { - pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - valid = 1; -out: - preempt_enable(); - - return valid; -} - -static inline int get_ibs_offset(void) -{ - u64 val; - - rdmsrl(MSR_AMD64_IBSCTL, val); - if (!(val & IBSCTL_LVT_OFFSET_VALID)) - return -EINVAL; - - return val & IBSCTL_LVT_OFFSET_MASK; -} - -static void setup_APIC_ibs(void) -{ - int offset; - - offset = get_ibs_offset(); - if (offset < 0) - goto failed; - - if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) - return; -failed: - pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n", - smp_processor_id()); -} - -static void clear_APIC_ibs(void) -{ - int offset; - - offset = get_ibs_offset(); - if (offset >= 0) - setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); -} - #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, @@ -439,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - if (num_counters == NUM_COUNTERS_F15H) { + if (num_counters == AMD64_NUM_COUNTERS_F15H) { msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); } else { @@ -504,15 +377,6 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, val |= op_x86_get_ctrl(model, &counter_config[virt]); wrmsrl(msrs->controls[i].addr, val); } - - if (ibs_caps) - setup_APIC_ibs(); -} - -static void op_amd_cpu_shutdown(void) -{ - if (ibs_caps) - clear_APIC_ibs(); } static int op_amd_check_ctrs(struct pt_regs * const regs, @@ -575,86 +439,6 @@ static void op_amd_stop(struct op_msrs const * const msrs) op_amd_stop_ibs(); } -static int setup_ibs_ctl(int ibs_eilvt_off) -{ - struct pci_dev *cpu_cfg; - int nodes; - u32 value = 0; - - nodes = 0; - cpu_cfg = NULL; - do { - cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, - cpu_cfg); - if (!cpu_cfg) - break; - ++nodes; - pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVT_OFFSET_VALID); - pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { - pci_dev_put(cpu_cfg); - printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x\n", value); - return -EINVAL; - } - } while (1); - - if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS\n"); - return -ENODEV; - } - - return 0; -} - -/* - * This runs only on the current cpu. We try to find an LVT offset and - * setup the local APIC. For this we must disable preemption. On - * success we initialize all nodes with this offset. This updates then - * the offset in the IBS_CTL per-node msr. The per-core APIC setup of - * the IBS interrupt vector is called from op_amd_setup_ctrs()/op_- - * amd_cpu_shutdown() using the new offset. - */ -static int force_ibs_eilvt_setup(void) -{ - int offset; - int ret; - - preempt_disable(); - /* find the next free available EILVT entry, skip offset 0 */ - for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { - if (get_eilvt(offset)) - break; - } - preempt_enable(); - - if (offset == APIC_EILVT_NR_MAX) { - printk(KERN_DEBUG "No EILVT entry available\n"); - return -EBUSY; - } - - ret = setup_ibs_ctl(offset); - if (ret) - goto out; - - if (!ibs_eilvt_valid()) { - ret = -EFAULT; - goto out; - } - - pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); - pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); - - return 0; -out: - preempt_disable(); - put_eilvt(offset); - preempt_enable(); - return ret; -} - /* * check and reserve APIC extended interrupt LVT offset for IBS if * available @@ -667,17 +451,6 @@ static void init_ibs(void) if (!ibs_caps) return; - if (ibs_eilvt_valid()) - goto out; - - if (!force_ibs_eilvt_setup()) - goto out; - - /* Failed to setup ibs */ - ibs_caps = 0; - return; - -out: printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); } @@ -741,9 +514,9 @@ static int op_amd_init(struct oprofile_operations *ops) ops->create_files = setup_ibs_files; if (boot_cpu_data.x86 == 0x15) { - num_counters = NUM_COUNTERS_F15H; + num_counters = AMD64_NUM_COUNTERS_F15H; } else { - num_counters = NUM_COUNTERS; + num_counters = AMD64_NUM_COUNTERS; } op_amd_spec.num_counters = num_counters; @@ -760,7 +533,6 @@ struct op_x86_model_spec op_amd_spec = { .init = op_amd_init, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, - .cpu_down = &op_amd_cpu_shutdown, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 89017fa1fd6..71e8a67337e 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -43,7 +43,6 @@ struct op_x86_model_spec { int (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_x86_model_spec const *model, struct op_msrs const * const msrs); - void (*cpu_down)(void); int (*check_ctrs)(struct pt_regs * const regs, struct op_msrs const * const msrs); void (*start)(struct op_msrs const * const msrs); diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 0784f99a466..b8e08cb67a1 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -50,6 +50,7 @@ #include <acpi/hed.h> #include <asm/mce.h> #include <asm/tlbflush.h> +#include <asm/nmi.h> #include "apei-internal.h" @@ -749,15 +750,11 @@ static void ghes_proc_in_irq(struct irq_work *irq_work) } } -static int ghes_notify_nmi(struct notifier_block *this, - unsigned long cmd, void *data) +static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs) { struct ghes *ghes, *ghes_global = NULL; int sev, sev_global = -1; - int ret = NOTIFY_DONE; - - if (cmd != DIE_NMI) - return ret; + int ret = NMI_DONE; raw_spin_lock(&ghes_nmi_lock); list_for_each_entry_rcu(ghes, &ghes_nmi, list) { @@ -770,10 +767,10 @@ static int ghes_notify_nmi(struct notifier_block *this, sev_global = sev; ghes_global = ghes; } - ret = NOTIFY_STOP; + ret = NMI_HANDLED; } - if (ret == NOTIFY_DONE) + if (ret == NMI_DONE) goto out; if (sev_global >= GHES_SEV_PANIC) { @@ -825,10 +822,6 @@ static struct notifier_block ghes_notifier_sci = { .notifier_call = ghes_notify_sci, }; -static struct notifier_block ghes_notifier_nmi = { - .notifier_call = ghes_notify_nmi, -}; - static unsigned long ghes_esource_prealloc_size( const struct acpi_hest_generic *generic) { @@ -918,7 +911,8 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev) ghes_estatus_pool_expand(len); mutex_lock(&ghes_list_mutex); if (list_empty(&ghes_nmi)) - register_die_notifier(&ghes_notifier_nmi); + register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, + "ghes"); list_add_rcu(&ghes->list, &ghes_nmi); mutex_unlock(&ghes_list_mutex); break; @@ -964,7 +958,7 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); if (list_empty(&ghes_nmi)) - unregister_die_notifier(&ghes_notifier_nmi); + unregister_nmi_handler(NMI_LOCAL, "ghes"); mutex_unlock(&ghes_list_mutex); /* * To synchronize with NMI handler, ghes can only be diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 3302586655c..c2917ffad2c 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -65,6 +65,7 @@ * mechanism for it at that time. */ #include <asm/kdebug.h> +#include <asm/nmi.h> #define HAVE_DIE_NMI #endif @@ -1077,17 +1078,8 @@ static void ipmi_unregister_watchdog(int ipmi_intf) #ifdef HAVE_DIE_NMI static int -ipmi_nmi(struct notifier_block *self, unsigned long val, void *data) +ipmi_nmi(unsigned int val, struct pt_regs *regs) { - struct die_args *args = data; - - if (val != DIE_NMIUNKNOWN) - return NOTIFY_OK; - - /* Hack, if it's a memory or I/O error, ignore it. */ - if (args->err & 0xc0) - return NOTIFY_OK; - /* * If we get here, it's an NMI that's not a memory or I/O * error. We can't truly tell if it's from IPMI or not @@ -1097,15 +1089,15 @@ ipmi_nmi(struct notifier_block *self, unsigned long val, void *data) if (testing_nmi) { testing_nmi = 2; - return NOTIFY_STOP; + return NMI_HANDLED; } /* If we are not expecting a timeout, ignore it. */ if (ipmi_watchdog_state == WDOG_TIMEOUT_NONE) - return NOTIFY_OK; + return NMI_DONE; if (preaction_val != WDOG_PRETIMEOUT_NMI) - return NOTIFY_OK; + return NMI_DONE; /* * If no one else handled the NMI, we assume it was the IPMI @@ -1120,12 +1112,8 @@ ipmi_nmi(struct notifier_block *self, unsigned long val, void *data) panic(PFX "pre-timeout"); } - return NOTIFY_STOP; + return NMI_HANDLED; } - -static struct notifier_block ipmi_nmi_handler = { - .notifier_call = ipmi_nmi -}; #endif static int wdog_reboot_handler(struct notifier_block *this, @@ -1290,7 +1278,8 @@ static void check_parms(void) } } if (do_nmi && !nmi_handler_registered) { - rv = register_die_notifier(&ipmi_nmi_handler); + rv = register_nmi_handler(NMI_UNKNOWN, ipmi_nmi, 0, + "ipmi"); if (rv) { printk(KERN_WARNING PFX "Can't register nmi handler\n"); @@ -1298,7 +1287,7 @@ static void check_parms(void) } else nmi_handler_registered = 1; } else if (!do_nmi && nmi_handler_registered) { - unregister_die_notifier(&ipmi_nmi_handler); + unregister_nmi_handler(NMI_UNKNOWN, "ipmi"); nmi_handler_registered = 0; } #endif @@ -1336,7 +1325,7 @@ static int __init ipmi_wdog_init(void) if (rv) { #ifdef HAVE_DIE_NMI if (nmi_handler_registered) - unregister_die_notifier(&ipmi_nmi_handler); + unregister_nmi_handler(NMI_UNKNOWN, "ipmi"); #endif atomic_notifier_chain_unregister(&panic_notifier_list, &wdog_panic_notifier); @@ -1357,7 +1346,7 @@ static void __exit ipmi_wdog_exit(void) #ifdef HAVE_DIE_NMI if (nmi_handler_registered) - unregister_die_notifier(&ipmi_nmi_handler); + unregister_nmi_handler(NMI_UNKNOWN, "ipmi"); #endif atomic_notifier_chain_unregister(&panic_notifier_list, diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 809cbda03d7..3774c9b8dac 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -35,6 +35,7 @@ #include <linux/notifier.h> #include <asm/cacheflush.h> #endif /* CONFIG_HPWDT_NMI_DECODING */ +#include <asm/nmi.h> #define HPWDT_VERSION "1.3.0" #define SECS_TO_TICKS(secs) ((secs) * 1000 / 128) @@ -477,15 +478,11 @@ static int hpwdt_time_left(void) /* * NMI Handler */ -static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason, - void *data) +static int hpwdt_pretimeout(unsigned int ulReason, struct pt_regs *regs) { unsigned long rom_pl; static int die_nmi_called; - if (ulReason != DIE_NMIUNKNOWN) - goto out; - if (!hpwdt_nmi_decoding) goto out; @@ -508,7 +505,7 @@ static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason, "Management Log for details.\n"); out: - return NOTIFY_OK; + return NMI_DONE; } #endif /* CONFIG_HPWDT_NMI_DECODING */ @@ -648,13 +645,6 @@ static struct miscdevice hpwdt_miscdev = { .fops = &hpwdt_fops, }; -#ifdef CONFIG_HPWDT_NMI_DECODING -static struct notifier_block die_notifier = { - .notifier_call = hpwdt_pretimeout, - .priority = 0, -}; -#endif /* CONFIG_HPWDT_NMI_DECODING */ - /* * Init & Exit */ @@ -740,10 +730,9 @@ static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev) * die notify list to handle a critical NMI. The default is to * be last so other users of the NMI signal can function. */ - if (priority) - die_notifier.priority = 0x7FFFFFFF; - - retval = register_die_notifier(&die_notifier); + retval = register_nmi_handler(NMI_UNKNOWN, hpwdt_pretimeout, + (priority) ? NMI_FLAG_FIRST : 0, + "hpwdt"); if (retval != 0) { dev_warn(&dev->dev, "Unable to register a die notifier (err=%d).\n", @@ -763,7 +752,7 @@ static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev) static void hpwdt_exit_nmi_decoding(void) { - unregister_die_notifier(&die_notifier); + unregister_nmi_handler(NMI_UNKNOWN, "hpwdt"); if (cru_rom_addr) iounmap(cru_rom_addr); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c816075c01c..1e9ebe5e009 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -220,7 +220,10 @@ struct perf_event_attr { mmap_data : 1, /* non-exec mmap data */ sample_id_all : 1, /* sample_type all events */ - __reserved_1 : 45; + exclude_host : 1, /* don't count in host */ + exclude_guest : 1, /* don't count in guest */ + + __reserved_1 : 43; union { __u32 wakeup_events; /* wakeup every n events */ |