From 123aa76ec0cab5d4881cd8509faed43231e68801 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:27 +0100 Subject: x86, mce: don't disable machine checks during code patching Impact: low priority bug fix This removes part of a a patch I added myself some time ago. After some consideration the patch was a bad idea. In particular it stopped machine check exceptions during code patching. To quote the comment: * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than a unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. So undo the machine check related parts of 8f4e956b313dcccbc7be6f10808952345e3b638c NMIs are still disabled. This only removes code, the only additions are a new comment. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm/mce.h') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 32c6e17b960..5522273a3ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -120,8 +120,6 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do { } while (0) #endif -extern void stop_mce(void); -extern void restart_mce(void); #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ -- cgit v1.2.3-70-g09d2 From b5f2fa4ea00a179ac1c2ff342ceeee261dd75e53 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:22 +0100 Subject: x86, mce: factor out duplicated struct mce setup into one function Impact: cleanup This merely factors out duplicated code to set up the initial struct mce state into a single function. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 3 ++- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 ++++++++++++++--------- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 4 files changed, 18 insertions(+), 14 deletions(-) (limited to 'arch/x86/include/asm/mce.h') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5522273a3ad..048b71d9387 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -90,6 +90,7 @@ extern int mce_disabled; #include +void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, device_mce); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -106,7 +107,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -void mce_log_therm_throt_event(unsigned int cpu, __u64 status); +void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 2297730bb51..fed875742b1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -65,6 +65,14 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -208,8 +216,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) || !banks) goto out2; - memset(&m, 0, sizeof(struct mce)); - m.cpu = smp_processor_id(); + mce_setup(&m); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -225,7 +233,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) @@ -252,8 +259,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code >= 0) - rdtscll(m.tsc); + if (error_code < 0) + m.tsc = 0; if (error_code != -2) mce_log(&m); @@ -341,15 +348,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +void mce_log_therm_throt_event(__u64 status) { struct mce m; - memset(&m, 0, sizeof(m)); - m.cpu = cpu; + mce_setup(&m); m.bank = MCE_THERMAL_BANK; m.status = status; - rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 8ae8c4ff094..75d9dd25e3d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -197,9 +197,7 @@ asmlinkage void mce_threshold_interrupt(void) exit_idle(); irq_enter(); - memset(&m, 0, sizeof(m)); - rdtscll(m.tsc); - m.cpu = smp_processor_id(); + mce_setup(&m); /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 4b48f251fd3..7f7f1015ef1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -24,7 +24,7 @@ asmlinkage void smp_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(smp_processor_id(), msr_val); + mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); irq_exit(); -- cgit v1.2.3-70-g09d2 From b79109c3bbcf52cac5103979b283b9e5df4e796c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:23 +0100 Subject: x86, mce: separate correct machine check poller and fatal exception handler Impact: cleanup, performance enhancement The machine check poller is diverging more and more from the fatal exception handler. Instead of adding more special cases separate the code paths completely. The corrected poll path is actually quite simple, and this doesn't result in much code duplication. This makes both handlers much easier to read and results in cleaner code flow. The exception handler now only needs to care about uncorrected errors, which also simplifies the handling of multiple errors. The corrected poller also now always runs in standard interrupt context and does not need to do anything special to handle NMI context. Minor behaviour changes: - MCG status is now not cleared on polling. - Only the banks which had corrected errors get cleared on polling - The exception handler only clears banks with errors now v2: Forward port to new patch order. Add "uc" argument. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 7 ++ arch/x86/kernel/cpu/mcheck/mce_64.c | 129 ++++++++++++++++++++++++++------ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +- 3 files changed, 116 insertions(+), 22 deletions(-) (limited to 'arch/x86/include/asm/mce.h') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 048b71d9387..225cdb5d2bf 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -112,6 +112,13 @@ void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; extern void do_machine_check(struct pt_regs *, long); + +enum mcp_flags { + MCP_TIMESTAMP = (1 << 0), /* log time stamp */ + MCP_UC = (1 << 1), /* log uncorrected errors */ +}; +extern void machine_check_poll(enum mcp_flags flags); + extern int mce_notify_user(void); #endif /* !CONFIG_X86_32 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fed875742b1..268b05edade 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,6 +3,8 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen */ #include @@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * The actual machine check handler + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if ((regs - && notify_die(DIE_NMI, "machine check", regs, error_code, + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - || !banks) + goto out2; + if (!banks) goto out2; mce_setup(&m); @@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { + __clear_bit(i, toclear); if (!bank[i]) continue; @@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code) if ((m.status & MCI_STATUS_VAL) == 0) continue; + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; } if (m.status & MCI_STATUS_MISCV) @@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code < 0) - m.tsc = 0; - if (error_code != -2) - mce_log(&m); + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } - - add_taint(TAINT_MACHINE_CHECK); } - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) @@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -494,9 +580,10 @@ static void mce_init(void *dummy) u64 cap; int i; - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, mce_bootlog ? -1 : -2); + /* + * Log the machine checks left over from the previous reset. + */ + machine_check_poll(MCP_UC); set_in_cr4(X86_CR4_MCE); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 75d9dd25e3d..0069c653f4e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -231,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); -- cgit v1.2.3-70-g09d2 From 41fdff322e26c4a86fe65cf577f2556a650cb7bc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:30 +0100 Subject: x86, mce, cmci: export MAX_NR_BANKS Impact: Cleanup (code movement) Move MAX_NR_BANKS into mce.h because it's needed there for followup patches. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 6 ++++++ arch/x86/kernel/cpu/mcheck/mce_64.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/include/asm/mce.h') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 225cdb5d2bf..39136c497c5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -95,6 +95,12 @@ void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, device_mce); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* + * To support more than 128 would need to escape the predefined + * Linux defined extended banks first. + */ +#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) + #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a4a7c686ce9..39f8bb525a7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -37,12 +37,6 @@ #define MISC_MCELOG_MINOR 227 -/* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. - */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) - atomic_t mce_entry; static int mce_dont_init; -- cgit v1.2.3-70-g09d2 From b276268631af3a1b0df871e10d19d492f0513d4b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:31 +0100 Subject: x86, mce, cmci: factor out threshold interrupt handler Impact: cleanup; preparation for feature The mce_amd_64 code has an own private MC threshold vector with an own interrupt handler. Since Intel needs a similar handler it makes sense to share the vector because both can not be active at the same time. I factored the common APIC handler code into a separate file which can be used by both the Intel or AMD MC code. This is needed for the next patch which adds an Intel specific CMCI handler. This patch should be a nop for AMD, it just moves some code around. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 5 +++++ arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 15 ++++++--------- arch/x86/kernel/cpu/mcheck/threshold.c | 24 ++++++++++++++++++++++++ 5 files changed, 38 insertions(+), 9 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/threshold.c (limited to 'arch/x86/include/asm/mce.h') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9c39095b33f..52d7013785f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -751,6 +751,11 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. +config X86_MCE_THRESHOLD + depends on X86_MCE_AMD || X86_MCE_INTEL + bool + default y + config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_32 && X86_MCE diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 39136c497c5..125cd871462 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -135,5 +135,7 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #define mcheck_init(c) do { } while (0) #endif +extern void (*mce_threshold_vector)(void); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index d7d2323bbb6..b2f89829bbe 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index e82c8208b81..49705be9820 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = { static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static void amd_threshold_interrupt(void); + /* * CPU Initialization */ @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) tr.reset = 0; tr.old_limit = 0; threshold_restart_bank(&tr); + + mce_threshold_vector = amd_threshold_interrupt; } } } @@ -187,16 +191,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ -asmlinkage void mce_threshold_interrupt(void) +static void amd_threshold_interrupt(void) { unsigned int bank, block; struct mce m; u32 low = 0, high = 0, address = 0; - ack_APIC_irq(); - exit_idle(); - irq_enter(); - mce_setup(&m); /* assume first bank caused it */ @@ -241,13 +241,10 @@ asmlinkage void mce_threshold_interrupt(void) + bank * NR_BLOCKS + block; mce_log(&m); - goto out; + return; } } } -out: - inc_irq_stat(irq_threshold_count); - irq_exit(); } /* diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 00000000000..4319142413d --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -0,0 +1,24 @@ +/* Common corrected MCE threshold handler code */ +#include +#include +#include +#include +#include + +static void default_threshold_interrupt(void) +{ + printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +asmlinkage void mce_threshold_interrupt(void) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + irq_exit(); +} -- cgit v1.2.3-70-g09d2 From ee031c31d6381d004bfd386c2e45821211507499 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:34 +0100 Subject: x86, mce, cmci: use polled banks bitmap in machine check poller Define a per cpu bitmap that contains the banks polled by the machine check poller. This is needed for the CMCI code in the next patches to be able to disable polling on specific banks. The bank by default contains all banks, so there is no behaviour change. Only future code will remove some banks from the polling set. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 5 ++++- arch/x86/kernel/cpu/mcheck/mce_64.c | 16 ++++++++++++---- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 3 ++- 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86/include/asm/mce.h') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 125cd871462..9b9523699db 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -119,11 +119,14 @@ extern atomic_t mce_entry; extern void do_machine_check(struct pt_regs *, long); +typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); +DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); + enum mcp_flags { MCP_TIMESTAMP = (1 << 0), /* log time stamp */ MCP_UC = (1 << 1), /* log uncorrected errors */ }; -extern void machine_check_poll(enum mcp_flags flags); +extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); extern int mce_notify_user(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 9017609cadd..a8ff38bfa6e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -62,6 +62,11 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -191,7 +196,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) * * This is executed in standard interrupt context. */ -void machine_check_poll(enum mcp_flags flags) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; int i; @@ -200,7 +205,7 @@ void machine_check_poll(enum mcp_flags flags) rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (!bank[i] || !test_bit(i, *b)) continue; m.misc = 0; @@ -458,7 +463,8 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -572,11 +578,13 @@ static void mce_init(void *dummy) { u64 cap; int i; + mce_banks_t all_banks; /* * Log the machine checks left over from the previous reset. */ - machine_check_poll(MCP_UC); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC, &all_banks); set_in_cr4(X86_CR4_MCE); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 49705be9820..ee8bfcd3aa3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -231,7 +231,8 @@ static void amd_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); -- cgit v1.2.3-70-g09d2 From 03195c6b40f2b4db92545921daa7c3a19b4e4c32 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:35 +0100 Subject: x86, mce, cmci: define MSR names and fields for new CMCI registers Impact: New register definitions only CMCI means support for raising an interrupt on a corrected machine check event instead of having to poll for it. It's a new feature in Intel Nehalem CPUs available on some machine check banks. For details see the IA32 SDM Vol3a 14.5 Define the registers for it as a preparation for further patches. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/apicdef.h | 1 + arch/x86/include/asm/mce.h | 2 ++ arch/x86/include/asm/msr-index.h | 5 +++++ 3 files changed, 8 insertions(+) (limited to 'arch/x86/include/asm/mce.h') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 63134e31e8b..bc9514fb3b1 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -53,6 +53,7 @@ #define APIC_ESR_SENDILL 0x00020 #define APIC_ESR_RECVILL 0x00040 #define APIC_ESR_ILLREGA 0x00080 +#define APIC_LVTCMCI 0x2f0 #define APIC_ICR 0x300 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9b9523699db..6fc5e07eca4 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -11,6 +11,8 @@ */ #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ +#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ +#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 358acc59ae0..2dbd2314139 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -77,6 +77,11 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +/* These are consecutive and not in the normal 4er MCE bank block */ +#define MSR_IA32_MC0_CTL2 0x00000280 +#define CMCI_EN (1ULL << 30) +#define CMCI_THRESHOLD_MASK 0xffffULL + #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 -- cgit v1.2.3-70-g09d2 From 88ccbedd9ca85d1aca6a6f99df48dce87b7c02d4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:36 +0100 Subject: x86, mce, cmci: add CMCI support Impact: Major new feature Intel CMCI (Corrected Machine Check Interrupt) is a new feature on Nehalem CPUs. It allows the CPU to trigger interrupts on corrected events, which allows faster reaction to them instead of with the traditional polling timer. Also use CMCI to discover shared banks. Machine check banks can be shared by CPU threads or even cores. Using the CMCI enable bit it is possible to detect the fact that another CPU already saw a specific bank. Use this to assign shared banks only to one CPU to avoid reporting duplicated events. On CPU hot unplug bank sharing is re discovered. This is done using a thread that cycles through all the CPUs. To avoid races between the poller and CMCI we only poll for banks that are not CMCI capable and only check CMCI owned banks on a interrupt. The shared banks ownership information is currently only used for CMCI interrupts, not polled banks. The sharing discovery code follows the algorithm recommended in the IA32 SDM Vol3a 14.5.2.1 The CMCI interrupt handler just calls the machine check poller to pick up the machine check event that caused the interrupt. I decided not to implement a separate threshold event like the AMD version has, because the threshold is always one currently and adding another event didn't seem to add any value. Some code inspired by Yunhong Jiang's Xen implementation, which was in term inspired by a earlier CMCI implementation by me. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 ++ arch/x86/kernel/cpu/mcheck/mce_64.c | 16 ++- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 205 ++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm/mce.h') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6fc5e07eca4..563933e06a3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -105,8 +105,16 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); +void cmci_clear(void); +void cmci_reenable(void); +void cmci_rediscover(int dying); +void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void cmci_clear(void) {} +static inline void cmci_reenable(void) {} +static inline void cmci_rediscover(int dying) {} +static inline void cmci_recheck(void) {} #endif #ifdef CONFIG_X86_MCE_AMD @@ -115,6 +123,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif +extern int mce_available(struct cpuinfo_x86 *c); + void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a8ff38bfa6e..bfbd5323a63 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -166,7 +166,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -static int mce_available(struct cpuinfo_x86 *c) +int mce_available(struct cpuinfo_x86 *c) { if (mce_dont_init) return 0; @@ -1060,9 +1060,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu) static void mce_disable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } @@ -1070,9 +1073,12 @@ static void mce_disable_cpu(void *h) static void mce_reenable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); } @@ -1100,13 +1106,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); break; } return NOTIFY_OK; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 1b1491a76b5..a518ec8c6f8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,6 +1,8 @@ /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen */ #include @@ -12,6 +14,7 @@ #include #include #include +#include asmlinkage void smp_thermal_interrupt(void) { @@ -84,7 +87,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; } +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static __cpuinit int cmci_supported(int *banks) +{ + u64 cap; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_user(); +} + +static void print_update(char *type, int *hdr, int num) +{ + if (*hdr == 0) + printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); + *hdr = 1; + printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static __cpuinit void cmci_discover(int banks, int boot) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + int hdr = 0; + int i; + + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + u64 val; + + if (test_bit(i, owned)) + continue; + + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Already owned by someone else? */ + if (val & CMCI_EN) { + if (test_and_clear_bit(i, owned) || boot) + print_update("SHD", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + val |= CMCI_EN | CMCI_THRESHOLD; + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & CMCI_EN) { + if (!test_and_set_bit(i, owned) || boot) + print_update("CMCI", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock(&cmci_discover_lock); + if (hdr) + printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +__cpuinit void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void __cpuexit cmci_clear(void) +{ + int i; + int banks; + u64 val; + + if (!cmci_supported(&banks)) + return; + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + /* Disable CMCI */ + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + __clear_bit(i, __get_cpu_var(mce_banks_owned)); + } + spin_unlock(&cmci_discover_lock); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. + */ +void __cpuexit cmci_rediscover(int dying) +{ + int banks; + int cpu; + cpumask_var_t old; + + if (!cmci_supported(&banks)) + return; + if (!alloc_cpumask_var(&old, GFP_KERNEL)) + return; + cpumask_copy(old, ¤t->cpus_allowed); + + for_each_online_cpu (cpu) { + if (cpu == dying) + continue; + if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) + continue; + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks, 0); + } + + set_cpus_allowed_ptr(current, old); + free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks, 0); +} + +static __cpuinit void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks, 1); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); + intel_init_cmci(); } -- cgit v1.2.3-70-g09d2