From 4b737d78a8081cb2def7ced262a212c31363539a Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Tue, 23 Sep 2014 10:16:01 +0800 Subject: x86, MCE, AMD: Use macros to compute bank MSRs Avoid open coded calculations for bank MSRs by hiding the index of higher bank MSRs in well-defined macros. No semantic changes. Signed-off-by: Chen Yucong Link: http://lkml.kernel.org/r/1411438561-24319-1-git-send-email-slaoub@gmail.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5d4999f95ae..f8c56bd7b74 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -217,7 +217,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) - address = MSR_IA32_MC0_MISC + bank * 4; + address = MSR_IA32_MCx_MISC(bank); else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) @@ -281,7 +281,7 @@ static void amd_threshold_interrupt(void) continue; for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) { - address = MSR_IA32_MC0_MISC + bank * 4; + address = MSR_IA32_MCx_MISC(bank); } else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) @@ -314,8 +314,7 @@ static void amd_threshold_interrupt(void) if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); - rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, - m.status); + rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); m.bank = K8_MCE_THRESHOLD_BASE + bank * NR_BLOCKS + block; @@ -617,8 +616,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); + err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); if (!err) goto out; -- cgit v1.2.3-70-g09d2 
From 44612a3ac671d7b9a9b987ab73dcc776204ac4d5 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Thu, 2 Oct 2014 14:48:19 +0200 Subject: x86, MCE, AMD: Correct thresholding error logging mce_setup() does not gather the content of IA32_MCG_STATUS, so it should be read explicitly. Moreover, we need to clear IA32_MCx_STATUS so that mce_log() does not log the already-processed threshold event again next time. But we do the logging ourselves and machine_check_poll() is completely useless there. So kill it. Signed-off-by: Chen Yucong Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f8c56bd7b74..9ce64955559 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -270,14 +270,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; + int cpu = smp_processor_id(); unsigned int bank, block; struct mce m; - mce_setup(&m); - /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) { @@ -309,20 +308,21 @@ static void amd_threshold_interrupt(void) * Log the machine check that caused the threshold * event. 
*/ - machine_check_poll(MCP_TIMESTAMP, - this_cpu_ptr(&mce_poll_banks)); - - if (high & MASK_OVERFLOW_HI) { - rdmsrl(address, m.misc); - rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - m.bank = K8_MCE_THRESHOLD_BASE - + bank * NR_BLOCKS - + block; - mce_log(&m); - return; - } + if (high & MASK_OVERFLOW_HI) + goto log; } } + return; + +log: + mce_setup(&m); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + rdmsrl(address, m.misc); + rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); + m.bank = K8_MCE_THRESHOLD_BASE + bank * NR_BLOCKS + block; + mce_log(&m); + + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } /* -- cgit v1.2.3-70-g09d2 From 69b957583580bf40624553c64d802fefb54199cb Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Thu, 2 Oct 2014 23:20:12 +0800 Subject: x86, MCE, AMD: Move invariant code out from loop body Assigning to mce_threshold_vector is loop-invariant code in mce_amd_feature_init(). So do it only once, out of loop body. Signed-off-by: Chen Yucong Link: http://lkml.kernel.org/r/1412263212.8085.6.camel@debian [ Boris: commit message corrections. 
] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9ce64955559..9af7bd74828 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -253,7 +253,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) } mce_threshold_block_init(&b, offset); - mce_threshold_vector = amd_threshold_interrupt; + + if (mce_threshold_vector != amd_threshold_interrupt) + mce_threshold_vector = amd_threshold_interrupt; } } } -- cgit v1.2.3-70-g09d2 From a3a529d104ec5149fb9a667dce988635941be1ed Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 21 Oct 2014 22:19:59 +0200 Subject: x86, MCE, AMD: Drop software-defined bank in error thresholding Aravind had the good question about why we're assigning a software-defined bank when reporting error thresholding errors instead of simply using the bank which reports the last error causing the overflow. Digging through git history, it pointed to 95268664390b ("[PATCH] x86_64: mce_amd support for family 0x10 processors") which added that functionality. The problem with this, however, is that tools don't know about software-defined banks and get puzzled. So drop that K8_MCE_THRESHOLD_BASE and simply use the hw bank reporting the thresholding interrupt. Save us a couple of MSR reads while at it. 
Reported-by: Aravind Gopalakrishnan Link: https://lkml.kernel.org/r/5435B206.60402@amd.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 1 - arch/x86/kernel/cpu/mcheck/mce_amd.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 958b90f761e..276392f121f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -78,7 +78,6 @@ /* Software defined banks */ #define MCE_EXTENDED_BANK 128 #define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0) -#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9af7bd74828..6606523ff1c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -318,10 +318,9 @@ static void amd_threshold_interrupt(void) log: mce_setup(&m); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - rdmsrl(address, m.misc); rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - m.bank = K8_MCE_THRESHOLD_BASE + bank * NR_BLOCKS + block; + m.misc = ((u64)high << 32) | low; + m.bank = bank; mce_log(&m); wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); -- cgit v1.2.3-70-g09d2 From 8dcf32ea220d87ca517e164de85d336480c9d172 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Sat, 1 Nov 2014 11:23:32 +0100 Subject: x86, MCE, AMD: Assign interrupt handler only when bank supports it There are some AMD CPU models which have thresholding banks but which cannot generate a thresholding interrupt. This is denoted by the bit MCi_MISC[IntP]. Make sure to check that bit before assigning the thresholding interrupt handler. Signed-off-by: Chen Yucong [ Boris: save an indentation level and rewrite commit message. 
] Link: http://lkml.kernel.org/r/1412662128.28440.18.camel@debian Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 6606523ff1c..f1c3769bbd6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -212,7 +212,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1; + int offset = -1, new; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -247,15 +247,18 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.address = address; b.interrupt_capable = lvt_interrupt_supported(bank, high); - if (b.interrupt_capable) { - int new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); - } + if (!b.interrupt_capable) + goto init; - mce_threshold_block_init(&b, offset); + new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); - if (mce_threshold_vector != amd_threshold_interrupt) + if ((offset == new) && + (mce_threshold_vector != amd_threshold_interrupt)) mce_threshold_vector = amd_threshold_interrupt; + +init: + mce_threshold_block_init(&b, offset); } } } -- cgit v1.2.3-70-g09d2 From e3480271f59253cb60d030aa5e615bf00b731fea Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Tue, 18 Nov 2014 10:09:19 +0800 Subject: x86, mce, severity: Extend the mce_severity mechanism to handle UCNA/DEFERRED error Until now, the mce_severity mechanism can only identify the severity of UCNA error as MCE_KEEP_SEVERITY. Meanwhile, it is not able to filter out DEFERRED error for AMD platform. This patch extends the mce_severity mechanism for handling UCNA/DEFERRED error. 
In order to do this, the patch introduces a new severity level - MCE_UCNA/DEFERRED_SEVERITY. In addition, mce_severity is specific to machine check exception, and it will check MCIP/EIPV/RIPV bits. In order to use mce_severity mechanism in non-exception context, the patch also introduces a new argument (is_excp) for mce_severity. `is_excp' is used to explicitly specify the calling context of mce_severity. Reviewed-by: Aravind Gopalakrishnan Signed-off-by: Chen Yucong Signed-off-by: Tony Luck --- arch/x86/include/asm/mce.h | 4 ++++ arch/x86/kernel/cpu/mcheck/mce-internal.h | 4 +++- arch/x86/kernel/cpu/mcheck/mce-severity.c | 23 +++++++++++++++++------ arch/x86/kernel/cpu/mcheck/mce.c | 14 ++++++++------ drivers/edac/mce_amd.h | 3 --- 5 files changed, 32 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 276392f121f..51b26e89593 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -34,6 +34,10 @@ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ +/* AMD-specific bits */ +#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ +#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ + /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is * bits 15:0. 
But bit 12 is the 'F' bit, defined for corrected diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 09edd0b65fe..10b46906767 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -3,6 +3,8 @@ enum severity_level { MCE_NO_SEVERITY, + MCE_DEFERRED_SEVERITY, + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, MCE_KEEP_SEVERITY, MCE_SOME_SEVERITY, MCE_AO_SEVERITY, @@ -21,7 +23,7 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; -int mce_severity(struct mce *a, int tolerant, char **msg); +int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c370e1c4468..8bb433043a7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -31,6 +31,7 @@ enum context { IN_KERNEL = 1, IN_USER = 2 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; +enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; static struct severity { u64 mask; @@ -40,6 +41,7 @@ static struct severity { unsigned char mcgres; unsigned char ser; unsigned char context; + unsigned char excp; unsigned char covered; char *msg; } severities[] = { @@ -48,6 +50,8 @@ static struct severity { #define USER .context = IN_USER #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER +#define EXCP .excp = EXCP_CONTEXT +#define NOEXCP .excp = NO_EXCP #define BITCLR(x) .mask = x, .result = 0 #define BITSET(x) .mask = x, .result = x #define MCGMASK(x, y) .mcgmask = x, .mcgres = y @@ -62,7 +66,7 @@ static struct severity { ), MCESEV( NO, "Not enabled", - BITCLR(MCI_STATUS_EN) + EXCP, BITCLR(MCI_STATUS_EN) ), MCESEV( PANIC, "Processor context corrupt", @@ -71,16 +75,20 @@ static struct severity { /* When MCIP is not set something is very confused */ MCESEV( PANIC, "MCIP not set 
in MCA handler", - MCGMASK(MCG_STATUS_MCIP, 0) + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) ), /* Neither return not error IP -- no chance to recover -> PANIC */ MCESEV( PANIC, "Neither restart nor error IP", - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) ), MCESEV( PANIC, "In kernel and no restart IP", - KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + DEFERRED, "Deferred error", + NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), MCESEV( KEEP, "Corrected error", @@ -89,7 +97,7 @@ static struct severity { /* ignore OVER for UCNA */ MCESEV( - KEEP, "Uncorrected no action required", + UCNA, "Uncorrected no action required", SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) ), MCESEV( @@ -178,8 +186,9 @@ static int error_context(struct mce *m) return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *m, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) { + enum exception excp = (is_excp ? 
EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); struct severity *s; @@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg) continue; if (s->context && ctx != s->context) continue; + if (s->excp && excp != s->excp) + continue; if (msg) *msg = s->msg; s->covered = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 61a9668cebf..453e9bf9096 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -668,7 +668,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) + if (mce_severity(m, mca_cfg.tolerant, msg, true) >= + MCE_PANIC_SEVERITY) ret = 1; } return ret; @@ -754,7 +755,7 @@ static void mce_reign(void) for_each_possible_cpu(cpu) { int severity = mce_severity(&per_cpu(mces_seen, cpu), mca_cfg.tolerant, - &nmsg); + &nmsg, true); if (severity > global_worst) { msg = nmsg; global_worst = severity; @@ -1095,13 +1096,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(&m, cfg->tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL, true); /* - * When machine check was for corrected handler don't touch, - * unless we're panicing. + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicing. */ - if (severity == MCE_KEEP_SEVERITY && !no_way_out) + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) continue; __set_bit(i, toclear); if (severity == MCE_NO_SEVERITY) { diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 51b7e3a36e3..c2359a1ea6b 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -32,9 +32,6 @@ #define R4(x) (((x) >> 4) & 0xf) #define R4_MSG(x) ((R4(x) < 9) ? 
rrrr_msgs[R4(x)] : "Wrong R4!") -#define MCI_STATUS_DEFERRED BIT_64(44) -#define MCI_STATUS_POISON BIT_64(43) - extern const char * const pp_msgs[]; enum tt_ids { -- cgit v1.2.3-70-g09d2 From fa92c58694268a7e9f7fa2c6881c1482221c2788 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Tue, 18 Nov 2014 10:09:20 +0800 Subject: x86, mce: Support memory error recovery for both UCNA and Deferred error in machine_check_poll Uncorrected no action required (UCNA) - is an uncorrected recoverable machine check error that is not signaled via a machine check exception and, instead, is reported to system software as a corrected machine check error. UCNA errors indicate that some data in the system is corrupted, but the data has not been consumed and the processor state is valid and you may continue execution on this processor. UCNA errors require no action from system software to continue execution. Note that UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. -- Intel SDM Volume 3B Deferred errors are errors that cannot be corrected by hardware, but do not cause an immediate interruption in program flow, loss of data integrity, or corruption of processor state. These errors indicate that data has been corrupted but not consumed. Hardware writes information to the status and address registers in the corresponding bank that identifies the source of the error if deferred errors are enabled for logging. Deferred errors are not reported via machine check exceptions; they can be seen by polling the MCi_STATUS registers. -- AMD64 APM Volume 2 As the two excerpts above show, both UCNA and Deferred errors are detected errors that cannot be corrected by hardware, which makes them very similar to Software Recoverable Action Optional (SRAO) errors. Therefore, we can reuse some of the actions already used for handling SRAO errors to handle UCNA and Deferred errors. 
Acked-by: Borislav Petkov Signed-off-by: Chen Yucong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 453e9bf9096..cfb16f631d5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i) } } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* + * coming soon + */ + return false; + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; + int severity; int i; this_cpu_inc(mce_poll_count); @@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; + + severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + + /* + * In the cases where we don't have a valid address after all, + * do not add it into the ring buffer. 
+ */ + if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { + if (m.status & MCI_STATUS_ADDRV) { + mce_ring_add(m.addr >> PAGE_SHIFT); + mce_schedule_work(); + } + } + /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. -- cgit v1.2.3-70-g09d2 From c7c9b3929b6a57ad47ab4021c77e46f7ff21c007 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Dec 2014 22:36:45 +0100 Subject: x86/mce: Spell "panicked" correctly We need the additional "k" to make it a hard-c: https://en.wiktionary.org/wiki/panicked Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1417642605-15730-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cfb16f631d5..d2c611699cd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -292,10 +292,10 @@ static void print_mce(struct mce *m) #define PANIC_TIMEOUT 5 /* 5 seconds */ -static atomic_t mce_paniced; +static atomic_t mce_panicked; static int fake_panic; -static atomic_t mce_fake_paniced; +static atomic_t mce_fake_panicked; /* Panic in progress. 
Enable interrupts and wait for final IPI */ static void wait_for_panic(void) @@ -319,7 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_inc_return(&mce_paniced) > 1) + if (atomic_inc_return(&mce_panicked) > 1) wait_for_panic(); barrier(); @@ -327,7 +327,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) console_verbose(); } else { /* Don't log too much for fake panic */ - if (atomic_inc_return(&mce_fake_paniced) > 1) + if (atomic_inc_return(&mce_fake_panicked) > 1) return; } /* First print corrected ones that are still unlogged */ @@ -744,7 +744,7 @@ static int mce_timed_out(u64 *t) * might have been modified by someone else. */ rmb(); - if (atomic_read(&mce_paniced)) + if (atomic_read(&mce_panicked)) wait_for_panic(); if (!mca_cfg.monarch_timeout) goto out; @@ -2568,7 +2568,7 @@ struct dentry *mce_get_debugfs_dir(void) static void mce_reset(void) { cpu_missing = 0; - atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_fake_panicked, 0); atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); -- cgit v1.2.3-70-g09d2