diff options
Diffstat (limited to 'drivers/edac/edac_mce_amd.c')
-rw-r--r-- | drivers/edac/edac_mce_amd.c | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 918567e8cfd..444c2cc4472 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -1,6 +1,31 @@ #include <linux/module.h> #include "edac_mce_amd.h" +static bool report_gart_errors; +static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type); + +void amd_report_gart_errors(bool v) +{ + report_gart_errors = v; +} +EXPORT_SYMBOL_GPL(amd_report_gart_errors); + +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)) +{ + nb_bus_decoder = f; +} +EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); + +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)) +{ + if (nb_bus_decoder) { + WARN_ON(nb_bus_decoder != f); + + nb_bus_decoder = NULL; + } +} +EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); + /* * string representation for the different MCA reported error types, see F3x48 * or MSR0000_0411. @@ -102,3 +127,93 @@ const char *ext_msgs[] = { "Probe Filter error" /* 1_1111b */ }; EXPORT_SYMBOL_GPL(ext_msgs); + +void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) +{ + int ecc; + u32 ec = ERROR_CODE(regs->nbsl); + u32 xec = EXT_ERROR_CODE(regs->nbsl); + + if (!handle_errors) + return; + + pr_emerg(" Northbridge Error, node %d", node_id); + + /* + * F10h, revD can disable ErrCpu[3:0] so check that first and also the + * value encoding has changed so interpret those differently + */ + if ((boot_cpu_data.x86 == 0x10) && + (boot_cpu_data.x86_model > 8)) { + if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + } else { + pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); + } + + pr_emerg(" Error: %sorrected", + ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); + pr_cont(", Report Error: %s", + ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); + pr_cont(", MiscV: %svalid, CPU context corrupt: %s", + ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), + ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); + + /* do the two bits[14:13] together */ + ecc = regs->nbsh & (0x3 << 13); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (TLB_ERROR(ec)) { + /* + * GART errors are intended to help graphics driver developers + * to detect bad GART PTEs. It is recommended by AMD to disable + * GART table walk error reporting by default[1] (currently + * being disabled in mce_cpu_quirks()) and according to the + * comment in mce_cpu_quirks(), such GART errors can be + * incorrectly triggered. We may see these errors anyway and + * unless requested by the user, they won't be reported. + * + * [1] section 13.10.1 on BIOS and Kernel Developers Guide for + * AMD NPT family 0Fh processors + */ + if (!report_gart_errors) + return; + + pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + TT_MSG(ec), LL_MSG(ec)); + } else if (MEM_ERROR(ec)) { + pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," + " Cache Level: %s", + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); + } else if (BUS_ERROR(ec)) { + pr_emerg(" Bus (Link/DRAM) error\n"); + if (nb_bus_decoder) + nb_bus_decoder(node_id, regs, ecc); + } else { + /* shouldn't reach here! */ + pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); + } + + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); +} +EXPORT_SYMBOL_GPL(amd_decode_nb_mce); + +void decode_mce(struct mce *m) +{ + struct err_regs regs; + int node; + + if (m->bank != 4) + return; + + regs.nbsl = (u32) m->status; + regs.nbsh = (u32)(m->status >> 32); + regs.nbeal = (u32) m->addr; + regs.nbeah = (u32)(m->addr >> 32); + node = topology_cpu_node_id(m->extcpu); + + amd_decode_nb_mce(node, ®s, 1); +} |