diff options
author | Anton Blanchard <anton@samba.org> | 2011-01-11 19:49:19 +0000 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2011-01-21 14:08:38 +1100 |
commit | d47d1d8af52e37bcf9059dd86878474e5ccc9c2a (patch) | |
tree | c8af975e293d17c96f06913db78cac6041e01a47 /arch | |
parent | e49b1fae0ba4d06b29bd753a961abb447566bf4a (diff) |
powerpc: Rework pseries machine check handler
Rework pseries machine check handler:
- If MSR_RI isn't set, we cannot recover even if the machine check was fully
recovered
- Rename nonfatal to recovered
- Handle RTAS_DISP_LIMITED_RECOVERY
- Use BUS_MCEERR_AR instead of BUS_ADRERR
- Don't check all the RTAS error log fields when receiving a synchronous
machine check. Recent versions of the pseries firmware do not fill them
in during a machine check and instead send a follow up error log with
the detailed information. If we see a synchronous machine check, and we
came from userspace then kill the task.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/powerpc/platforms/pseries/ras.c | 48 |
1 files changed, 30 insertions, 18 deletions
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 048e25711c6..d194150cf34 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -259,31 +259,43 @@ int pSeries_system_reset_exception(struct pt_regs *regs) * Return 1 if corrected (or delivered a signal). * Return 0 if there is nothing we can do. */ -static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err) +static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) { - int nonfatal = 0; + int recovered = 0; - if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { + if (!(regs->msr & MSR_RI)) { + /* If MSR_RI isn't set, we cannot recover */ + recovered = 0; + + } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { /* Platform corrected itself */ - nonfatal = 1; - } else if ((regs->msr & MSR_RI) && - user_mode(regs) && - err->severity == RTAS_SEVERITY_ERROR_SYNC && - err->disposition == RTAS_DISP_NOT_RECOVERED && - err->target == RTAS_TARGET_MEMORY && - err->type == RTAS_TYPE_ECC_UNCORR && - !(current->pid == 0 || is_global_init(current))) { - /* Kill off a user process with an ECC error */ - printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n", - current->pid); - /* XXX something better for ECC error? */ - _exception(SIGBUS, regs, BUS_ADRERR, regs->nip); - nonfatal = 1; + recovered = 1; + + } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + printk(KERN_ERR "MCE: limited recovery, system may " + "be degraded\n"); + recovered = 1; + + } else if (user_mode(regs) && !is_global_init(current) && + err->severity == RTAS_SEVERITY_ERROR_SYNC) { + + /* + * If we received a synchronous error when in userspace + * kill the task. Firmware may report details of the fail + * asynchronously, so we can't rely on the target and type + * fields being valid here. + */ + printk(KERN_ERR "MCE: uncorrectable error, killing task " + "%s:%d\n", current->comm, current->pid); + + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; } log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); - return nonfatal; + return recovered; } /* |