From 27f6c573e0f77f7d1cc907c1494c99a61e48b7d8 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Thu, 27 Mar 2014 21:24:36 -0400 Subject: x86, CMCI: Add proper detection of end of CMCI storms When a CMCI storm persists for a long time (at least beyond the predefined threshold, which is 30 seconds for now), we can see a new CMCI storm being detected immediately after the previous one is reported to have subsided. ... Dec 10 22:04:29 kernel: CMCI storm detected: switching to poll mode Dec 10 22:04:59 kernel: CMCI storm subsided: switching to interrupt mode Dec 10 22:04:59 kernel: CMCI storm detected: switching to poll mode Dec 10 22:05:29 kernel: CMCI storm subsided: switching to interrupt mode ... The problem is that our logic that determines that the storm has ended is incorrect. We announce the end, re-enable interrupts and realize that the storm is still going on, so we switch back to polling mode. Rinse, repeat. When a storm happens we disable signaling of errors via CMCI and begin polling machine check banks instead. If we find any logged errors, then we need to set a per-cpu flag so that our per-cpu tests that check whether the storm is ongoing will see that errors are still being logged independently of whether mce_notify_irq() says that the error has been fully processed. cmci_clear() is not the right tool to disable a bank. It disables the interrupt for the bank as desired, but it also clears the bit for this bank in "mce_banks_owned" so we will skip the bank when polling (so we fail to see that the storm continues because we stop looking). The new cmci_storm_disable_banks() just disables the interrupt while allowing polling to continue. 
Reported-by: William Dauchy Signed-off-by: Chen, Gong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 18 +++++++++++++++++- arch/x86/kernel/cpu/mcheck/mce_intel.c | 19 ++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4d5419b249d..78c92125db8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -89,6 +89,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); + /* * MCA banks polled by the period polling timer for corrected events. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -595,6 +598,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; int i; + unsigned long *v; this_cpu_inc(mce_poll_count); @@ -614,6 +618,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; + v = &get_cpu_var(mce_polled_error); + set_bit(0, v); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -1278,10 +1284,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + static void mce_timer_fn(unsigned long data) { struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv; + int notify; WARN_ON(smp_processor_id() != data); @@ -1296,7 +1310,9 @@ static void mce_timer_fn(unsigned long data) * polling interval, otherwise increase the polling interval. 
*/ iv = __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) { + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { iv = max(iv / 2, (unsigned long) HZ/100); } else { iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index fb6156fee6f..3bdb95ae8c4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) } } +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static bool cmci_storm_detect(void) { unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -158,7 +175,7 @@ static bool cmci_storm_detect(void) if (cnt <= CMCI_STORM_THRESHOLD) return false; - cmci_clear(); + cmci_storm_disable_banks(); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); r = atomic_add_return(1, &cmci_storm_on_cpus); mce_timer_kick(CMCI_POLL_INTERVAL); -- cgit v1.2.3-70-g09d2 From 023de4a09f571fad0af9691e4e437e14b68f05fb Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 1 Apr 2014 13:30:21 +0100 Subject: x86/apic: Reinstate error IRQ Pentium erratum 3AP workaround A change introduced with commit 60283df7ac26a4fe2d56631ca2946e04725e7eaf ("x86/apic: Read Error Status Register correctly") removed a read from the APIC ESR register made before writing to same required to retrieve the correct error status on Pentium systems affected by the 3AP erratum[1]: "3AP. 
Writes to Error Register Clears Register PROBLEM: The APIC Error register is intended to only be read. If there is a write to this register the data in the APIC Error register will be cleared and lost. IMPLICATION: There is a possibility of clearing the Error register status since the write to the register is not specifically blocked. WORKAROUND: Writes should not occur to the Pentium processor APIC Error register. STATUS: For the steppings affected see the Summary Table of Changes at the beginning of this section." The steppings affected are actually: B1, B3 and B5. To avoid this information loss this change avoids the write to ESR on all Pentium systems where it is actually never needed; in Pentium processor documentation ESR was noted read-only and the write only required for future architectural compatibility[2]. The approach taken is the same as in lapic_setup_esr(). References: [1] "Pentium Processor Family Developer's Manual", Intel Corporation, 1997, order number 241428-005, Appendix A "Errata and S-Specs for the Pentium Processor Family", p. A-92, [2] "Pentium Processor Family Developer's Manual, Volume 3: Architecture and Programming Manual", Intel Corporation, 1995, order number 241430-004, Section 19.3.3. "Error Handling In APIC", p. 19-33. Signed-off-by: Maciej W. Rozycki Cc: Richard Weinberger Link: http://lkml.kernel.org/r/alpine.LFD.2.11.1404011300010.27402@eddie.linux-mips.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 53e20531470..005ed3fb039 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1996,7 +1996,8 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) }; /* First tickle the hardware, only then report what went on. -- REW */ - apic_write(APIC_ESR, 0); + if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. 
*/ + apic_write(APIC_ESR, 0); v = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); -- cgit v1.2.3-70-g09d2 From f704a7d7f1d815621cb4c47f7a94787e1bd7c27c Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Tue, 1 Apr 2014 23:51:42 -0700 Subject: x86/platform/hyperv: Handle VMBUS driver being a module Hyper-V VMBUS driver can be a module; handle this case correctly. Please apply. Signed-off-by: K. Y. Srinivasan Cc: olaf@aepfle.de Cc: apw@canonical.com Cc: jasowang@redhat.com Link: http://lkml.kernel.org/r/1396421502-23222-1-git-send-email-kys@microsoft.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 42805fac009..283a76a9cc4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -125,7 +125,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); seq_printf(p, " Machine check polls\n"); #endif -#if defined(CONFIG_HYPERV) || defined(CONFIG_XEN) +#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); -- cgit v1.2.3-70-g09d2 From b3b42ac2cbae1f3cecbb6229964a4d48af31d382 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 16 Mar 2014 15:31:54 -0700 Subject: x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels The IRET instruction, when returning to a 16-bit segment, only restores the bottom 16 bits of the user space stack pointer. We have a software workaround for that ("espfix") for the 32-bit kernel, but it relies on a nonzero stack segment base which is not available in 64-bit mode. 
Since 16-bit support is somewhat crippled anyway on a 64-bit kernel (no V86 mode), and most (if not quite all) 64-bit processors support virtualization for the users who really need it, simply reject attempts at creating a 16-bit segment when running on top of a 64-bit kernel. Cc: Linus Torvalds Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-kicdm89kzw9lldryb1br9od0@git.kernel.org Cc: --- arch/x86/kernel/ldt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ebc98739892..af1d14a9ebd 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -229,6 +229,17 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } + /* + * On x86-64 we do not support 16-bit segments due to + * IRET leaking the high bits of the kernel stack address. + */ +#ifdef CONFIG_X86_64 + if (!ldt_info.seg_32bit) { + error = -EINVAL; + goto out_unlock; + } +#endif + fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; -- cgit v1.2.3-70-g09d2