summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu/mcheck
diff options
context:
space:
mode:
authorLen Brown <len.brown@intel.com>2010-08-15 01:06:31 -0400
committerLen Brown <len.brown@intel.com>2010-08-15 01:06:31 -0400
commit95ee46aa8698f2000647dfb362400fadbb5807cf (patch)
treee5a05c7297f997e191c73091934e42e3195c0e40 /arch/x86/kernel/cpu/mcheck
parentcfa806f059801dbe7e435745eb2e187c8bfe1e7f (diff)
parent92fa5bd9a946b6e7aab6764e7312e4e3d9bed295 (diff)
Merge branch 'linus' into release
Conflicts: drivers/acpi/debug.c Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c35
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c206
3 files changed, 184 insertions, 66 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc4256225..ed41562909f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
static DEFINE_MUTEX(mce_read_mutex);
#define rcu_dereference_check_mce(p) \
- rcu_dereference_check((p), \
+ rcu_dereference_index_check((p), \
rcu_read_lock_sched_held() || \
lockdep_is_held(&mce_read_mutex))
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
static int default_decode_mce(struct notifier_block *nb, unsigned long val,
void *data)
{
- pr_emerg("No human readable MCE decoding support on this CPU type.\n");
- pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+ pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
+ pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
return NOTIFY_STOP;
}
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce)
static void print_mce(struct mce *m)
{
- pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
if (m->ip) {
- pr_emerg("RIP%s %02x:<%016Lx> ",
+ pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
m->cs, m->ip);
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m)
pr_cont("\n");
}
- pr_emerg("TSC %llx ", m->tsc);
+ pr_emerg(HW_ERR "TSC %llx ", m->tsc);
if (m->addr)
pr_cont("ADDR %llx ", m->addr);
if (m->misc)
pr_cont("MISC %llx ", m->misc);
pr_cont("\n");
- pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
/*
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m)
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}
-static void print_mce_head(void)
-{
- pr_emerg("\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
- pr_emerg("This is not a software problem!\n");
-}
-
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_paniced;
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
if (atomic_inc_return(&mce_fake_paniced) > 1)
return;
}
- print_mce_head();
/* First print corrected ones that are still unlogged */
for (i = 0; i < MCE_LOG_LEN; i++) {
struct mce *m = &mcelog.entry[i];
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
apei_err = apei_write_mce(final);
}
if (cpu_missing)
- printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
- print_mce_tail();
+ pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
if (exp)
- printk(KERN_EMERG "Machine check: %s\n", exp);
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mce_panic_timeout;
panic(msg);
} else
- printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}
/* Support code for software error injection */
@@ -600,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
mce_log(&m);
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
add_taint(TAINT_MACHINE_CHECK);
}
@@ -1220,7 +1209,7 @@ int mce_notify_irq(void)
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
- printk(KERN_INFO "Machine check events logged\n");
+ pr_info(HW_ERR "Machine check events logged\n");
return 1;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920..6fcd0936194 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
- if (val & CMCI_EN) {
+ if (val & MCI_CTL2_CMCI_EN) {
if (test_and_clear_bit(i, owned) && !boot)
print_update("SHD", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
- val |= CMCI_EN | CMCI_THRESHOLD;
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
- if (val & CMCI_EN) {
+ if (val & MCI_CTL2_CMCI_EN) {
if (!test_and_set_bit(i, owned) && !boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+ val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf971..c2a8b26d4fe 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
+#define THERMAL_THROTTLING_EVENT 0
+#define POWER_LIMIT_EVENT 1
+
/*
- * Current thermal throttling state:
+ * Current thermal event state:
*/
-struct thermal_state {
- bool is_throttled;
-
+struct _thermal_state {
+ bool new_event;
+ int event;
u64 next_check;
- unsigned long throttle_count;
- unsigned long last_throttle_count;
+ unsigned long count;
+ unsigned long last_count;
+};
+
+struct thermal_state {
+ struct _thermal_state core_throttle;
+ struct _thermal_state core_power_limit;
+ struct _thermal_state package_throttle;
+ struct _thermal_state package_power_limit;
};
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+ static SYSDEV_ATTR(_name, 0444, \
+ therm_throt_sysdev_show_##_name, \
+ NULL) \
-#define define_therm_throt_sysdev_show_func(name) \
+#define define_therm_throt_sysdev_show_func(event, name) \
\
-static ssize_t therm_throt_sysdev_show_##name( \
+static ssize_t therm_throt_sysdev_show_##event##_##name( \
struct sys_device *dev, \
struct sysdev_attribute *attr, \
char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
ssize_t ret; \
\
preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
+ if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).name); \
- else \
+ per_cpu(thermal_state, cpu).event.name); \
+ } else \
ret = 0; \
preempt_enable(); \
\
return ret; \
}
-define_therm_throt_sysdev_show_func(throttle_count);
-define_therm_throt_sysdev_one_ro(throttle_count);
+define_therm_throt_sysdev_show_func(core_throttle, count);
+define_therm_throt_sysdev_one_ro(core_throttle_count);
+
+define_therm_throt_sysdev_show_func(core_power_limit, count);
+define_therm_throt_sysdev_one_ro(core_power_limit_count);
+
+define_therm_throt_sysdev_show_func(package_throttle, count);
+define_therm_throt_sysdev_one_ro(package_throttle_count);
+
+define_therm_throt_sysdev_show_func(package_power_limit, count);
+define_therm_throt_sysdev_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
- &attr_throttle_count.attr,
+ &attr_core_throttle_count.attr,
NULL
};
-static struct attribute_group thermal_throttle_attr_group = {
+static struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
+#define CORE_LEVEL 0
+#define PACKAGE_LEVEL 1
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
-static int therm_throt_process(bool is_throttled)
+static int therm_throt_process(bool new_event, int event, int level)
{
- struct thermal_state *state;
- unsigned int this_cpu;
- bool was_throttled;
+ struct _thermal_state *state;
+ unsigned int this_cpu = smp_processor_id();
+ bool old_event;
u64 now;
+ struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
- this_cpu = smp_processor_id();
now = get_jiffies_64();
- state = &per_cpu(thermal_state, this_cpu);
+ if (level == CORE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->core_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->core_power_limit;
+ else
+ return 0;
+ } else if (level == PACKAGE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->package_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->package_power_limit;
+ else
+ return 0;
+ } else
+ return 0;
- was_throttled = state->is_throttled;
- state->is_throttled = is_throttled;
+ old_event = state->new_event;
+ state->new_event = new_event;
- if (is_throttled)
- state->throttle_count++;
+ if (new_event)
+ state->count++;
if (time_before64(now, state->next_check) &&
- state->throttle_count != state->last_throttle_count)
+ state->count != state->last_count)
return 0;
state->next_check = now + CHECK_INTERVAL;
- state->last_throttle_count = state->throttle_count;
+ state->last_count = state->count;
/* if we just entered the thermal event */
- if (is_throttled) {
- printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
+ if (new_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ else
+ printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
add_taint(TAINT_MACHINE_CHECK);
return 1;
}
- if (was_throttled) {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
+ if (old_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
+ else
+ printk(KERN_INFO "CPU%d: %s power limit normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
return 1;
}
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled)
/* Add/Remove thermal_throttle interface for CPU device: */
static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
{
- return sysfs_create_group(&sys_dev->kobj,
- &thermal_throttle_attr_group);
+ int err;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+ if (err)
+ return err;
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_core_power_limit_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PTS))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_throttle_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_power_limit_count.attr,
+ thermal_attr_group.name);
+
+ return err;
}
static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
{
- sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
}
/* Mutex protecting device creation against CPU hotplug: */
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
+/*
+ * Set up the most two significant bit to notify mce log that this thermal
+ * event type.
+ * This is a temp solution. May be changed in the future with mce log
+ * infrasture.
+ */
+#define CORE_THROTTLED (0)
+#define CORE_POWER_LIMIT ((__u64)1 << 62)
+#define PACKAGE_THROTTLED ((__u64)2 << 62)
+#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
+
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
- mce_log_therm_throt_event(msr_val);
+
+ if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+ if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val &
+ PACKAGE_THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
+ | msr_val);
+ }
}
static void unexpected_thermal_interrupt(void)
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE
+ | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE
+ | PACKAGE_THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+ }
smp_thermal_vector = intel_thermal_interrupt;