From e40b17208b6805be50ffe891878662b6076206b9 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 5 Feb 2010 21:47:03 -0500
Subject: x86: Move notify_die from nmi.c to traps.c

In order to handle a new nmi_watchdog approach, I need to move
the notify_die() routine out of nmi_watchdog_tick() and into
default_do_nmi(). This lets me easily swap out the old
nmi_watchdog with the new one with just a config change.

The change probably makes sense from a high level perspective
because the nmi_watchdog shouldn't be handling notify_die
routines anyway.  However, this move does change the semantics a
little bit.  Instead of checking on every nmi interrupt if the
cpus are stuck, only check them on the nmi_watchdog interrupts.

 v2: Move notify_die call into #ifdef block

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
Cc: peterz@infradead.org
LKML-Reference: <1265424425-31562-2-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/nmi.c | 7 -------
 arch/x86/kernel/traps.c    | 5 +++++
 2 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 0159a69396c..5d47682f580 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -400,13 +400,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 	int cpu = smp_processor_id();
 	int rc = 0;
 
-	/* check for other users first */
-	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-			== NOTIFY_STOP) {
-		rc = 1;
-		touched = 1;
-	}
-
 	sum = get_timer_irqs(cpu);
 
 	if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e445418..51ef893ffa6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -400,7 +400,12 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
 								== NOTIFY_STOP)
 			return;
+
 #ifdef CONFIG_X86_LOCAL_APIC
+	        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+	        			                == NOTIFY_STOP)
+	                return;
+
 		/*
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
-- 
cgit v1.2.3-70-g09d2


From 1fb9d6ad2766a1dd70d167552988375049a97f21 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 5 Feb 2010 21:47:04 -0500
Subject: nmi_watchdog: Add new, generic implementation, using perf events

This is a new generic nmi_watchdog implementation using the perf
events infrastructure as suggested by Ingo.

The implementation is simple, just create an in-kernel perf
event and register an overflow handler to check for cpu lockups.

I created a generic implementation that lives in kernel/ and
a hardware-specific part that, for now, lives in arch/x86.

This approach has a number of advantages:

 - It simplifies the x86 PMU implementation in the long run,
   in that it removes the hardcoded low-level PMU implementation
   that was the NMI watchdog before.

 - It allows new NMI watchdog features to be added in a central
   place.

 - It allows other architectures to enable the NMI watchdog,
   as long as they have perf events (that provide NMIs)
   implemented.

 - It also allows for more graceful co-existence of existing
   perf events apps and the NMI watchdog - before these changes
   the relationship was exclusive. (The NMI watchdog will 'spend'
   a perf event when enabled. In later iterations we might be
   able to piggyback from an existing NMI event without having
   to allocate a hardware event for the NMI watchdog - turning
   this into a no-hardware-cost feature.)

As for compatibility, we'll keep the old NMI watchdog code as
well until the new one can 100% replace it on all CPUs, old and
new alike.  That might take some time as the NMI watchdog has
been ported to many CPU models.

I have done light testing to make sure the framework works
correctly and it does.

 v2: Set the correct timeout values based on the old nmi
     watchdog

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
Cc: peterz@infradead.org
LKML-Reference: <1265424425-31562-3-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/hw_nmi.c | 114 +++++++++++++++++++++++++
 kernel/nmi_watchdog.c         | 191 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 arch/x86/kernel/apic/hw_nmi.c
 create mode 100644 kernel/nmi_watchdog.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 00000000000..8c0e6a410d0
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,114 @@
+/*
+ *  HW NMI watchdog support
+ *
+ *  started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ *  Arch specific calls to support NMI watchdog
+ *
+ *  Bits copied from original nmi.c file
+ *
+ */
+
+#include <asm/apic.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
+#include <asm/mce.h>
+
+#include <linux/nmi.h>
+#include <linux/module.h>
+
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+static DEFINE_PER_CPU(unsigned, last_irq_sum);
+
+/*
+ * Take the local apic timer and PIT/HPET into account. We don't
+ * know which one is active, when we have highres/dyntick on
+ */
+static inline unsigned int get_timer_irqs(int cpu)
+{
+        return per_cpu(irq_stat, cpu).apic_timer_irqs +
+                per_cpu(irq_stat, cpu).irq0_irqs;
+}
+
+static inline int mce_in_progress(void)
+{
+#if defined(CONFIG_X86_MCE)
+        return atomic_read(&mce_entry) > 0;
+#endif
+        return 0;
+}
+
+int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
+{
+	unsigned int sum;
+	int cpu = smp_processor_id();
+
+	/* FIXME: cheap hack for this check, probably should get its own
+	 * die_notifier handler
+	 */
+	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
+
+		spin_lock(&lock);
+		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+		show_regs(regs);
+		dump_stack();
+		spin_unlock(&lock);
+		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+	}
+
+	/* if we are doing an mce, just assume the cpu is not stuck */
+        /* Could check oops_in_progress here too, but it's safer not to */
+        if (mce_in_progress())
+                return 0;
+
+	/* We determine if the cpu is stuck by checking whether any
+	 * interrupts have happened since we last checked.  Of course
+	 * an nmi storm could create false positives, but the higher
+	 * level logic should account for that
+	 */
+	sum = get_timer_irqs(cpu);
+	if (__get_cpu_var(last_irq_sum) == sum) {
+		return 1;
+	} else {
+		__get_cpu_var(last_irq_sum) = sum;
+		return 0;
+	}
+}
+
+void arch_trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+
+	printk(KERN_INFO "sending NMI to all CPUs:\n");
+	apic->send_IPI_all(NMI_VECTOR);
+
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpumask_empty(to_cpumask(backtrace_mask)))
+			break;
+		mdelay(1);
+	}
+}
+
+/* STUB calls to mimic old nmi_watchdog behaviour */
+unsigned int nmi_watchdog = NMI_NONE;
+EXPORT_SYMBOL(nmi_watchdog);
+atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
+int nmi_watchdog_enabled;
+int unknown_nmi_panic;
+void cpu_nmi_set_wd_enabled(void) { return; }
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+void stop_apic_nmi_watchdog(void *unused) { return; }
+void setup_apic_nmi_watchdog(void *unused) { return; }
+int __init check_nmi_watchdog(void) { return 0; }
diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c
new file mode 100644
index 00000000000..36817b214d6
--- /dev/null
+++ b/kernel/nmi_watchdog.c
@@ -0,0 +1,191 @@
+/*
+ * Detect Hard Lockups using the NMI
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * this code detects hard lockups: incidents in where on a CPU
+ * the kernel does not respond to anything except NMI.
+ *
+ * Note: Most of this code is borrowed heavily from softlockup.c,
+ * so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
+ * to those contributors as well.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/lockdep.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev);
+static DEFINE_PER_CPU(int, nmi_watchdog_touch);
+static DEFINE_PER_CPU(long, alert_counter);
+
+void touch_nmi_watchdog(void)
+{
+	__raw_get_cpu_var(nmi_watchdog_touch) = 1;
+	touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+void touch_all_nmi_watchdog(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		per_cpu(nmi_watchdog_touch, cpu) = 1;
+	touch_softlockup_watchdog();
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * proc handler for /proc/sys/kernel/nmi_watchdog
+ */
+int proc_nmi_enabled(struct ctl_table *table, int write,
+		     void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int cpu;
+
+	if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL)
+		nmi_watchdog_enabled = 0;
+	else
+		nmi_watchdog_enabled = 1;
+
+	touch_all_nmi_watchdog();
+	proc_dointvec(table, write, buffer, length, ppos);
+	if (nmi_watchdog_enabled)
+		for_each_online_cpu(cpu)
+			perf_event_enable(per_cpu(nmi_watchdog_ev, cpu));
+	else
+		for_each_online_cpu(cpu)
+			perf_event_disable(per_cpu(nmi_watchdog_ev, cpu));
+	return 0;
+}
+
+#endif /* CONFIG_SYSCTL */
+
+struct perf_event_attr wd_attr = {
+	.type = PERF_TYPE_HARDWARE,
+	.config = PERF_COUNT_HW_CPU_CYCLES,
+	.size = sizeof(struct perf_event_attr),
+	.pinned = 1,
+	.disabled = 1,
+};
+
+static int panic_on_timeout;
+
+void wd_overflow(struct perf_event *event, int nmi,
+		 struct perf_sample_data *data,
+		 struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+	int touched = 0;
+
+	if (__get_cpu_var(nmi_watchdog_touch)) {
+		per_cpu(nmi_watchdog_touch, cpu) = 0;
+		touched = 1;
+	}
+
+	/* check to see if the cpu is doing anything */
+	if (!touched && hw_nmi_is_cpu_stuck(regs)) {
+		/*
+		 * Ayiee, looks like this CPU is stuck ...
+		 * wait a few IRQs (5 seconds) before doing the oops ...
+		 */
+		per_cpu(alert_counter,cpu) += 1;
+		if (per_cpu(alert_counter,cpu) == 5) {
+			/*
+			 * die_nmi will return ONLY if NOTIFY_STOP happens..
+			 */
+			die_nmi("BUG: NMI Watchdog detected LOCKUP",
+				regs, panic_on_timeout);
+		}
+	} else {
+		per_cpu(alert_counter,cpu) = 0;
+	}
+
+	return;
+}
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __cpuinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct perf_event *event;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		per_cpu(nmi_watchdog_touch, hotcpu) = 0;
+		break;
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		/* originally wanted the below chunk to be in CPU_UP_PREPARE, but caps is unpriv for non-CPU0 */
+		wd_attr.sample_period = cpu_khz * 1000;
+		event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow);
+		if (IS_ERR(event)) {
+			printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event);
+			return NOTIFY_BAD;
+		}
+		per_cpu(nmi_watchdog_ev, hotcpu) = event;
+		perf_event_enable(per_cpu(nmi_watchdog_ev, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		perf_event_disable(per_cpu(nmi_watchdog_ev, hotcpu));
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		event = per_cpu(nmi_watchdog_ev, hotcpu);
+		per_cpu(nmi_watchdog_ev, hotcpu) = NULL;
+		perf_event_release_kernel(event);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+static int __initdata nonmi_watchdog;
+
+static int __init nonmi_watchdog_setup(char *str)
+{
+	nonmi_watchdog = 1;
+	return 1;
+}
+__setup("nonmi_watchdog", nonmi_watchdog_setup);
+
+static int __init spawn_nmi_watchdog_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	int err;
+
+	if (nonmi_watchdog)
+		return 0;
+
+	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	if (err == NOTIFY_BAD) {
+		BUG();
+		return 1;
+	}
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+
+	return 0;
+}
+early_initcall(spawn_nmi_watchdog_task);
-- 
cgit v1.2.3-70-g09d2


From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 5 Feb 2010 21:47:05 -0500
Subject: nmi_watchdog: Config option to enable new nmi_watchdog

These are the bits that enable the new nmi_watchdog and safely
isolate the old nmi_watchdog.  Only one or the other can run,
not both at the same time.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
Cc: peterz@infradead.org
LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/Makefile |  7 ++++++-
 arch/x86/kernel/traps.c       |  2 ++
 include/linux/nmi.h           |  4 ++++
 kernel/Makefile               |  1 +
 lib/Kconfig.debug             | 13 +++++++++++++
 5 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507..1a4512e48d2 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
-obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_NMI_WATCHDOG),y)
+obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
+endif
+obj-$(CONFIG_NMI_WATCHDOG)	+= hw_nmi.o
+
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 51ef893ffa6..973cbc4f044 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -406,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 	        			                == NOTIFY_STOP)
 	                return;
 
+#ifndef CONFIG_NMI_WATCHDOG
 		/*
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
@@ -413,6 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		if (nmi_watchdog_tick(regs, reason))
 			return;
 		if (!do_nmi_callback(regs, cpu))
+#endif /* !CONFIG_NMI_WATCHDOG */
 			unknown_nmi_error(reason, regs);
 #else
 		unknown_nmi_error(reason, regs);
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b752e807add..a42ff0bef70 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -47,4 +47,8 @@ static inline bool trigger_all_cpu_backtrace(void)
 }
 #endif
 
+#ifdef CONFIG_NMI_WATCHDOG
+int hw_nmi_is_cpu_stuck(struct pt_regs *);
+#endif
+
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f..8a5abe53eba 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -76,6 +76,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 25c3ed594c5..f80b67e72aa 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -170,6 +170,19 @@ config DETECT_SOFTLOCKUP
 	   can be detected via the NMI-watchdog, on platforms that
 	   support it.)
 
+config NMI_WATCHDOG
+	bool "Detect Hard Lockups with an NMI Watchdog"
+	depends on DEBUG_KERNEL && PERF_EVENTS
+	default y
+	help
+	  Say Y here to enable the kernel to use the NMI as a watchdog
+	  to detect hard lockups.  This is useful when a cpu hangs for no
+	  reason but can still respond to NMIs.  A backtrace is displayed
+	  for reviewing and reporting.
+
+	  The overhead should be minimal, just an extra NMI every few
+	  seconds.
+
 config BOOTPARAM_SOFTLOCKUP_PANIC
 	bool "Panic (Reboot) On Soft Lockups"
 	depends on DETECT_SOFTLOCKUP
-- 
cgit v1.2.3-70-g09d2


From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 12 Feb 2010 17:19:19 -0500
Subject: nmi_watchdog: Compile and portability fixes

The original patch was x86_64 centric.  Changed the code to make
it less so.

Tested by building and running on a powerpc.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: peterz@infradead.org
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/nmi.h    |  2 ++
 arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++-----
 include/linux/nmi.h           |  9 ++++++++
 kernel/nmi_watchdog.c         | 52 ++++++++++++++++++++++++++++++++++---------
 kernel/sysctl.c               | 15 ++++++++++++-
 5 files changed, 82 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 93da9c3f334..5b41b0feb6d 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern int check_nmi_watchdog(void);
+#if !defined(CONFIG_NMI_WATCHDOG)
 extern int nmi_watchdog_enabled;
+#endif
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
 extern void release_perfctr_nmi(unsigned int);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 8c0e6a410d0..312d772c5c3 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum);
  */
 static inline unsigned int get_timer_irqs(int cpu)
 {
-        return per_cpu(irq_stat, cpu).apic_timer_irqs +
-                per_cpu(irq_stat, cpu).irq0_irqs;
+	unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs;
+
+#if defined(CONFIG_X86_LOCAL_APIC)
+	irqs += per_cpu(irq_stat, cpu).apic_timer_irqs;
+#endif
+
+        return irqs;
 }
 
 static inline int mce_in_progress(void)
@@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 	}
 }
 
+u64 hw_nmi_get_sample_period(void)
+{
+        return cpu_khz * 1000;
+}
+
 void arch_trigger_all_cpu_backtrace(void)
 {
 	int i;
@@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void)
 }
 
 /* STUB calls to mimic old nmi_watchdog behaviour */
+#if defined(CONFIG_X86_LOCAL_APIC)
 unsigned int nmi_watchdog = NMI_NONE;
 EXPORT_SYMBOL(nmi_watchdog);
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+#endif
 atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
 EXPORT_SYMBOL(nmi_active);
-int nmi_watchdog_enabled;
 int unknown_nmi_panic;
 void cpu_nmi_set_wd_enabled(void) { return; }
-void acpi_nmi_enable(void) { return; }
-void acpi_nmi_disable(void) { return; }
 void stop_apic_nmi_watchdog(void *unused) { return; }
 void setup_apic_nmi_watchdog(void *unused) { return; }
 int __init check_nmi_watchdog(void) { return 0; }
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a42ff0bef70..794e7354c5b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -20,10 +20,14 @@ extern void touch_nmi_watchdog(void);
 extern void acpi_nmi_disable(void);
 extern void acpi_nmi_enable(void);
 #else
+#ifndef CONFIG_NMI_WATCHDOG
 static inline void touch_nmi_watchdog(void)
 {
 	touch_softlockup_watchdog();
 }
+#else
+extern void touch_nmi_watchdog(void);
+#endif
 static inline void acpi_nmi_disable(void) { }
 static inline void acpi_nmi_enable(void) { }
 #endif
@@ -49,6 +53,11 @@ static inline bool trigger_all_cpu_backtrace(void)
 
 #ifdef CONFIG_NMI_WATCHDOG
 int hw_nmi_is_cpu_stuck(struct pt_regs *);
+u64 hw_nmi_get_sample_period(void);
+extern int nmi_watchdog_enabled;
+struct ctl_table;
+extern int proc_nmi_enabled(struct ctl_table *, int ,
+                        void __user *, size_t *, loff_t *);
 #endif
 
 #endif
diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c
index 36817b214d6..73c1954a97b 100644
--- a/kernel/nmi_watchdog.c
+++ b/kernel/nmi_watchdog.c
@@ -30,6 +30,8 @@ static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev);
 static DEFINE_PER_CPU(int, nmi_watchdog_touch);
 static DEFINE_PER_CPU(long, alert_counter);
 
+static int panic_on_timeout;
+
 void touch_nmi_watchdog(void)
 {
 	__raw_get_cpu_var(nmi_watchdog_touch) = 1;
@@ -46,19 +48,49 @@ void touch_all_nmi_watchdog(void)
 	touch_softlockup_watchdog();
 }
 
+static int __init setup_nmi_watchdog(char *str)
+{
+        if (!strncmp(str, "panic", 5)) {
+                panic_on_timeout = 1;
+                str = strchr(str, ',');
+                if (!str)
+                        return 1;
+                ++str;
+        }
+        return 1;
+}
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
 #ifdef CONFIG_SYSCTL
 /*
  * proc handler for /proc/sys/kernel/nmi_watchdog
  */
+int nmi_watchdog_enabled;
+
 int proc_nmi_enabled(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *length, loff_t *ppos)
 {
 	int cpu;
 
-	if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL)
+	if (!write) {
+		struct perf_event *event;
+		for_each_online_cpu(cpu) {
+			event = per_cpu(nmi_watchdog_ev, cpu);
+			if (event->state > PERF_EVENT_STATE_OFF) {
+				nmi_watchdog_enabled = 1;
+				break;
+			}
+		}
+		proc_dointvec(table, write, buffer, length, ppos);
+		return 0;
+	}
+
+	if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) {
 		nmi_watchdog_enabled = 0;
-	else
-		nmi_watchdog_enabled = 1;
+		proc_dointvec(table, write, buffer, length, ppos);
+		printk("NMI watchdog failed configuration, can not be enabled\n");
+		return 0;
+	}
 
 	touch_all_nmi_watchdog();
 	proc_dointvec(table, write, buffer, length, ppos);
@@ -81,8 +113,6 @@ struct perf_event_attr wd_attr = {
 	.disabled = 1,
 };
 
-static int panic_on_timeout;
-
 void wd_overflow(struct perf_event *event, int nmi,
 		 struct perf_sample_data *data,
 		 struct pt_regs *regs)
@@ -103,11 +133,11 @@ void wd_overflow(struct perf_event *event, int nmi,
 		 */
 		per_cpu(alert_counter,cpu) += 1;
 		if (per_cpu(alert_counter,cpu) == 5) {
-			/*
-			 * die_nmi will return ONLY if NOTIFY_STOP happens..
-			 */
-			die_nmi("BUG: NMI Watchdog detected LOCKUP",
-				regs, panic_on_timeout);
+			if (panic_on_timeout) {
+				panic("NMI Watchdog detected LOCKUP on cpu %d", cpu);
+			} else {
+				WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu);
+			}
 		}
 	} else {
 		per_cpu(alert_counter,cpu) = 0;
@@ -133,7 +163,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		/* originally wanted the below chunk to be in CPU_UP_PREPARE, but caps is unpriv for non-CPU0 */
-		wd_attr.sample_period = cpu_khz * 1000;
+		wd_attr.sample_period = hw_nmi_get_sample_period();
 		event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow);
 		if (IS_ERR(event)) {
 			printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b244846..ac72c9e6bd9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -60,6 +60,10 @@
 #include <asm/io.h>
 #endif
 
+#ifdef CONFIG_NMI_WATCHDOG
+#include <linux/nmi.h>
+#endif
+
 
 #if defined(CONFIG_SYSCTL)
 
@@ -692,7 +696,16 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
 	},
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+#if defined(CONFIG_NMI_WATCHDOG)
+	{
+		.procname       = "nmi_watchdog",
+		.data           = &nmi_watchdog_enabled,
+		.maxlen         = sizeof (int),
+		.mode           = 0644,
+		.proc_handler   = proc_nmi_enabled,
+	},
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG)
 	{
 		.procname       = "unknown_nmi_panic",
 		.data           = &unknown_nmi_panic,
-- 
cgit v1.2.3-70-g09d2


From 2cc4452bc31fc1cde6f0b64a4eb13269f982787d Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 18 Feb 2010 21:56:52 -0500
Subject: nmi_watchdog: Fix undefined 'apic' build bug

Ingo provided me a config that fails to compile with:

  arch/x86/built-in.o: In function
  `arch_trigger_all_cpu_backtrace': (.text+0x17e78): undefined
  reference to `apic' make: *** [.tmp_vmlinux1] Error 1

I realized I changed the compile behaviour of the nmi code by
not wrapping it with CONFIG_X86_LOCAL_APIC.  To fix this I add a
compile check for ARCH_HAS_NMI_WATCHDOG around
arch_trigger_all_cpu_backtrace.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: a.p.zijlstra@chello.nl
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
LKML-Reference: <1266548212-24243-1-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/hw_nmi.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 312d772c5c3..0b4d205a6b8 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -92,6 +92,7 @@ u64 hw_nmi_get_sample_period(void)
         return cpu_khz * 1000;
 }
 
+#ifdef ARCH_HAS_NMI_WATCHDOG
 void arch_trigger_all_cpu_backtrace(void)
 {
 	int i;
@@ -108,6 +109,7 @@ void arch_trigger_all_cpu_backtrace(void)
 		mdelay(1);
 	}
 }
+#endif
 
 /* STUB calls to mimic old nmi_watchdog behaviour */
 #if defined(CONFIG_X86_LOCAL_APIC)
-- 
cgit v1.2.3-70-g09d2


From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Mon, 22 Feb 2010 18:09:03 -0500
Subject: nmi_watchdog: Clean up various small details

Mostly copy/paste whitespace damage with a couple of nitpicks by
the checkpatch script. Fix the struct definition as requested by Ingo too.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: peterz@infradead.org
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--
 arch/x86/kernel/apic/hw_nmi.c |   14 +++++------
 arch/x86/kernel/traps.c       |    6 ++--
 include/linux/nmi.h           |    2 -
 kernel/nmi_watchdog.c         |   51 ++++++++++++++++++++----------------------
 4 files changed, 36 insertions(+), 37 deletions(-)
---
 arch/x86/kernel/apic/hw_nmi.c | 14 ++++++------
 arch/x86/kernel/traps.c       |  6 ++---
 include/linux/nmi.h           |  2 +-
 kernel/nmi_watchdog.c         | 51 +++++++++++++++++++++----------------------
 4 files changed, 36 insertions(+), 37 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 0b4d205a6b8..e8b78a0be5d 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu)
 	irqs += per_cpu(irq_stat, cpu).apic_timer_irqs;
 #endif
 
-        return irqs;
+	return irqs;
 }
 
 static inline int mce_in_progress(void)
 {
 #if defined(CONFIG_X86_MCE)
-        return atomic_read(&mce_entry) > 0;
+	return atomic_read(&mce_entry) > 0;
 #endif
-        return 0;
+	return 0;
 }
 
 int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
@@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 	}
 
 	/* if we are doing an mce, just assume the cpu is not stuck */
-        /* Could check oops_in_progress here too, but it's safer not to */
-        if (mce_in_progress())
-                return 0;
+	/* Could check oops_in_progress here too, but it's safer not to */
+	if (mce_in_progress())
+		return 0;
 
 	/* We determine if the cpu is stuck by checking whether any
 	 * interrupts have happened since we last checked.  Of course
@@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 
 u64 hw_nmi_get_sample_period(void)
 {
-        return cpu_khz * 1000;
+	return cpu_khz * 1000;
 }
 
 #ifdef ARCH_HAS_NMI_WATCHDOG
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 973cbc4f044..bdc7fab3ef3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 			return;
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-	        			                == NOTIFY_STOP)
-	                return;
+		if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+							== NOTIFY_STOP)
+			return;
 
 #ifndef CONFIG_NMI_WATCHDOG
 		/*
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 794e7354c5b..22cc7960b64 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -57,7 +57,7 @@ u64 hw_nmi_get_sample_period(void);
 extern int nmi_watchdog_enabled;
 struct ctl_table;
 extern int proc_nmi_enabled(struct ctl_table *, int ,
-                        void __user *, size_t *, loff_t *);
+			void __user *, size_t *, loff_t *);
 #endif
 
 #endif
diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c
index 3c75cbf3acb..0a6f57f537a 100644
--- a/kernel/nmi_watchdog.c
+++ b/kernel/nmi_watchdog.c
@@ -50,31 +50,31 @@ void touch_all_nmi_watchdog(void)
 
 static int __init setup_nmi_watchdog(char *str)
 {
-        if (!strncmp(str, "panic", 5)) {
-                panic_on_timeout = 1;
-                str = strchr(str, ',');
-                if (!str)
-                        return 1;
-                ++str;
-        }
-        return 1;
+	if (!strncmp(str, "panic", 5)) {
+		panic_on_timeout = 1;
+		str = strchr(str, ',');
+		if (!str)
+			return 1;
+		++str;
+	}
+	return 1;
 }
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
 struct perf_event_attr wd_hw_attr = {
-	.type = PERF_TYPE_HARDWARE,
-	.config = PERF_COUNT_HW_CPU_CYCLES,
-	.size = sizeof(struct perf_event_attr),
-	.pinned = 1,
-	.disabled = 1,
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
 };
 
 struct perf_event_attr wd_sw_attr = {
-	.type = PERF_TYPE_SOFTWARE,
-	.config = PERF_COUNT_SW_CPU_CLOCK,
-	.size = sizeof(struct perf_event_attr),
-	.pinned = 1,
-	.disabled = 1,
+	.type		= PERF_TYPE_SOFTWARE,
+	.config		= PERF_COUNT_SW_CPU_CLOCK,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
 };
 
 void wd_overflow(struct perf_event *event, int nmi,
@@ -95,16 +95,15 @@ void wd_overflow(struct perf_event *event, int nmi,
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
-		per_cpu(alert_counter,cpu) += 1;
-		if (per_cpu(alert_counter,cpu) == 5) {
-			if (panic_on_timeout) {
+		per_cpu(alert_counter, cpu) += 1;
+		if (per_cpu(alert_counter, cpu) == 5) {
+			if (panic_on_timeout)
 				panic("NMI Watchdog detected LOCKUP on cpu %d", cpu);
-			} else {
+			else
 				WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu);
-			}
 		}
 	} else {
-		per_cpu(alert_counter,cpu) = 0;
+		per_cpu(alert_counter, cpu) = 0;
 	}
 
 	return;
@@ -126,7 +125,7 @@ static int enable_nmi_watchdog(int cpu)
 		event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow);
 		if (IS_ERR(event)) {
 			/* hardware doesn't exist or not supported, fallback to software events */
-			printk("nmi_watchdog: hardware not available, trying software events\n");
+			printk(KERN_INFO "nmi_watchdog: hardware not available, trying software events\n");
 			wd_attr = &wd_sw_attr;
 			wd_attr->sample_period = NSEC_PER_SEC;
 			event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow);
@@ -182,7 +181,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write,
 	if (nmi_watchdog_enabled) {
 		for_each_online_cpu(cpu)
 			if (enable_nmi_watchdog(cpu)) {
-				printk("NMI watchdog failed configuration, "
+				printk(KERN_ERR "NMI watchdog failed configuration, "
 					" can not be enabled\n");
 			}
 	} else {
-- 
cgit v1.2.3-70-g09d2


From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 7 May 2010 17:11:44 -0400
Subject: lockup_detector: Combine nmi_watchdog and softlockup detector

The new nmi_watchdog (which uses the perf event subsystem) is very
similar in structure to the softlockup detector.  Using Ingo's
suggestion, I combined the two functionalities into one file:
kernel/watchdog.c.

Now both the nmi_watchdog (or hardlockup detector) and softlockup
detector sit on top of the perf event subsystem, which is run every
60 seconds or so to see if there are any lockups.

To detect hardlockups, cpus not responding to interrupts, I
implemented an hrtimer that runs 5 times for every perf event
overflow event.  If that stops counting on a cpu, then the cpu is
most likely in trouble.

To detect softlockups, tasks not yielding to the scheduler, I used the
previous kthread idea that now gets kicked every time the hrtimer fires.
If the kthread isn't being scheduled neither is anyone else and the
warning is printed to the console.

I tested this on x86_64 and both the softlockup and hardlockup paths
work.

V2:
- cleaned up the Kconfig and softlockup combination
- surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI
- separated out the softlockup case from perf event subsystem
- re-arranged the enabling/disabling nmi watchdog from proc space
- added cpumasks for hardlockup failure cases
- removed fallback to soft events if no PMU exists for hard events

V3:
- comment cleanups
- drop support for older softlockup code
- per_cpu cleanups
- completely remove software clock base hardlockup detector
- use per_cpu masking on hard/soft lockup detection
- #ifdef cleanups
- rename config option NMI_WATCHDOG to LOCKUP_DETECTOR
- documentation additions

V4:
- documentation fixes
- convert per_cpu to __get_cpu_var
- powerpc compile fixes

V5:
- split apart warn flags for hard and soft lockups

TODO:
- figure out how to make an arch-agnostic clock2cycles call
  (if possible) to feed into perf events as a sample period

[fweisbec: merged conflict patch]

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/kernel-parameters.txt |   2 +
 arch/x86/include/asm/nmi.h          |   2 +-
 arch/x86/kernel/apic/Makefile       |   4 +-
 arch/x86/kernel/apic/hw_nmi.c       |   2 +-
 arch/x86/kernel/traps.c             |   4 +-
 include/linux/nmi.h                 |   8 +-
 include/linux/sched.h               |   6 +
 init/Kconfig                        |   5 +-
 kernel/Makefile                     |   3 +-
 kernel/sysctl.c                     |  21 +-
 kernel/watchdog.c                   | 592 ++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug                   |  30 +-
 12 files changed, 650 insertions(+), 29 deletions(-)
 create mode 100644 kernel/watchdog.c

(limited to 'arch/x86/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 839b21b0699..dfe8d1c226c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1777,6 +1777,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nousb		[USB] Disable the USB subsystem
 
+	nowatchdog	[KNL] Disable the lockup detector.
+
 	nowb		[ARM]
 
 	nox2apic	[X86-64,APIC] Do not enable x2APIC mode.
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 5b41b0feb6d..932f0f86b4b 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_NMI_WATCHDOG)
+#if !defined(CONFIG_LOCKUP_DETECTOR)
 extern int nmi_watchdog_enabled;
 #endif
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 1a4512e48d2..52f32e0ea19 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,10 @@
 #
 
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_NMI_WATCHDOG),y)
+ifneq ($(CONFIG_LOCKUP_DETECTOR),y)
 obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
 endif
-obj-$(CONFIG_NMI_WATCHDOG)	+= hw_nmi.o
+obj-$(CONFIG_LOCKUP_DETECTOR)	+= hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index e8b78a0be5d..79425f96fce 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 
 u64 hw_nmi_get_sample_period(void)
 {
-	return cpu_khz * 1000;
+	return (u64)(cpu_khz) * 1000 * 60;
 }
 
 #ifdef ARCH_HAS_NMI_WATCHDOG
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bdc7fab3ef3..bd347c2b34d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 							== NOTIFY_STOP)
 			return;
 
-#ifndef CONFIG_NMI_WATCHDOG
+#ifndef CONFIG_LOCKUP_DETECTOR
 		/*
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
@@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		if (nmi_watchdog_tick(regs, reason))
 			return;
 		if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_NMI_WATCHDOG */
+#endif /* !CONFIG_LOCKUP_DETECTOR */
 			unknown_nmi_error(reason, regs);
 #else
 		unknown_nmi_error(reason, regs);
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 22cc7960b64..abd48aacaf7 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void);
 extern void acpi_nmi_disable(void);
 extern void acpi_nmi_enable(void);
 #else
-#ifndef CONFIG_NMI_WATCHDOG
+#ifndef CONFIG_LOCKUP_DETECTOR
 static inline void touch_nmi_watchdog(void)
 {
 	touch_softlockup_watchdog();
@@ -51,12 +51,12 @@ static inline bool trigger_all_cpu_backtrace(void)
 }
 #endif
 
-#ifdef CONFIG_NMI_WATCHDOG
+#ifdef CONFIG_LOCKUP_DETECTOR
 int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(void);
-extern int nmi_watchdog_enabled;
+extern int watchdog_enabled;
 struct ctl_table;
-extern int proc_nmi_enabled(struct ctl_table *, int ,
+extern int proc_dowatchdog_enabled(struct ctl_table *, int ,
 			void __user *, size_t *, loff_t *);
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dad7f668ebf..37efe8fa530 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -346,6 +346,12 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_LOCKUP_DETECTOR
+extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
+				  void __user *buffer,
+				  size_t *lenp, loff_t *ppos);
+#endif
+
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 
diff --git a/init/Kconfig b/init/Kconfig
index c6c8903cb53..e44e25422f2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -944,8 +944,11 @@ config PERF_USE_VMALLOC
 
 config PERF_EVENTS_NMI
 	bool
+	depends on PERF_EVENTS
 	help
-	  Arch has support for nmi_watchdog
+	  System hardware can generate an NMI using the perf event
+	  subsystem.  Also has support for calculating CPU cycle events
+	  to determine how many clock cycles in a given period.
 
 menu "Kernel Performance Events And Counters"
 
diff --git a/kernel/Makefile b/kernel/Makefile
index d5c30060ac1..6adeafc3e25 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -76,9 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
-obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
-obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a38af430f0d..0f9adda85f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -74,7 +74,7 @@
 #include <scsi/sg.h>
 #endif
 
-#ifdef CONFIG_NMI_WATCHDOG
+#ifdef CONFIG_LOCKUP_DETECTOR
 #include <linux/nmi.h>
 #endif
 
@@ -686,16 +686,25 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
 	},
-#if defined(CONFIG_NMI_WATCHDOG)
+#if defined(CONFIG_LOCKUP_DETECTOR)
 	{
-		.procname       = "nmi_watchdog",
-		.data           = &nmi_watchdog_enabled,
+		.procname       = "watchdog",
+		.data           = &watchdog_enabled,
 		.maxlen         = sizeof (int),
 		.mode           = 0644,
-		.proc_handler   = proc_nmi_enabled,
+		.proc_handler   = proc_dowatchdog_enabled,
+	},
+	{
+		.procname	= "watchdog_thresh",
+		.data		= &softlockup_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dowatchdog_thresh,
+		.extra1		= &neg_one,
+		.extra2		= &sixty,
 	},
 #endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG)
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
 	{
 		.procname       = "unknown_nmi_panic",
 		.data           = &unknown_nmi_panic,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 00000000000..6b7fad8497a
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,592 @@
+/*
+ * Detect hard and soft lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * this code detects hard lockups: incidents where, on a CPU,
+ * the kernel does not respond to anything except NMI.
+ *
+ * Note: Most of this code is borrowed heavily from softlockup.c,
+ * so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
+ * to those contributors as well.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+int watchdog_enabled;
+int __read_mostly softlockup_thresh = 60;
+
+static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
+static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(bool, softlockup_touch_sync);
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+#ifdef CONFIG_PERF_EVENTS_NMI
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+#endif
+
+static int __read_mostly did_panic;
+static int __initdata no_watchdog;
+
+
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+#ifdef CONFIG_PERF_EVENTS_NMI
+static int hardlockup_panic;
+
+static int __init hardlockup_panic_setup(char *str)
+{
+	if (!strncmp(str, "panic", 5))
+		hardlockup_panic = 1;
+	return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+#endif
+
+unsigned int __read_mostly softlockup_panic =
+			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+
+static int __init softlockup_panic_setup(char *str)
+{
+	softlockup_panic = simple_strtoul(str, NULL, 0);
+
+	return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
+static int __init nowatchdog_setup(char *str)
+{
+	no_watchdog = 1;
+	return 1;
+}
+__setup("nowatchdog", nowatchdog_setup);
+
+/* deprecated */
+static int __init nosoftlockup_setup(char *str)
+{
+	no_watchdog = 1;
+	return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+/*  */
+
+
+/*
+ * Returns seconds, approximately.  We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(int this_cpu)
+{
+	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+}
+
+static u64 get_sample_period(void)
+{
+	/*
+	 * Convert softlockup_thresh from seconds to ns.  The divide by 5
+	 * gives the hrtimer 5 chances to increment before the hardlockup
+	 * detector generates a warning.  Widen to u64 before multiplying:
+	 * 12 * NSEC_PER_SEC does not fit in a 32-bit unsigned long.
+	 */
+	return (u64)(softlockup_thresh / 5) * NSEC_PER_SEC;
+}
+
+/* Commands for resetting the watchdog */
+static void __touch_watchdog(void)
+{
+	int this_cpu = raw_smp_processor_id();
+
+	__get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+}
+
+void touch_watchdog(void)
+{
+	__get_cpu_var(watchdog_touch_ts) = 0;
+}
+EXPORT_SYMBOL(touch_watchdog);
+
+void touch_all_watchdog(void)
+{
+	int cpu;
+
+	/*
+	 * this is done lockless
+	 * do we care if a 0 races with a timestamp?
+	 * all it means is the softlock check starts one cycle later
+	 */
+	for_each_online_cpu(cpu)
+		per_cpu(watchdog_touch_ts, cpu) = 0;
+}
+
+void touch_nmi_watchdog(void)
+{
+	touch_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+void touch_all_nmi_watchdog(void)
+{
+	touch_all_watchdog();
+}
+
+void touch_softlockup_watchdog(void)
+{
+	touch_watchdog();
+}
+
+void touch_all_softlockup_watchdogs(void)
+{
+	touch_all_watchdog();
+}
+
+void touch_softlockup_watchdog_sync(void)
+{
+	__raw_get_cpu_var(softlockup_touch_sync) = true;
+	__raw_get_cpu_var(watchdog_touch_ts) = 0;
+}
+
+void softlockup_tick(void)
+{
+}
+
+#ifdef CONFIG_PERF_EVENTS_NMI
+/* watchdog detector functions */
+static int is_hardlockup(int cpu)
+{
+	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+		return 1;
+
+	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+	return 0;
+}
+#endif
+
+static int is_softlockup(unsigned long touch_ts, int cpu)
+{
+	unsigned long now = get_timestamp(cpu);
+
+	/* Warn about unreasonable delays: */
+	if (time_after(now, touch_ts + softlockup_thresh))
+		return now - touch_ts;
+
+	return 0;
+}
+
+static int
+watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = watchdog_panic,
+};
+
+#ifdef CONFIG_PERF_EVENTS_NMI
+static struct perf_event_attr wd_hw_attr = {
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
+};
+
+/* Callback function for perf event subsystem */
+void watchdog_overflow_callback(struct perf_event *event, int nmi,
+		 struct perf_sample_data *data,
+		 struct pt_regs *regs)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long touch_ts = per_cpu(watchdog_touch_ts, this_cpu);
+
+	if (touch_ts == 0) {
+		__touch_watchdog();
+		return;
+	}
+
+	/* check for a hardlockup
+	 * This is done by making sure our timer interrupt
+	 * is incrementing.  The timer interrupt should have
+	 * fired multiple times before we overflow'd.  If it hasn't
+	 * then this is a good indication the cpu is stuck
+	 */
+	if (is_hardlockup(this_cpu)) {
+		/* only print hardlockups once */
+		if (__get_cpu_var(hard_watchdog_warn) == true)
+			return;
+
+		if (hardlockup_panic)
+			panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+		else
+			WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+
+		__get_cpu_var(hard_watchdog_warn) = true;
+		return;
+	}
+
+	__get_cpu_var(hard_watchdog_warn) = false;
+	return;
+}
+static void watchdog_interrupt_count(void)
+{
+	__get_cpu_var(hrtimer_interrupts)++;
+}
+#else
+static inline void watchdog_interrupt_count(void) { return; }
+#endif /* CONFIG_PERF_EVENTS_NMI */
+
+/* watchdog kicker functions */
+static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
+	struct pt_regs *regs = get_irq_regs();
+	int duration;
+
+	/* kick the hardlockup detector */
+	watchdog_interrupt_count();
+
+	/* kick the softlockup detector */
+	wake_up_process(__get_cpu_var(softlockup_watchdog));
+
+	/* .. and repeat */
+	hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
+
+	if (touch_ts == 0) {
+		if (unlikely(per_cpu(softlockup_touch_sync, this_cpu))) {
+			/*
+			 * If the time stamp was touched atomically
+			 * make sure the scheduler tick is up to date.
+			 */
+			per_cpu(softlockup_touch_sync, this_cpu) = false;
+			sched_clock_tick();
+		}
+		__touch_watchdog();
+		return HRTIMER_RESTART;
+	}
+
+	/* check for a softlockup
+	 * This is done by making sure a high priority task is
+	 * being scheduled.  The task touches the watchdog to
+	 * indicate it is getting cpu time.  If it hasn't then
+	 * this is a good indication some task is hogging the cpu
+	 */
+	duration = is_softlockup(touch_ts, this_cpu);
+	if (unlikely(duration)) {
+		/* only warn once */
+		if (__get_cpu_var(soft_watchdog_warn) == true)
+			return HRTIMER_RESTART;
+
+		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+			this_cpu, duration,
+			current->comm, task_pid_nr(current));
+		print_modules();
+		print_irqtrace_events(current);
+		if (regs)
+			show_regs(regs);
+		else
+			dump_stack();
+
+		if (softlockup_panic)
+			panic("softlockup: hung tasks");
+		__get_cpu_var(soft_watchdog_warn) = true;
+	} else
+		__get_cpu_var(soft_watchdog_warn) = false;
+
+	return HRTIMER_RESTART;
+}
+
+
+/*
+ * The watchdog thread - touches the timestamp.
+ */
+static int watchdog(void *__bind_cpu)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, (unsigned long)__bind_cpu);
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+
+	/* initialize timestamp */
+	__touch_watchdog();
+
+	/* kick off the timer for the hardlockup detector */
+	/* done here because hrtimer_start can only pin to smp_processor_id() */
+	hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
+		      HRTIMER_MODE_REL_PINNED);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	/*
+	 * Run briefly each time watchdog_timer_fn() wakes us to reset the
+	 * softlockup timestamp.  If this gets delayed for more than
+	 * softlockup_thresh seconds, watchdog_timer_fn() prints the warning.
+	 */
+	while (!kthread_should_stop()) {
+		__touch_watchdog();
+		schedule();
+
+		if (kthread_should_stop())
+			break;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+
+#ifdef CONFIG_PERF_EVENTS_NMI
+static int watchdog_nmi_enable(int cpu)
+{
+	struct perf_event_attr *wd_attr;
+	struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+	/* is it already setup and enabled? */
+	if (event && event->state > PERF_EVENT_STATE_OFF)
+		goto out;
+
+	/* it is setup but not enabled */
+	if (event != NULL)
+		goto out_enable;
+
+	/* Try to register using hardware perf events */
+	wd_attr = &wd_hw_attr;
+	wd_attr->sample_period = hw_nmi_get_sample_period();
+	event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
+	if (!IS_ERR(event)) {
+		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
+		goto out_save;
+	}
+
+	printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
+	return -1;
+
+	/* success path */
+out_save:
+	per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+	perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+	return 0;
+}
+
+static void watchdog_nmi_disable(int cpu)
+{
+	struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+	if (event) {
+		perf_event_disable(event);
+		per_cpu(watchdog_ev, cpu) = NULL;
+
+		/* should be in cleanup, but blocks oprofile */
+		perf_event_release_kernel(event);
+	}
+	return;
+}
+#else
+static int watchdog_nmi_enable(int cpu) { return 0; }
+static void watchdog_nmi_disable(int cpu) { return; }
+#endif /* CONFIG_PERF_EVENTS_NMI */
+
+/* prepare/enable/disable routines */
+static int watchdog_prepare_cpu(int cpu)
+{
+	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
+
+	WARN_ON(per_cpu(softlockup_watchdog, cpu));
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = watchdog_timer_fn;
+
+	return 0;
+}
+
+/* Enable the NMI event and watchdog kthread for @cpu; 0 on success, else -1. */
+static int watchdog_enable(int cpu)
+{
+	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+
+	/* enable the perf event */
+	if (watchdog_nmi_enable(cpu) != 0)
+		return -1;
+
+	/* create the watchdog thread */
+	if (!p) {
+		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
+		if (IS_ERR(p)) {
+			/* NOTE(review): the perf event enabled above is left running */
+			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
+			return -1;
+		}
+		kthread_bind(p, cpu);
+		per_cpu(watchdog_touch_ts, cpu) = 0;
+		per_cpu(softlockup_watchdog, cpu) = p;
+		wake_up_process(p);
+	}
+
+	/* if any cpu succeeds, watchdog is considered enabled for the system */
+	watchdog_enabled = 1;
+
+	return 0;
+}
+
+/* Stop the hrtimer, perf event and watchdog kthread of @cpu. */
+static void watchdog_disable(int cpu)
+{
+	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
+
+	/* cancel the timer first: stops the stats and the kthread wakeups */
+	hrtimer_cancel(hrtimer);
+
+	/* disable the perf event */
+	watchdog_nmi_disable(cpu);
+
+	/* stop the watchdog thread */
+	if (p) {
+		per_cpu(softlockup_watchdog, cpu) = NULL;
+		kthread_stop(p);
+	}
+}
+
+/* Enable on all online cpus; result starts at 0 as failures (-1) are summed. */
+static void watchdog_enable_all_cpus(void)
+{
+	int cpu;
+	int result = 0;
+
+	for_each_online_cpu(cpu)
+		result += watchdog_enable(cpu);
+
+	if (result)
+		printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
+}
+
+static void watchdog_disable_all_cpus(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		watchdog_disable(cpu);
+
+	/* if all watchdogs are disabled, then they are disabled for the system */
+	watchdog_enabled = 0;
+}
+
+
+/* sysctl functions */
+#ifdef CONFIG_SYSCTL
+/* proc handler for /proc/sys/kernel/watchdog; acts only on successful writes */
+int proc_dowatchdog_enabled(struct ctl_table *table, int write,
+		     void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret = proc_dointvec(table, write, buffer, length, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	if (watchdog_enabled)
+		watchdog_enable_all_cpus();
+	else
+		watchdog_disable_all_cpus();
+	return 0;
+}
+
+int proc_dowatchdog_thresh(struct ctl_table *table, int write,
+			     void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+/* stub functions */
+int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+			     void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos);
+}
+/* end of stub functions */
+#endif /* CONFIG_SYSCTL */
+
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __cpuinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		if (watchdog_prepare_cpu(hotcpu))
+			return NOTIFY_BAD;
+		break;
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		if (watchdog_enable(hotcpu))
+			return NOTIFY_BAD;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		watchdog_disable(hotcpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		watchdog_disable(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+static int __init spawn_watchdog_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	int err;
+
+	if (no_watchdog)
+		return 0;
+
+	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	WARN_ON(err == NOTIFY_BAD);
+
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+	return 0;
+}
+early_initcall(spawn_watchdog_task);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 220ae6063b6..49e285dcaf5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -153,7 +153,7 @@ config DEBUG_SHIRQ
 	  points; some don't and need to be caught.
 
 config DETECT_SOFTLOCKUP
-	bool "Detect Soft Lockups"
+	bool
 	depends on DEBUG_KERNEL && !S390
 	default y
 	help
@@ -171,17 +171,27 @@ config DETECT_SOFTLOCKUP
 	   can be detected via the NMI-watchdog, on platforms that
 	   support it.)
 
-config NMI_WATCHDOG
-	bool "Detect Hard Lockups with an NMI Watchdog"
-	depends on DEBUG_KERNEL && PERF_EVENTS && PERF_EVENTS_NMI
+config LOCKUP_DETECTOR
+	bool "Detect Hard and Soft Lockups"
+	depends on DEBUG_KERNEL
+	default DETECT_SOFTLOCKUP
 	help
-	  Say Y here to enable the kernel to use the NMI as a watchdog
-	  to detect hard lockups.  This is useful when a cpu hangs for no
-	  reason but can still respond to NMIs.  A backtrace is displayed
-	  for reviewing and reporting.
+	  Say Y here to enable the kernel to act as a watchdog to detect
+	  hard and soft lockups.
+
+	  Softlockups are bugs that cause the kernel to loop in kernel
+	  mode for more than 60 seconds, without giving other tasks a
+	  chance to run.  The current stack trace is displayed upon
+	  detection and the system will stay locked up.
+
+	  Hardlockups are bugs that cause the CPU to loop in kernel mode
+	  for more than 60 seconds, without letting other interrupts have a
+	  chance to run.  The current stack trace is displayed upon detection
+	  and the system will stay locked up.
 
-	  The overhead should be minimal, just an extra NMI every few
-	  seconds.
+	  The overhead should be minimal.  A periodic hrtimer runs to
+	  generate interrupts and kick the watchdog task every 10-12 seconds.
+	  An NMI is generated every 60 seconds or so to check for hardlockups.
 
 config BOOTPARAM_SOFTLOCKUP_PANIC
 	bool "Panic (Reboot) On Soft Lockups"
-- 
cgit v1.2.3-70-g09d2


From 7cbb7e7fa46f6e5229438ac9e4a5c72ec0d53e0b Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 7 May 2010 17:11:48 -0400
Subject: x86: Move trigger_all_cpu_backtrace to its own die_notifier

As part of the transition of the nmi watchdog to something more
generic, the trigger_all_cpu_backtrace code is getting left behind.
Put it in its own die_notifier so it can still be used.

V2:
- use arch_spin_locks

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
LKML-Reference: <1273266711-18706-6-git-send-email-dzickus@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/apic/hw_nmi.c | 65 +++++++++++++++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 79425f96fce..8c3edfb89c2 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -17,6 +17,10 @@
 #include <linux/cpumask.h>
 #include <linux/kernel_stat.h>
 #include <asm/mce.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+
 
 #include <linux/nmi.h>
 #include <linux/module.h>
@@ -54,20 +58,6 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 	unsigned int sum;
 	int cpu = smp_processor_id();
 
-	/* FIXME: cheap hack for this check, probably should get its own
-	 * die_notifier handler
-	 */
-	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
-
-		spin_lock(&lock);
-		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
-		show_regs(regs);
-		dump_stack();
-		spin_unlock(&lock);
-		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-	}
-
 	/* if we are doing an mce, just assume the cpu is not stuck */
 	/* Could check oops_in_progress here too, but it's safer not to */
 	if (mce_in_progress())
@@ -109,6 +99,53 @@ void arch_trigger_all_cpu_backtrace(void)
 		mdelay(1);
 	}
 }
+
+static int __kprobes
+arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct pt_regs *regs;
+	int cpu = smp_processor_id();
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	regs = args->regs;
+
+	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+		static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+		arch_spin_lock(&lock);
+		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+		show_regs(regs);
+		dump_stack();
+		arch_spin_unlock(&lock);
+		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+		return NOTIFY_STOP;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static __read_mostly struct notifier_block backtrace_notifier = {
+	.notifier_call          = arch_trigger_all_cpu_backtrace_handler,
+	.next                   = NULL,
+	.priority               = 1
+};
+
+static int __init register_trigger_all_cpu_backtrace(void)
+{
+	register_die_notifier(&backtrace_notifier);
+	return 0;
+}
+early_initcall(register_trigger_all_cpu_backtrace);
 #endif
 
 /* STUB calls to mimic old nmi_watchdog behaviour */
-- 
cgit v1.2.3-70-g09d2


From 10f9014912a2b1cb59c39cdea777e6d9afa8f17e Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 7 May 2010 17:11:49 -0400
Subject: x86: Cleanup hw_nmi.c cruft

The design of the hardlockup watchdog has changed and cruft was left
behind in the hw_nmi.c file.  Just remove the code that isn't used
anymore.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
LKML-Reference: <1273266711-18706-7-git-send-email-dzickus@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/apic/hw_nmi.c | 58 -------------------------------------------
 1 file changed, 58 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 8c3edfb89c2..3b40082f037 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -9,74 +9,16 @@
  *
  */
 
-#include <asm/apic.h>
-#include <linux/smp.h>
 #include <linux/cpumask.h>
-#include <linux/sched.h>
-#include <linux/percpu.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <asm/mce.h>
 #include <linux/kdebug.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
-
-
 #include <linux/nmi.h>
 #include <linux/module.h>
 
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
-	unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs;
-
-#if defined(CONFIG_X86_LOCAL_APIC)
-	irqs += per_cpu(irq_stat, cpu).apic_timer_irqs;
-#endif
-
-	return irqs;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_MCE)
-	return atomic_read(&mce_entry) > 0;
-#endif
-	return 0;
-}
-
-int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
-{
-	unsigned int sum;
-	int cpu = smp_processor_id();
-
-	/* if we are doing an mce, just assume the cpu is not stuck */
-	/* Could check oops_in_progress here too, but it's safer not to */
-	if (mce_in_progress())
-		return 0;
-
-	/* We determine if the cpu is stuck by checking whether any
-	 * interrupts have happened since we last checked.  Of course
-	 * an nmi storm could create false positives, but the higher
-	 * level logic should account for that
-	 */
-	sum = get_timer_irqs(cpu);
-	if (__get_cpu_var(last_irq_sum) == sum) {
-		return 1;
-	} else {
-		__get_cpu_var(last_irq_sum) = sum;
-		return 0;
-	}
-}
-
 u64 hw_nmi_get_sample_period(void)
 {
 	return (u64)(cpu_khz) * 1000 * 60;
-- 
cgit v1.2.3-70-g09d2


From 5e85391b3badd3f0e50ebdd0cafe0202a979f73a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 13 May 2010 09:12:39 +0200
Subject: x86, watchdog: Fix build error in hw_nmi.c

On some configs the following build error triggers:

 arch/x86/kernel/apic/hw_nmi.c:35: error: 'apic' undeclared (first use in this function)
 arch/x86/kernel/apic/hw_nmi.c:35: error: (Each undeclared identifier is reported only once
 arch/x86/kernel/apic/hw_nmi.c:35: error: for each function it appears in.)

Because asm/apic.h was only included implicitly. Include it explicitly.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
LKML-Reference: <1273713674-8434-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/hw_nmi.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 3b40082f037..cefd6942f0e 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -8,6 +8,7 @@
  *  Bits copied from original nmi.c file
  *
  */
+#include <asm/apic.h>
 
 #include <linux/cpumask.h>
 #include <linux/kdebug.h>
-- 
cgit v1.2.3-70-g09d2


From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 14 May 2010 11:11:21 -0400
Subject: lockup_detector: Cross arch compile fixes

Combining the softlockup and hardlockup code causes watchdog.c
to build even without the hardlockup detection support.

So if an arch, that has the previous and the new nmi watchdog
implementations cohabiting, wants to know if the generic one
is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check.
We need to use CONFIG_HARDLOCKUP_DETECTOR instead.

Fixes:
	kernel/built-in.o: In function `touch_nmi_watchdog':
	(.text+0x449bc): multiple definition of `touch_nmi_watchdog'
	arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
LKML-Reference: <20100514151121.GR15159@redhat.com>
[ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/apic/Makefile | 4 ++--
 include/linux/nmi.h           | 2 +-
 kernel/watchdog.c             | 7 +++++--
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 52f32e0ea19..910f20b457c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,10 @@
 #
 
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_LOCKUP_DETECTOR),y)
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
 obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
 endif
-obj-$(CONFIG_LOCKUP_DETECTOR)	+= hw_nmi.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR)	+= hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index abd48aacaf7..06aab5eee13 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void);
 extern void acpi_nmi_disable(void);
 extern void acpi_nmi_enable(void);
 #else
-#ifndef CONFIG_LOCKUP_DETECTOR
+#ifndef CONFIG_HARDLOCKUP_DETECTOR
 static inline void touch_nmi_watchdog(void)
 {
 	touch_softlockup_watchdog();
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 83fb63155cb..e53622c1465 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,13 +31,13 @@ int watchdog_enabled;
 int __read_mostly softlockup_thresh = 60;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
@@ -139,6 +139,7 @@ void touch_all_softlockup_watchdogs(void)
 		per_cpu(watchdog_touch_ts, cpu) = 0;
 }
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
 void touch_nmi_watchdog(void)
 {
 	__get_cpu_var(watchdog_nmi_touch) = true;
@@ -146,6 +147,8 @@ void touch_nmi_watchdog(void)
 }
 EXPORT_SYMBOL(touch_nmi_watchdog);
 
+#endif
+
 void touch_softlockup_watchdog_sync(void)
 {
 	__raw_get_cpu_var(softlockup_touch_sync) = true;
-- 
cgit v1.2.3-70-g09d2


From 1dedefd1a066a795a87afca9c0236e1a94de9bf6 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Wed, 19 May 2010 12:01:23 -0700
Subject: x86: detect scattered cpuid features earlier

Some extra CPU features such as ARAT are needed in early boot so
that x86_init function pointers can be set up properly.
http://lkml.org/lkml/2010/5/18/519
At start_kernel() level, this patch moves init_scattered_cpuid_features()
from check_bugs() to setup_arch() -> early_cpu_init() which is earlier than
platform specific x86_init layer setup. Suggested by HPA.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1274295685-6774-2-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1c00d0b169..284bf89ddae 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -576,6 +576,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level >= 0x80000007)
 		c->x86_power = cpuid_edx(0x80000007);
 
+	init_scattered_cpuid_features(c);
 }
 
 static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +732,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
 
 	get_model_name(c); /* Default name */
 
-	init_scattered_cpuid_features(c);
 	detect_nopl(c);
 }
 
-- 
cgit v1.2.3-70-g09d2


From a0c173bd8a3fd0541be8e4ef962170e48d8811c7 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Wed, 19 May 2010 12:01:24 -0700
Subject: x86, mrst: add cpu type detection

Medfield is the follow-up of Moorestown; it is treated under the same
HW sub-architecture. However, we do need to know the CPU type in order
for some of the drivers to act accordingly.
We also have different optimal clock configurations for each CPU type.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/mrst.h | 19 +++++++++++++++++++
 arch/x86/kernel/mrst.c      | 26 ++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 451d30e7f62..dc5c8500bfc 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -11,8 +11,27 @@
 #ifndef _ASM_X86_MRST_H
 #define _ASM_X86_MRST_H
 extern int pci_mrst_init(void);
+extern int mrst_identify_cpu(void);
 int __init sfi_parse_mrtc(struct sfi_table_header *table);
 
+/*
+ * Medfield is the follow-up of Moorestown, it combines two chip solution into
+ * one. Other than that it also added always-on and constant tsc and lapic
+ * timers. Medfield is the platform name, and the chip name is called Penwell.
+ * We treat Medfield/Penwell as a variant of Moorestown. Penwell can be
+ * identified via MSRs.
+ */
+enum mrst_cpu_type {
+	MRST_CPU_CHIP_LINCROFT = 1,
+	MRST_CPU_CHIP_PENWELL,
+};
+
+enum mrst_timer_options {
+	MRST_TIMER_DEFAULT,
+	MRST_TIMER_APBT_ONLY,
+	MRST_TIMER_LAPIC_APBT,
+};
+
 #define SFI_MTMR_MAX_NUM 8
 #define SFI_MRTC_MAX	8
 
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index e796448f0eb..ceaebeb5866 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -27,6 +27,8 @@
 
 static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
 static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
+static int mrst_cpu_chip;
+
 int sfi_mtimer_num;
 
 struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -216,6 +218,28 @@ static void __init mrst_setup_boot_clock(void)
 		setup_boot_APIC_clock();
 };
 
+int mrst_identify_cpu(void)
+{
+	return mrst_cpu_chip;
+}
+EXPORT_SYMBOL_GPL(mrst_identify_cpu);
+
+void __cpuinit mrst_arch_setup(void)
+{
+	if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
+		mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
+	else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
+		mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+	else {
+		pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
+			boot_cpu_data.x86, boot_cpu_data.x86_model);
+		mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+	}
+	pr_debug("Moorestown CPU %s identified\n",
+		(mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
+		"Lincroft" : "Penwell");
+}
+
 /*
  * Moorestown specific x86_init function overrides and early setup
  * calls.
@@ -230,6 +254,8 @@ void __init x86_mrst_early_setup(void)
 
 	x86_init.irqs.pre_vector_init = x86_init_noop;
 
+	x86_init.oem.arch_setup = mrst_arch_setup;
+
 	x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
 
 	x86_platform.calibrate_tsc = mrst_calibrate_tsc;
-- 
cgit v1.2.3-70-g09d2


From a875c01944f0d750eeb1ef3133feceb13f13c4b3 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Wed, 19 May 2010 12:01:25 -0700
Subject: x86, mrst: add more timer config options

Always-on local APIC timer (ARAT) has been introduced to Medfield. Along
with the platform APB timers, we now have more timer configuration options
across Moorestown and Medfield.

This patch adds run-time detection of available timer features so that
we can treat Medfield as a variant of Moorestown and set up the optimal
timer options for each platform. i.e.

Medfield: per cpu always-on local APIC timer
Moorestown: per cpu APB timer

Manual override is possible via cmdline option x86_mrst_timer.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/apb_timer.h |  1 -
 arch/x86/include/asm/mrst.h      |  1 +
 arch/x86/kernel/apb_timer.c      | 37 ++++-------------
 arch/x86/kernel/mrst.c           | 88 ++++++++++++++++++++++++++++------------
 4 files changed, 72 insertions(+), 55 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index c74a2eebe57..a69b1ac9eaf 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void);
 extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
 extern void apbt_setup_secondary_clock(void);
 extern unsigned int boot_cpu_id;
-extern int disable_apbt_percpu;
 
 extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
 extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index dc5c8500bfc..67ad3154577 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -12,6 +12,7 @@
 #define _ASM_X86_MRST_H
 extern int pci_mrst_init(void);
 extern int mrst_identify_cpu(void);
+extern int mrst_timer_options __cpuinitdata;
 int __init sfi_parse_mrtc(struct sfi_table_header *table);
 
 /*
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d3..8dd77800ff5 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
 
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
+#include <asm/mrst.h>
 
 #define APBT_MASK			CLOCKSOURCE_MASK(32)
 #define APBT_SHIFT			22
-#define APBT_CLOCKEVENT_RATING		150
+#define APBT_CLOCKEVENT_RATING		110
 #define APBT_CLOCKSOURCE_RATING		250
 #define APBT_MIN_DELTA_USEC		200
 
@@ -83,8 +84,6 @@ struct apbt_dev {
 	char name[10];
 };
 
-int disable_apbt_percpu __cpuinitdata;
-
 static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
 #ifdef CONFIG_SMP
@@ -194,29 +193,6 @@ static struct clock_event_device apbt_clockevent = {
 	.rating		= APBT_CLOCKEVENT_RATING,
 };
 
-/*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp("apbt_only", arg) == 0)
-		disable_apbt_percpu = 0;
-	else if (strcmp("lapic_and_apbt", arg) == 0)
-		disable_apbt_percpu = 1;
-	else {
-		pr_warning("X86 MRST timer option %s not recognised"
-			   " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
-			   arg);
-		return -EINVAL;
-	}
-	return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
 /*
  * start count down from 0xffff_ffff. this is done by toggling the enable bit
  * then load initial load count to ~0.
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
 	adev->num = smp_processor_id();
 	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
 
-	if (disable_apbt_percpu) {
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
 		apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
 		global_clock_event = &adev->evt;
 		printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 static __init int apbt_late_init(void)
 {
-	if (disable_apbt_percpu || !apb_timer_block_enabled)
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+		!apb_timer_block_enabled)
 		return 0;
 	/* This notifier should be called after workqueue is ready */
 	hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
 	int timer_num;
 	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
 
+	BUG_ON(!apbt_virt_address);
+
 	timer_num = adev->num;
 	pr_debug("%s CPU %d timer %d mode=%d\n",
 		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
 	}
 #ifdef CONFIG_SMP
 	/* kernel cmdline disable apb timer, so we will use lapic timers */
-	if (disable_apbt_percpu) {
+	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
 		printk(KERN_INFO "apbt: disabled per cpu timer\n");
 		return;
 	}
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index ceaebeb5866..636b53bd419 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,6 +25,29 @@
 #include <asm/i8259.h>
 #include <asm/apb_timer.h>
 
+/*
+ * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
+ * cmdline option x86_mrst_timer can be used to override the configuration
+ * to prefer one or the other.
+ * at runtime, there are basically three timer configurations:
+ * 1. per cpu apbt clock only
+ * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
+ * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
+ *
+ * by default (without cmdline option), platform code first detects cpu type
+ * to see if we are on lincroft or penwell, then set up both lapic or apbt
+ * clocks accordingly.
+ * i.e. by default, medfield uses configuration #2, moorestown uses #1.
+ * config #3 is supported but not recommended on medfield.
+ *
+ * rating and feature summary:
+ * lapic (with C3STOP) --------- 100
+ * apbt (always-on) ------------ 110
+ * lapic (always-on,ARAT) ------ 150
+ */
+
+int mrst_timer_options __cpuinitdata;
+
 static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
 static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
 static int mrst_cpu_chip;
@@ -169,18 +192,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
 	return 0;
 }
 
-/*
- * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
- * APBT but cmdline option can also override it.
- */
-static void __cpuinit mrst_setup_secondary_clock(void)
-{
-	/* restore default lapic clock if disabled by cmdline */
-	if (disable_apbt_percpu)
-		return setup_secondary_APIC_clock();
-	apbt_setup_secondary_clock();
-}
-
 static unsigned long __init mrst_calibrate_tsc(void)
 {
 	unsigned long flags, fast_calibrate;
@@ -197,6 +208,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
 
 void __init mrst_time_init(void)
 {
+	switch (mrst_timer_options) {
+	case MRST_TIMER_APBT_ONLY:
+		break;
+	case MRST_TIMER_LAPIC_APBT:
+		x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+		x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+		break;
+	default:
+		if (!boot_cpu_has(X86_FEATURE_ARAT))
+			break;
+		x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+		x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+		return;
+	}
+	/* we need at least one APB timer */
 	sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
 	pre_init_apic_IRQ0();
 	apbt_time_init();
@@ -207,17 +233,6 @@ void __init mrst_rtc_init(void)
 	sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
 }
 
-/*
- * if we use per cpu apb timer, the bootclock already setup. if we use lapic
- * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
- */
-static void __init mrst_setup_boot_clock(void)
-{
-	pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
-	if (disable_apbt_percpu)
-		setup_boot_APIC_clock();
-};
-
 int mrst_identify_cpu(void)
 {
 	return mrst_cpu_chip;
@@ -250,13 +265,13 @@ void __init x86_mrst_early_setup(void)
 	x86_init.resources.reserve_resources = x86_init_noop;
 
 	x86_init.timers.timer_init = mrst_time_init;
-	x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
+	x86_init.timers.setup_percpu_clockev = x86_init_noop;
 
 	x86_init.irqs.pre_vector_init = x86_init_noop;
 
 	x86_init.oem.arch_setup = mrst_arch_setup;
 
-	x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
+	x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
 
 	x86_platform.calibrate_tsc = mrst_calibrate_tsc;
 	x86_init.pci.init = pci_mrst_init;
@@ -269,3 +284,26 @@ void __init x86_mrst_early_setup(void)
 	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
 
 }
+
+/*
+ * if user does not want to use per CPU apb timer, just give it a lower rating
+ * than local apic timer and skip the late per cpu timer init.
+ */
+static inline int __init setup_x86_mrst_timer(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strcmp("apbt_only", arg) == 0)
+		mrst_timer_options = MRST_TIMER_APBT_ONLY;
+	else if (strcmp("lapic_and_apbt", arg) == 0)
+		mrst_timer_options = MRST_TIMER_LAPIC_APBT;
+	else {
+		pr_warning("X86 MRST timer option %s not recognised"
+			   " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
+			   arg);
+		return -EINVAL;
+	}
+	return 0;
+}
+__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-- 
cgit v1.2.3-70-g09d2


From a75af580bb1fd261bf63cc00e4b324e17ceb15cf Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 19 May 2010 13:40:14 -0700
Subject: x86, mrst: make mrst_identify_cpu() an inline returning enum

We have an enum, might as well use it.  While we're at it, make it an
inline... there is really no point in calling a function for this
stuff.

LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 arch/x86/include/asm/mrst.h |  7 ++++++-
 arch/x86/kernel/mrst.c      | 17 ++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 67ad3154577..1869c18d15c 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_X86_MRST_H
 #define _ASM_X86_MRST_H
 extern int pci_mrst_init(void);
-extern int mrst_identify_cpu(void);
 extern int mrst_timer_options __cpuinitdata;
 int __init sfi_parse_mrtc(struct sfi_table_header *table);
 
@@ -27,6 +26,12 @@ enum mrst_cpu_type {
 	MRST_CPU_CHIP_PENWELL,
 };
 
+extern enum mrst_cpu_type __mrst_cpu_chip;
+static enum mrst_cpu_type mrst_identify_cpu(void)
+{
+	return __mrst_cpu_chip;
+}
+
 enum mrst_timer_options {
 	MRST_TIMER_DEFAULT,
 	MRST_TIMER_APBT_ONLY,
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 636b53bd419..967f2686adb 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -50,7 +50,8 @@ int mrst_timer_options __cpuinitdata;
 
 static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
 static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
-static int mrst_cpu_chip;
+enum mrst_cpu_type __mrst_cpu_chip;
+EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
 
 int sfi_mtimer_num;
 
@@ -233,25 +234,19 @@ void __init mrst_rtc_init(void)
 	sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
 }
 
-int mrst_identify_cpu(void)
-{
-	return mrst_cpu_chip;
-}
-EXPORT_SYMBOL_GPL(mrst_identify_cpu);
-
 void __cpuinit mrst_arch_setup(void)
 {
 	if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
-		mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
+		__mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
 	else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
-		mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+		__mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
 	else {
 		pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
 			boot_cpu_data.x86, boot_cpu_data.x86_model);
-		mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+		__mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
 	}
 	pr_debug("Moorestown CPU %s identified\n",
-		(mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
+		(__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
 		"Lincroft" : "Penwell");
 }
 
-- 
cgit v1.2.3-70-g09d2


From 14671386dcbafb3086bbda3cb6f9f27d34c7bf6d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 19 May 2010 14:37:40 -0700
Subject: x86, mrst: make mrst_timer_options an enum

We have an enum mrst_timer_options, use it so that the kernel knows if
we're missing something from a switch statement or equivalent.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 arch/x86/include/asm/mrst.h | 3 ++-
 arch/x86/kernel/mrst.c      | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 1869c18d15c..16350740edf 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -11,7 +11,6 @@
 #ifndef _ASM_X86_MRST_H
 #define _ASM_X86_MRST_H
 extern int pci_mrst_init(void);
-extern int mrst_timer_options __cpuinitdata;
 int __init sfi_parse_mrtc(struct sfi_table_header *table);
 
 /*
@@ -38,6 +37,8 @@ enum mrst_timer_options {
 	MRST_TIMER_LAPIC_APBT,
 };
 
+extern enum mrst_timer_options mrst_timer_options;
+
 #define SFI_MTMR_MAX_NUM 8
 #define SFI_MRTC_MAX	8
 
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 967f2686adb..7ee4ed901ba 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -46,7 +46,7 @@
  * lapic (always-on,ARAT) ------ 150
  */
 
-int mrst_timer_options __cpuinitdata;
+__cpuinitdata enum mrst_timer_options mrst_timer_options;
 
 static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
 static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
-- 
cgit v1.2.3-70-g09d2


From 12a6611fa16e9c6d2f844fe2175d219c6e9bd95d Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:01 -0500
Subject: x86, UV: Calculate BAU destination timeout

Calculate the Broadcast Assist Unit's destination timeout period from the
values in the relevant MMR's.

Store it in each cpu's per-cpu BAU structure so that a destination
timeout can be differentiated from a 'plugged' situation in which all
software ack resources are already allocated and a timeout is pending.
That case returns an immediate destination error.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNx-0004Zq-RK@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h | 12 ++++++++++
 arch/x86/kernel/tlb_uv.c         | 51 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 59 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index aa558ac0306..458e04c626a 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -49,6 +49,18 @@
 #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+/* [19:16] SOFT_ACK timeout period  19: 1 is urgency 7  17:16 1 is multiplier */
+#define BAU_MISC_CONTROL_MULT_MASK 3
+
+#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
+/* [30:28] URGENCY_7  an index into a table of times */
+#define BAU_URGENCY_7_SHIFT 28
+#define BAU_URGENCY_7_MASK 7
+
+#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
+/* [45:40] BAU - BAU transaction timeout select - a multiplier */
+#define BAU_TRANS_SHIFT 40
+#define BAU_TRANS_MASK 0x3f
 
 /*
  * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 7fea555929e..5506836c4a8 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -30,6 +30,19 @@ struct msg_desc {
 	struct bau_payload_queue_entry *va_queue_last;
 };
 
+/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
+static int timeout_base_ns[] = {
+		20,
+		160,
+		1280,
+		10240,
+		81920,
+		655360,
+		5242880,
+		167772160
+};
+static int timeout_us;
+
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
 
 static int uv_bau_max_concurrent __read_mostly;
@@ -423,7 +436,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
 			 * pending.  In that case hardware returns the
 			 * ERROR that looks like a destination timeout.
 			 */
-			if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+			if (cycles_2_us(ttime - bcp->send_message) <
+							timeout_us) {
 				bcp->conseccompletes = 0;
 				return FLUSH_RETRY_PLUGGED;
 			}
@@ -908,12 +922,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
 }
 
 static inline unsigned long long
-millisec_2_cycles(unsigned long millisec)
+microsec_2_cycles(unsigned long microsec)
 {
 	unsigned long ns;
 	unsigned long long cyc;
 
-	ns = millisec * 1000;
+	ns = microsec * 1000;
 	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
 	return cyc;
 }
@@ -1258,6 +1272,33 @@ static void __init uv_init_uvhub(int uvhub, int vector)
 				      ((apicid << 32) | vector));
 }
 
+/*
+ * We will set BAU_MISC_CONTROL with a timeout period.
+ * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
+ * So the destination timeout period has to be calculated from them.
+ */
+static int
+calculate_destination_timeout(void)
+{
+	unsigned long mmr_image;
+	int mult1;
+	int mult2;
+	int index;
+	int base;
+	int ret;
+	unsigned long ts_ns;
+
+	mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
+	mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+	index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
+	mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
+	mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
+	base = timeout_base_ns[index];
+	ts_ns = base * mult1 * mult2;
+	ret = ts_ns / 1000;
+	return ret;
+}
+
 /*
  * initialize the bau_control structure for each cpu
  */
@@ -1286,6 +1327,8 @@ static void uv_init_per_cpu(int nuvhubs)
 	};
 	struct uvhub_desc *uvhub_descs;
 
+	timeout_us = calculate_destination_timeout();
+
 	uvhub_descs = (struct uvhub_desc *)
 		kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
 	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
@@ -1301,7 +1344,7 @@ static void uv_init_per_cpu(int nuvhubs)
 		bdp->uvhub = uvhub;
 		bdp->pnode = pnode;
 		/* time interval to catch a hardware stay-busy bug */
-		bcp->timeout_interval = millisec_2_cycles(3);
+		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
 		/* kludge: assume uv_hub.h is constant */
 		socket = (cpu_physical_id(cpu)>>5)&1;
 		if (socket >= bdp->num_sockets)
-- 
cgit v1.2.3-70-g09d2


From e8e5e8a8048006a12d7777a93baebd6e39496101 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:01 -0500
Subject: x86, UV: BAU tunables into a debugfs file

Make the Broadcast Assist Unit driver's nine tuning values variable by
making them accessible through a read/write debugfs file.

The file will normally be mounted as
/sys/kernel/debug/sgi_uv/bau_tunables. The tunables are kept in each
cpu's per-cpu BAU structure.

The patch also does a little name improvement, and corrects the reset of
two destination timeout counters.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNx-0004Zx-Uo@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  53 +++++---
 arch/x86/kernel/tlb_uv.c         | 281 +++++++++++++++++++++++++++++++++------
 2 files changed, 278 insertions(+), 56 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 458e04c626a..e5543c1a80c 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -45,10 +45,14 @@
 #define UV_DESC_BASE_PNODE_SHIFT	49
 #define UV_PAYLOADQ_PNODE_SHIFT		49
 #define UV_PTC_BASENAME			"sgi_uv/ptc_statistics"
+#define UV_BAU_BASENAME			"sgi_uv/bau_tunables"
+#define UV_BAU_TUNABLES_DIR		"sgi_uv"
+#define UV_BAU_TUNABLES_FILE		"bau_tunables"
+#define WHITESPACE			" \t\n"
 #define uv_physnodeaddr(x)		((__pa((unsigned long)(x)) & uv_mmask))
 #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
 /* [19:16] SOFT_ACK timeout period  19: 1 is urgency 7  17:16 1 is multiplier */
 #define BAU_MISC_CONTROL_MULT_MASK 3
 
@@ -70,25 +74,23 @@
 #define DESC_STATUS_DESTINATION_TIMEOUT	2
 #define DESC_STATUS_SOURCE_TIMEOUT	3
 
+#define TIMEOUT_DELAY			10
 /*
- * source side threshholds at which message retries print a warning
- */
-#define SOURCE_TIMEOUT_LIMIT		20
-#define DESTINATION_TIMEOUT_LIMIT	20
-
-/*
- * misc. delays, in microseconds
+ * delay for 'plugged' timeout retries, in microseconds
  */
-#define THROTTLE_DELAY			10
-#define TIMEOUT_DELAY			10
-#define BIOS_TO				1000
-/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
+#define PLUGGED_DELAY			10
 
 /*
  * threshholds at which to use IPI to free resources
  */
+/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
 #define PLUGSB4RESET 100
-#define TIMEOUTSB4RESET 100
+/* after this many consecutive timeouts, use IPI to release resources */
+#define TIMEOUTSB4RESET 1
+/* at this number of uses of IPI to release resources, give up the request */
+#define IPI_RESET_LIMIT 1
+/* after this # consecutive successes, bump up the throttle if it was lowered */
+#define COMPLETE_THRESHOLD 5
 
 /*
  * number of entries in the destination side payload queue
@@ -107,6 +109,13 @@
 #define FLUSH_GIVEUP			3
 #define FLUSH_COMPLETE			4
 
+/*
+ * tuning the action when the numalink network is extremely delayed
+ */
+#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
+#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
+#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */
+
 /*
  * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
  * If the 'multilevel' flag in the header portion of the descriptor
@@ -323,14 +332,13 @@ struct bau_control {
 	struct bau_control *uvhub_master;
 	struct bau_control *socket_master;
 	unsigned long timeout_interval;
+	unsigned long set_bau_on_time;
 	atomic_t active_descriptor_count;
-	int max_concurrent;
-	int max_concurrent_constant;
-	int retry_message_scans;
 	int plugged_tries;
 	int timeout_tries;
 	int ipi_attempts;
 	int conseccompletes;
+	int set_bau_off;
 	short cpu;
 	short uvhub_cpu;
 	short uvhub;
@@ -343,6 +351,19 @@ struct bau_control {
 	spinlock_t masks_lock;
 	spinlock_t uvhub_lock;
 	spinlock_t queue_lock;
+	/* tunables */
+	int max_bau_concurrent;
+	int max_bau_concurrent_constant;
+	int plugged_delay;
+	int plugsb4reset;
+	int timeoutsb4reset;
+	int ipi_reset_limit;
+	int complete_threshold;
+	int congested_response_us;
+	int congested_reps;
+	int congested_period;
+	cycles_t period_time;
+	long period_requests;
 };
 
 /*
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 5506836c4a8..c8661779c51 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -8,6 +8,7 @@
  */
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 
@@ -42,12 +43,22 @@ static int timeout_base_ns[] = {
 		167772160
 };
 static int timeout_us;
+static int nobau;
 
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
-
-static int uv_bau_max_concurrent __read_mostly;
+/* tunables: */
+static int max_bau_concurrent = MAX_BAU_CONCURRENT;
+static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
+static int plugged_delay = PLUGGED_DELAY;
+static int plugsb4reset = PLUGSB4RESET;
+static int timeoutsb4reset = TIMEOUTSB4RESET;
+static int ipi_reset_limit = IPI_RESET_LIMIT;
+static int complete_threshold = COMPLETE_THRESHOLD;
+static int congested_response_us = CONGESTED_RESPONSE_US;
+static int congested_reps = CONGESTED_REPS;
+static int congested_period = CONGESTED_PERIOD;
+static struct dentry *tunables_dir;
+static struct dentry *tunables_file;
 
-static int nobau;
 static int __init setup_nobau(char *arg)
 {
 	nobau = 1;
@@ -539,23 +550,24 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
+	cycles_t elapsed;
 	struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
 	struct bau_control *smaster = bcp->socket_master;
 	struct bau_control *hmaster = bcp->uvhub_master;
 
 	/*
-	 * Spin here while there are hmaster->max_concurrent or more active
+	 * Spin here while there are hmaster->max_bau_concurrent or more active
 	 * descriptors. This is the per-uvhub 'throttle'.
 	 */
 	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 			&hmaster->active_descriptor_count,
-			hmaster->max_concurrent)) {
+			hmaster->max_bau_concurrent)) {
 		stat->s_throttles++;
 		do {
 			cpu_relax();
 		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 			&hmaster->active_descriptor_count,
-			hmaster->max_concurrent));
+			hmaster->max_bau_concurrent));
 	}
 
 	while (hmaster->uvhub_quiesce)
@@ -609,9 +621,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 			 * that case hardware immediately returns the ERROR
 			 * that looks like a destination timeout.
 			 */
-			udelay(TIMEOUT_DELAY);
+			udelay(bcp->plugged_delay);
 			bcp->plugged_tries++;
-			if (bcp->plugged_tries >= PLUGSB4RESET) {
+			if (bcp->plugged_tries >= bcp->plugsb4reset) {
 				bcp->plugged_tries = 0;
 				quiesce_local_uvhub(hmaster);
 				spin_lock(&hmaster->queue_lock);
@@ -623,10 +635,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 				stat->s_resets_plug++;
 			}
 		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
-			hmaster->max_concurrent = 1;
+			hmaster->max_bau_concurrent = 1;
 			bcp->timeout_tries++;
 			udelay(TIMEOUT_DELAY);
-			if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+			if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
 				bcp->timeout_tries = 0;
 				quiesce_local_uvhub(hmaster);
 				spin_lock(&hmaster->queue_lock);
@@ -638,7 +650,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 				stat->s_resets_timeout++;
 			}
 		}
-		if (bcp->ipi_attempts >= 3) {
+		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
 			bcp->ipi_attempts = 0;
 			completion_status = FLUSH_GIVEUP;
 			break;
@@ -648,9 +660,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 		 (completion_status == FLUSH_RETRY_TIMEOUT));
 	time2 = get_cycles();
 
-	if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
-	    && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
-			hmaster->max_concurrent++;
+	bcp->plugged_tries = 0;
+	bcp->timeout_tries = 0;
+
+	if ((completion_status == FLUSH_COMPLETE) &&
+	    (bcp->conseccompletes > bcp->complete_threshold) &&
+	    (hmaster->max_bau_concurrent <
+					hmaster->max_bau_concurrent_constant))
+			hmaster->max_bau_concurrent++;
 
 	/*
 	 * hold any cpu not timing out here; no other cpu currently held by
@@ -661,9 +678,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	atomic_dec(&hmaster->active_descriptor_count);
 
 	/* guard against cycles wrap */
-	if (time2 > time1)
-		stat->s_time += (time2 - time1);
-	else
+	if (time2 > time1) {
+		elapsed = time2 - time1;
+		stat->s_time += elapsed;
+	} else
 		stat->s_requestor--; /* don't count this one */
 	if (completion_status == FLUSH_COMPLETE && try > 1)
 		stat->s_retriesok++;
@@ -730,10 +748,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	struct ptc_stats *stat;
 	struct bau_control *bcp;
 
+	/* kernel was booted 'nobau' */
 	if (nobau)
 		return cpumask;
 
 	bcp = &per_cpu(bau_control, cpu);
+
 	/*
 	 * Each sending cpu has a per-cpu mask which it fills from the caller's
 	 * cpu mask.  Only remote cpus are converted to uvhubs and copied.
@@ -970,6 +990,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 			   stat->s_resets_plug, stat->s_resets_timeout,
 			   stat->s_giveup, stat->s_stimeout,
 			   stat->s_busy, stat->s_throttles);
+
 		/* destination side statistics */
 		seq_printf(file,
 			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
@@ -985,10 +1006,29 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 	return 0;
 }
 
+/*
+ * Display the tunables thru debugfs
+ */
+static ssize_t tunables_read(struct file *file, char __user *userbuf,
+						size_t count, loff_t *ppos)
+{
+	char buf[300];
+	int ret;
+
+	ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
+		"max_bau_concurrent plugged_delay plugsb4reset",
+		"timeoutsb4reset ipi_reset_limit complete_threshold",
+		"congested_response_us congested_reps congested_period",
+		max_bau_concurrent, plugged_delay, plugsb4reset,
+		timeoutsb4reset, ipi_reset_limit, complete_threshold,
+		congested_response_us, congested_reps, congested_period);
+
+	return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
+}
+
 /*
  * -1: resetf the statistics
  *  0: display meaning of the statistics
- * >0: maximum concurrent active descriptors per uvhub (throttle)
  */
 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 				 size_t count, loff_t *data)
@@ -997,7 +1037,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 	long input_arg;
 	char optstr[64];
 	struct ptc_stats *stat;
-	struct bau_control *bcp;
 
 	if (count == 0 || count > sizeof(optstr))
 		return -EINVAL;
@@ -1078,24 +1117,149 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 			stat = &per_cpu(ptcstats, cpu);
 			memset(stat, 0, sizeof(struct ptc_stats));
 		}
-	} else {
-		uv_bau_max_concurrent = input_arg;
-		bcp = &per_cpu(bau_control, smp_processor_id());
-		if (uv_bau_max_concurrent < 1 ||
-		    uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
-			printk(KERN_DEBUG
-				"Error: BAU max concurrent %d; %d is invalid\n",
-				bcp->max_concurrent, uv_bau_max_concurrent);
-			return -EINVAL;
-		}
-		printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
-		       uv_bau_max_concurrent);
-		for_each_present_cpu(cpu) {
-			bcp = &per_cpu(bau_control, cpu);
-			bcp->max_concurrent = uv_bau_max_concurrent;
+	}
+
+	return count;
+}
+
+static int local_atoi(const char *name)
+{
+	int val = 0;
+
+	for (;; name++) {
+		switch (*name) {
+		case '0' ... '9':
+			val = 10*val+(*name-'0');
+			break;
+		default:
+			return val;
 		}
 	}
+}
+
+/*
+ * set the tunables
+ * 0 values reset them to defaults
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+				 size_t count, loff_t *data)
+{
+	int cpu;
+	int cnt = 0;
+	int val;
+	char *p;
+	char *q;
+	char instr[64];
+	struct bau_control *bcp;
 
+	if (count == 0 || count > sizeof(instr)-1)
+		return -EINVAL;
+	if (copy_from_user(instr, user, count))
+		return -EFAULT;
+
+	instr[count] = '\0';
+	/* count the fields */
+	p = instr + strspn(instr, WHITESPACE);
+	q = p;
+	for (; *p; p = q + strspn(q, WHITESPACE)) {
+		q = p + strcspn(p, WHITESPACE);
+		cnt++;
+		if (q == p)
+			break;
+	}
+	if (cnt != 9) {
+		printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
+		return -EINVAL;
+	}
+
+	p = instr + strspn(instr, WHITESPACE);
+	q = p;
+	for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
+		q = p + strcspn(p, WHITESPACE);
+		val = local_atoi(p);
+		switch (cnt) {
+		case 0:
+			if (val == 0) {
+				max_bau_concurrent = MAX_BAU_CONCURRENT;
+				max_bau_concurrent_constant =
+							MAX_BAU_CONCURRENT;
+				continue;
+			}
+			bcp = &per_cpu(bau_control, smp_processor_id());
+			if (val < 1 || val > bcp->cpus_in_uvhub) {
+				printk(KERN_DEBUG
+				"Error: BAU max concurrent %d is invalid\n",
+				val);
+				return -EINVAL;
+			}
+			max_bau_concurrent = val;
+			max_bau_concurrent_constant = val;
+			continue;
+		case 1:
+			if (val == 0)
+				plugged_delay = PLUGGED_DELAY;
+			else
+				plugged_delay = val;
+			continue;
+		case 2:
+			if (val == 0)
+				plugsb4reset = PLUGSB4RESET;
+			else
+				plugsb4reset = val;
+			continue;
+		case 3:
+			if (val == 0)
+				timeoutsb4reset = TIMEOUTSB4RESET;
+			else
+				timeoutsb4reset = val;
+			continue;
+		case 4:
+			if (val == 0)
+				ipi_reset_limit = IPI_RESET_LIMIT;
+			else
+				ipi_reset_limit = val;
+			continue;
+		case 5:
+			if (val == 0)
+				complete_threshold = COMPLETE_THRESHOLD;
+			else
+				complete_threshold = val;
+			continue;
+		case 6:
+			if (val == 0)
+				congested_response_us = CONGESTED_RESPONSE_US;
+			else
+				congested_response_us = val;
+			continue;
+		case 7:
+			if (val == 0)
+				congested_reps = CONGESTED_REPS;
+			else
+				congested_reps = val;
+			continue;
+		case 8:
+			if (val == 0)
+				congested_period = CONGESTED_PERIOD;
+			else
+				congested_period = val;
+			continue;
+		}
+		if (q == p)
+			break;
+	}
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->max_bau_concurrent = max_bau_concurrent;
+		bcp->max_bau_concurrent_constant = max_bau_concurrent;
+		bcp->plugged_delay = plugged_delay;
+		bcp->plugsb4reset = plugsb4reset;
+		bcp->timeoutsb4reset = timeoutsb4reset;
+		bcp->ipi_reset_limit = ipi_reset_limit;
+		bcp->complete_threshold = complete_threshold;
+		bcp->congested_response_us = congested_response_us;
+		bcp->congested_reps = congested_reps;
+		bcp->congested_period = congested_period;
+	}
 	return count;
 }
 
@@ -1111,6 +1275,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file)
 	return seq_open(file, &uv_ptc_seq_ops);
 }
 
+static int tunables_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
 static const struct file_operations proc_uv_ptc_operations = {
 	.open		= uv_ptc_proc_open,
 	.read		= seq_read,
@@ -1119,6 +1288,12 @@ static const struct file_operations proc_uv_ptc_operations = {
 	.release	= seq_release,
 };
 
+static const struct file_operations tunables_fops = {
+	.open		= tunables_open,
+	.read		= tunables_read,
+	.write		= tunables_write,
+};
+
 static int __init uv_ptc_init(void)
 {
 	struct proc_dir_entry *proc_uv_ptc;
@@ -1133,6 +1308,20 @@ static int __init uv_ptc_init(void)
 		       UV_PTC_BASENAME);
 		return -EINVAL;
 	}
+
+	tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
+	if (!tunables_dir) {
+		printk(KERN_ERR "unable to create debugfs directory %s\n",
+		       UV_BAU_TUNABLES_DIR);
+		return -EINVAL;
+	}
+	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
+			tunables_dir, NULL, &tunables_fops);
+	if (!tunables_file) {
+		printk(KERN_ERR "unable to create debugfs file %s\n",
+		       UV_BAU_TUNABLES_FILE);
+		return -EINVAL;
+	}
 	return 0;
 }
 
@@ -1336,15 +1525,12 @@ static void uv_init_per_cpu(int nuvhubs)
 		bcp = &per_cpu(bau_control, cpu);
 		memset(bcp, 0, sizeof(struct bau_control));
 		spin_lock_init(&bcp->masks_lock);
-		bcp->max_concurrent = uv_bau_max_concurrent;
 		pnode = uv_cpu_hub_info(cpu)->pnode;
 		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
 		bdp = &uvhub_descs[uvhub];
 		bdp->num_cpus++;
 		bdp->uvhub = uvhub;
 		bdp->pnode = pnode;
-		/* time interval to catch a hardware stay-busy bug */
-		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
 		/* kludge: assume uv_hub.h is constant */
 		socket = (cpu_physical_id(cpu)>>5)&1;
 		if (socket >= bdp->num_sockets)
@@ -1380,6 +1566,21 @@ static void uv_init_per_cpu(int nuvhubs)
 		}
 	}
 	kfree(uvhub_descs);
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		/* time interval to catch a hardware stay-busy bug */
+		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
+		bcp->max_bau_concurrent = max_bau_concurrent;
+		bcp->max_bau_concurrent_constant = max_bau_concurrent;
+		bcp->plugged_delay = plugged_delay;
+		bcp->plugsb4reset = plugsb4reset;
+		bcp->timeoutsb4reset = timeoutsb4reset;
+		bcp->ipi_reset_limit = ipi_reset_limit;
+		bcp->complete_threshold = complete_threshold;
+		bcp->congested_response_us = congested_response_us;
+		bcp->congested_reps = congested_reps;
+		bcp->congested_period = congested_period;
+	}
 }
 
 /*
@@ -1404,7 +1605,7 @@ static int __init uv_bau_init(void)
 		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
 				       GFP_KERNEL, cpu_to_node(cur_cpu));
 
-	uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
+	max_bau_concurrent = MAX_BAU_CONCURRENT;
 	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
 	nuvhubs = uv_num_possible_blades();
@@ -1437,4 +1638,4 @@ static int __init uv_bau_init(void)
 	return 0;
 }
 core_initcall(uv_bau_init);
-core_initcall(uv_ptc_init);
+fs_initcall(uv_ptc_init);
-- 
cgit v1.2.3-70-g09d2


From 50fb55acc5bbe5ee29d0a65262f4ec286b14d156 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: Disable BAU on network congestion

The numalink network can become so congested that TLB shootdown
using the Broadcast Assist Unit becomes slower than using IPIs.

In that case, disable the use of the BAU for a period of time.
The period is tunable.  When the period expires the use of the
BAU is re-enabled. A count of these actions is added to the
statistics file.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004a4-0a@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  4 +++
 arch/x86/kernel/tlb_uv.c         | 76 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 77 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index e5543c1a80c..9b3e750ef2d 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -34,6 +34,7 @@
  */
 
 #define UV_ITEMS_PER_DESCRIPTOR		8
+/* the 'throttle' to prevent the hardware stay-busy bug */
 #define MAX_BAU_CONCURRENT		3
 #define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
@@ -338,6 +339,7 @@ struct bau_control {
 	int timeout_tries;
 	int ipi_attempts;
 	int conseccompletes;
+	int baudisabled;
 	int set_bau_off;
 	short cpu;
 	short uvhub_cpu;
@@ -389,6 +391,8 @@ struct ptc_stats {
 	unsigned long s_busy; /* status stayed busy past s/w timer */
 	unsigned long s_throttles; /* waits in throttle */
 	unsigned long s_retry_messages; /* retry broadcasts */
+	unsigned long s_bau_reenabled; /* for bau enable/disable */
+	unsigned long s_bau_disabled; /* for bau enable/disable */
 	/* destination statistics */
 	unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
 	unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index c8661779c51..dc6a6831275 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -44,6 +44,9 @@ static int timeout_base_ns[] = {
 };
 static int timeout_us;
 static int nobau;
+static int baudisabled;
+static spinlock_t disable_lock;
+static cycles_t congested_cycles;
 
 /* tunables: */
 static int max_bau_concurrent = MAX_BAU_CONCURRENT;
@@ -519,6 +522,35 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
 	return 1;
 }
 
+/*
+ * Completions are taking a very long time due to a congested numalink
+ * network.
+ */
+static void
+disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
+{
+	int tcpu;
+	struct bau_control *tbcp;
+
+	/* let only one cpu do this disabling */
+	spin_lock(&disable_lock);
+	if (!baudisabled && bcp->period_requests &&
+	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
+		/* it becomes this cpu's job to turn on the use of the
+		   BAU again */
+		baudisabled = 1;
+		bcp->set_bau_off = 1;
+		bcp->set_bau_on_time = get_cycles() +
+			sec_2_cycles(bcp->congested_period);
+		stat->s_bau_disabled++;
+		for_each_present_cpu(tcpu) {
+			tbcp = &per_cpu(bau_control, tcpu);
+				tbcp->baudisabled = 1;
+		}
+	}
+	spin_unlock(&disable_lock);
+}
+
 /**
  * uv_flush_send_and_wait
  *
@@ -681,6 +713,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	if (time2 > time1) {
 		elapsed = time2 - time1;
 		stat->s_time += elapsed;
+		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+			bcp->period_requests++;
+			bcp->period_time += elapsed;
+			if ((elapsed > congested_cycles) &&
+			    (bcp->period_requests > bcp->congested_reps)) {
+				disable_for_congestion(bcp, stat);
+			}
+		}
 	} else
 		stat->s_requestor--; /* don't count this one */
 	if (completion_status == FLUSH_COMPLETE && try > 1)
@@ -747,12 +787,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	struct cpumask *flush_mask;
 	struct ptc_stats *stat;
 	struct bau_control *bcp;
+	struct bau_control *tbcp;
 
 	/* kernel was booted 'nobau' */
 	if (nobau)
 		return cpumask;
 
 	bcp = &per_cpu(bau_control, cpu);
+	stat = &per_cpu(ptcstats, cpu);
+
+	/* bau was disabled due to slow response */
+	if (bcp->baudisabled) {
+		/* the cpu that disabled it must re-enable it */
+		if (bcp->set_bau_off) {
+			if (get_cycles() >= bcp->set_bau_on_time) {
+				stat->s_bau_reenabled++;
+				baudisabled = 0;
+				for_each_present_cpu(tcpu) {
+					tbcp = &per_cpu(bau_control, tcpu);
+					tbcp->baudisabled = 0;
+					tbcp->period_requests = 0;
+					tbcp->period_time = 0;
+				}
+			}
+		}
+		return cpumask;
+	}
 
 	/*
 	 * Each sending cpu has a per-cpu mask which it fills from the caller's
@@ -793,7 +853,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 		else
 			return NULL;
 	}
-	stat = &per_cpu(ptcstats, cpu);
 	stat->s_requestor++;
 	stat->s_ntargcpu += remotes;
 	remotes = bau_uvhub_weight(&bau_desc->distribution);
@@ -973,7 +1032,9 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 		seq_printf(file,
 			"sw_ack recv rtime all ");
 		seq_printf(file,
-			"one mult none retry canc nocan reset rcan\n");
+			"one mult none retry canc nocan reset rcan ");
+		seq_printf(file,
+			"disable enable\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
@@ -993,7 +1054,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 
 		/* destination side statistics */
 		seq_printf(file,
-			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
+			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
 					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
 			   stat->d_requestee, cycles_2_us(stat->d_time),
@@ -1001,6 +1062,8 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
 			   stat->d_nocanceled, stat->d_resets,
 			   stat->d_rcanceled);
+		seq_printf(file, "%ld %ld\n",
+			stat->s_bau_disabled, stat->s_bau_reenabled);
 	}
 
 	return 0;
@@ -1112,6 +1175,10 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 		"reset:    number of ipi-style reset requests processed\n");
 		printk(KERN_DEBUG
 		"rcan:     number messages canceled by reset requests\n");
+		printk(KERN_DEBUG
+		"disable:  number times use of the BAU was disabled\n");
+		printk(KERN_DEBUG
+		"enable:   number times use of the BAU was re-enabled\n");
 	} else if (input_arg == -1) {
 		for_each_present_cpu(cpu) {
 			stat = &per_cpu(ptcstats, cpu);
@@ -1568,6 +1635,7 @@ static void uv_init_per_cpu(int nuvhubs)
 	kfree(uvhub_descs);
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
+		bcp->baudisabled = 0;
 		/* time interval to catch a hardware stay-busy bug */
 		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
 		bcp->max_bau_concurrent = max_bau_concurrent;
@@ -1609,6 +1677,8 @@ static int __init uv_bau_init(void)
 	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
 	nuvhubs = uv_num_possible_blades();
+	spin_lock_init(&disable_lock);
+	congested_cycles = microsec_2_cycles(congested_response_us);
 
 	uv_init_per_cpu(nuvhubs);
 
-- 
cgit v1.2.3-70-g09d2


From 712157aa703a01f58c7c17452096ab00b774d0a9 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: Shorten access to BAU statistics structure

Use a pointer from the per-cpu BAU control structure to the
per-cpu BAU statistics structure.
We nearly always know the first before needing the second.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004aB-2k@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  1 +
 arch/x86/kernel/tlb_uv.c         | 16 ++++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b3e750ef2d..6a42d42eb8f 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -332,6 +332,7 @@ struct bau_control {
 	struct bau_payload_queue_entry *bau_msg_head;
 	struct bau_control *uvhub_master;
 	struct bau_control *socket_master;
+	struct ptc_stats *statp;
 	unsigned long timeout_interval;
 	unsigned long set_bau_on_time;
 	atomic_t active_descriptor_count;
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index dc6a6831275..261b9653cde 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -153,7 +153,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
 	struct ptc_stats *stat;
 
 	msg = mdp->msg;
-	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat = bcp->statp;
 	stat->d_retries++;
 	/*
 	 * cancel any message from msg+1 to the retry itself
@@ -217,7 +217,7 @@ static void uv_bau_process_message(struct msg_desc *mdp,
 	 * This must be a normal message, or retry of a normal message
 	 */
 	msg = mdp->msg;
-	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat = bcp->statp;
 	if (msg->address == TLB_FLUSH_ALL) {
 		local_flush_tlb();
 		stat->d_alltlb++;
@@ -301,7 +301,7 @@ uv_do_reset(void *ptr)
 
 	bcp = &per_cpu(bau_control, smp_processor_id());
 	rap = (struct reset_args *)ptr;
-	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat = bcp->statp;
 	stat->d_resets++;
 
 	/*
@@ -419,7 +419,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
 	unsigned long mask;
 	cycles_t ttime;
 	cycles_t timeout_time;
-	struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+	struct ptc_stats *stat = bcp->statp;
 	struct bau_control *hmaster;
 
 	hmaster = bcp->uvhub_master;
@@ -583,7 +583,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	cycles_t time1;
 	cycles_t time2;
 	cycles_t elapsed;
-	struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+	struct ptc_stats *stat = bcp->statp;
 	struct bau_control *smaster = bcp->socket_master;
 	struct bau_control *hmaster = bcp->uvhub_master;
 
@@ -794,7 +794,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 		return cpumask;
 
 	bcp = &per_cpu(bau_control, cpu);
-	stat = &per_cpu(ptcstats, cpu);
+	stat = bcp->statp;
 
 	/* bau was disabled due to slow response */
 	if (bcp->baudisabled) {
@@ -903,7 +903,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 
 	time_start = get_cycles();
 	bcp = &per_cpu(bau_control, smp_processor_id());
-	stat = &per_cpu(ptcstats, smp_processor_id());
+	stat = bcp->statp;
 	msgdesc.va_queue_first = bcp->va_queue_first;
 	msgdesc.va_queue_last = bcp->va_queue_last;
 	msg = bcp->bau_msg_head;
@@ -1636,6 +1636,7 @@ static void uv_init_per_cpu(int nuvhubs)
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
 		bcp->baudisabled = 0;
+		bcp->statp = &per_cpu(ptcstats, cpu);
 		/* time interval to catch a hardware stay-busy bug */
 		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
 		bcp->max_bau_concurrent = max_bau_concurrent;
@@ -1673,7 +1674,6 @@ static int __init uv_bau_init(void)
 		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
 				       GFP_KERNEL, cpu_to_node(cur_cpu));
 
-	max_bau_concurrent = MAX_BAU_CONCURRENT;
 	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
 	nuvhubs = uv_num_possible_blades();
-- 
cgit v1.2.3-70-g09d2


From 4faca1550838708d71f6eea14cdacb0876c3a5a4 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: BAU structure rearranging

Move some structure definitions from the C code to the BAU
header file, and change the organization of that header file a
little.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004aI-54@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h | 90 +++++++++++++++++++++++-----------------
 arch/x86/kernel/tlb_uv.c         | 12 ------
 2 files changed, 51 insertions(+), 51 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 6a42d42eb8f..1c8f1e9bf74 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -322,6 +322,57 @@ struct bau_payload_queue_entry {
 	/* bytes 24-31 */
 };
 
+struct msg_desc {
+	struct bau_payload_queue_entry *msg;
+	int msg_slot;
+	int sw_ack_slot;
+	struct bau_payload_queue_entry *va_queue_first;
+	struct bau_payload_queue_entry *va_queue_last;
+};
+
+struct reset_args {
+	int sender;
+};
+
+/*
+ * This structure is allocated per_cpu for UV TLB shootdown statistics.
+ */
+struct ptc_stats {
+	/* sender statistics */
+	unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
+	unsigned long s_requestor; /* number of shootdown requests */
+	unsigned long s_stimeout; /* source side timeouts */
+	unsigned long s_dtimeout; /* destination side timeouts */
+	unsigned long s_time; /* time spent in sending side */
+	unsigned long s_retriesok; /* successful retries */
+	unsigned long s_ntargcpu; /* total number of cpu's targeted */
+	unsigned long s_ntarguvhub; /* total number of uvhubs targeted */
+	unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/
+	unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
+	unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */
+	unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */
+	unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */
+	unsigned long s_resets_plug; /* ipi-style resets from plug state */
+	unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
+	unsigned long s_busy; /* status stayed busy past s/w timer */
+	unsigned long s_throttles; /* waits in throttle */
+	unsigned long s_retry_messages; /* retry broadcasts */
+	unsigned long s_bau_reenabled; /* for bau enable/disable */
+	unsigned long s_bau_disabled; /* for bau enable/disable */
+	/* destination statistics */
+	unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
+	unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
+	unsigned long d_multmsg; /* interrupts with multiple messages */
+	unsigned long d_nomsg; /* interrupts with no message */
+	unsigned long d_time; /* time spent on destination side */
+	unsigned long d_requestee; /* number of messages processed */
+	unsigned long d_retries; /* number of retry messages processed */
+	unsigned long d_canceled; /* number of messages canceled by retries */
+	unsigned long d_nocanceled; /* retries that found nothing to cancel */
+	unsigned long d_resets; /* number of ipi-style requests processed */
+	unsigned long d_rcanceled; /* number of messages canceled by resets */
+};
+
 /*
  * one per-cpu; to locate the software tables
  */
@@ -369,45 +420,6 @@ struct bau_control {
 	long period_requests;
 };
 
-/*
- * This structure is allocated per_cpu for UV TLB shootdown statistics.
- */
-struct ptc_stats {
-	/* sender statistics */
-	unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
-	unsigned long s_requestor; /* number of shootdown requests */
-	unsigned long s_stimeout; /* source side timeouts */
-	unsigned long s_dtimeout; /* destination side timeouts */
-	unsigned long s_time; /* time spent in sending side */
-	unsigned long s_retriesok; /* successful retries */
-	unsigned long s_ntargcpu; /* number of cpus targeted */
-	unsigned long s_ntarguvhub; /* number of uvhubs targeted */
-	unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */
-	unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */
-	unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */
-	unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */
-	unsigned long s_ntarguvhub1; /* number of times == 1 target hub */
-	unsigned long s_resets_plug; /* ipi-style resets from plug state */
-	unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
-	unsigned long s_busy; /* status stayed busy past s/w timer */
-	unsigned long s_throttles; /* waits in throttle */
-	unsigned long s_retry_messages; /* retry broadcasts */
-	unsigned long s_bau_reenabled; /* for bau enable/disable */
-	unsigned long s_bau_disabled; /* for bau enable/disable */
-	/* destination statistics */
-	unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
-	unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
-	unsigned long d_multmsg; /* interrupts with multiple messages */
-	unsigned long d_nomsg; /* interrupts with no message */
-	unsigned long d_time; /* time spent on destination side */
-	unsigned long d_requestee; /* number of messages processed */
-	unsigned long d_retries; /* number of retry messages processed */
-	unsigned long d_canceled; /* number of messages canceled by retries */
-	unsigned long d_nocanceled; /* retries that found nothing to cancel */
-	unsigned long d_resets; /* number of ipi-style requests processed */
-	unsigned long d_rcanceled; /* number of messages canceled by resets */
-};
-
 static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
 {
 	return constant_test_bit(uvhub, &dstp->bits[0]);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 261b9653cde..d7592903984 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -23,14 +23,6 @@
 #include <asm/irq_vectors.h>
 #include <asm/timer.h>
 
-struct msg_desc {
-	struct bau_payload_queue_entry *msg;
-	int msg_slot;
-	int sw_ack_slot;
-	struct bau_payload_queue_entry *va_queue_first;
-	struct bau_payload_queue_entry *va_queue_last;
-};
-
 /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
 static int timeout_base_ns[] = {
 		20,
@@ -79,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
 static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
 
-struct reset_args {
-	int sender;
-};
-
 /*
  * Determine the first node on a uvhub. 'Nodes' are used for kernel
  * memory allocation.
-- 
cgit v1.2.3-70-g09d2


From 39847e7f3c8198b14102fe7ba4b3a6a1d84bbcfe Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: Correct BAU software acknowledge

Correct the acknowledgment and the reset of a BAU
software-acknowledged message.

A retry message should be testing only for timed-out resources
(mask << 8), and we delete a log message that might cause
unnecessary concern. The acknowledge MMR is laid out as
|--timed-out--|---pending--|, where each field is 8 bits.

The IPI-driven reset of software acknowledge resources frees
both timed out and pending resources.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004aP-7O@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d7592903984..295a41122da 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -161,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
 			slot2 = msg2 - mdp->va_queue_first;
 			mmr = uv_read_local_mmr
 				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
-			msg_res = ((msg2->sw_ack_vector << 8) |
-				   msg2->sw_ack_vector);
+			msg_res = msg2->sw_ack_vector;
 			/*
 			 * This is a message retry; clear the resources held
 			 * by the previous message only if they timed out.
 			 * If it has not timed out we have an unexpected
 			 * situation to report.
 			 */
-			if (mmr & (msg_res << 8)) {
+			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
 				/*
 				 * is the resource timed out?
 				 * make everyone ignore the cancelled message.
@@ -179,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
 				cancel_count++;
 				uv_write_local_mmr(
 				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
-					(msg_res << 8) | msg_res);
-			} else
-				printk(KERN_INFO "note bau retry: no effect\n");
+					(msg_res << UV_SW_ACK_NPENDING) |
+					 msg_res);
+			}
 		}
 	}
 	if (!cancel_count)
@@ -317,13 +316,13 @@ uv_do_reset(void *ptr)
 			 */
 			mmr = uv_read_local_mmr
 					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
-			msg_res = ((msg->sw_ack_vector << 8) |
-						   msg->sw_ack_vector);
+			msg_res = msg->sw_ack_vector;
 			if (mmr & msg_res) {
 				stat->d_rcanceled++;
 				uv_write_local_mmr(
 				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
-							msg_res);
+					(msg_res << UV_SW_ACK_NPENDING) |
+					 msg_res);
 			}
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From a8328ee58c15c9d763a67607a35bb987b38950fa Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: Correct BAU discovery of hubs and sockets

Correct the initialization-time assumption of contiguous blade
numbers and of sockets numbered from zero.

There may be hubs present with no cpu's enabled.
There may be disabled sockets such that the active socket is not
number zero.

And assign a 'socket master' by assuming that a socket is a
node. (it is not safe to extract socket number from an apicid)

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004aW-9S@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 49 ++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 295a41122da..ab929e97650 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1547,11 +1547,13 @@ calculate_destination_timeout(void)
  */
 static void uv_init_per_cpu(int nuvhubs)
 {
-	int i, j, k;
+	int i;
 	int cpu;
 	int pnode;
 	int uvhub;
 	short socket = 0;
+	unsigned short socket_mask;
+	unsigned int uvhub_mask;
 	struct bau_control *bcp;
 	struct uvhub_desc *bdp;
 	struct socket_desc *sdp;
@@ -1562,7 +1564,7 @@ static void uv_init_per_cpu(int nuvhubs)
 		short cpu_number[16];
 	};
 	struct uvhub_desc {
-		short num_sockets;
+		unsigned short socket_mask;
 		short num_cpus;
 		short uvhub;
 		short pnode;
@@ -1581,43 +1583,54 @@ static void uv_init_per_cpu(int nuvhubs)
 		spin_lock_init(&bcp->masks_lock);
 		pnode = uv_cpu_hub_info(cpu)->pnode;
 		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+		uvhub_mask |= (1 << uvhub);
 		bdp = &uvhub_descs[uvhub];
 		bdp->num_cpus++;
 		bdp->uvhub = uvhub;
 		bdp->pnode = pnode;
-		/* kludge: assume uv_hub.h is constant */
-		socket = (cpu_physical_id(cpu)>>5)&1;
-		if (socket >= bdp->num_sockets)
-			bdp->num_sockets = socket+1;
+		/* kludge: 'assuming' one node per socket, and assuming that
+		   disabling a socket just leaves a gap in node numbers */
+		socket = (cpu_to_node(cpu) & 1);;
+		bdp->socket_mask |= (1 << socket);
 		sdp = &bdp->socket[socket];
 		sdp->cpu_number[sdp->num_cpus] = cpu;
 		sdp->num_cpus++;
 	}
-	socket = 0;
-	for_each_possible_blade(uvhub) {
+	uvhub = 0;
+	while (uvhub_mask) {
+		if (!(uvhub_mask & 1))
+			goto nexthub;
 		bdp = &uvhub_descs[uvhub];
-		for (i = 0; i < bdp->num_sockets; i++) {
-			sdp = &bdp->socket[i];
-			for (j = 0; j < sdp->num_cpus; j++) {
-				cpu = sdp->cpu_number[j];
+		socket_mask = bdp->socket_mask;
+		socket = 0;
+		while (socket_mask) {
+			if (!(socket_mask & 1))
+				goto nextsocket;
+			sdp = &bdp->socket[socket];
+			for (i = 0; i < sdp->num_cpus; i++) {
+				cpu = sdp->cpu_number[i];
 				bcp = &per_cpu(bau_control, cpu);
 				bcp->cpu = cpu;
-				if (j == 0) {
+				if (i == 0) {
 					smaster = bcp;
-					if (i == 0)
+					if (socket == 0)
 						hmaster = bcp;
 				}
 				bcp->cpus_in_uvhub = bdp->num_cpus;
 				bcp->cpus_in_socket = sdp->num_cpus;
 				bcp->socket_master = smaster;
+				bcp->uvhub = bdp->uvhub;
 				bcp->uvhub_master = hmaster;
-				for (k = 0; k < DEST_Q_SIZE; k++)
-					bcp->socket_acknowledge_count[k] = 0;
-				bcp->uvhub_cpu =
-				  uv_cpu_hub_info(cpu)->blade_processor_id;
+				bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
+						blade_processor_id;
 			}
+nextsocket:
 			socket++;
+			socket_mask = (socket_mask >> 1);
 		}
+nexthub:
+		uvhub++;
+		uvhub_mask = (uvhub_mask >> 1);
 	}
 	kfree(uvhub_descs);
 	for_each_present_cpu(cpu) {
-- 
cgit v1.2.3-70-g09d2


From 90cc7d944981a6d06b49bb26fde1b490e28c90e5 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: Remove BAU check for stay-busy

Remove a faulty assumption that a long running BAU request has
encountered a hardware problem and will never finish.

Numalink congestion can make a request appear to have
encountered such a problem, but it is not safe to cancel the
request.  If such a cancel is done but a reply is later received
we can miss a TLB shootdown.

We depend upon the max_bau_concurrent 'throttle' to prevent the
stay-busy case from happening.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004ad-BV@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  1 -
 arch/x86/kernel/tlb_uv.c         | 23 -----------------------
 2 files changed, 24 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 1c8f1e9bf74..c19b870ea58 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -402,7 +402,6 @@ struct bau_control {
 	unsigned short uvhub_quiesce;
 	short socket_acknowledge_count[DEST_Q_SIZE];
 	cycles_t send_message;
-	spinlock_t masks_lock;
 	spinlock_t uvhub_lock;
 	spinlock_t queue_lock;
 	/* tunables */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ab929e97650..dc962b5ac87 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -405,12 +405,10 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
 	unsigned long mmr;
 	unsigned long mask;
 	cycles_t ttime;
-	cycles_t timeout_time;
 	struct ptc_stats *stat = bcp->statp;
 	struct bau_control *hmaster;
 
 	hmaster = bcp->uvhub_master;
-	timeout_time = get_cycles() + bcp->timeout_interval;
 
 	/* spin on the status MMR, waiting for it to go idle */
 	while ((descriptor_status = (((unsigned long)
@@ -450,26 +448,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
 			 * descriptor_status is still BUSY
 			 */
 			cpu_relax();
-			relaxes++;
-			if (relaxes >= 10000) {
-				relaxes = 0;
-				if (get_cycles() > timeout_time) {
-					quiesce_local_uvhub(hmaster);
-
-					/* single-thread the register change */
-					spin_lock(&hmaster->masks_lock);
-					mmr = uv_read_local_mmr(mmr_offset);
-					mask = 0UL;
-					mask |= (3UL < right_shift);
-					mask = ~mask;
-					mmr &= mask;
-					uv_write_local_mmr(mmr_offset, mmr);
-					spin_unlock(&hmaster->masks_lock);
-					end_uvhub_quiesce(hmaster);
-					stat->s_busy++;
-					return FLUSH_GIVEUP;
-				}
-			}
 		}
 	}
 	bcp->conseccompletes++;
@@ -1580,7 +1558,6 @@ static void uv_init_per_cpu(int nuvhubs)
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
 		memset(bcp, 0, sizeof(struct bau_control));
-		spin_lock_init(&bcp->masks_lock);
 		pnode = uv_cpu_hub_info(cpu)->pnode;
 		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
 		uvhub_mask |= (1 << uvhub);
-- 
cgit v1.2.3-70-g09d2


From 7fba1bcd4844a4a8619a03bf51cabc92aea365a8 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: Correct BAU regular message type

The Broadcast Assist Unit messages have a regular or retry
message type. The regular type was not being set, but needs to
be, because the lack of a message type is sometimes used to
identify an unused entry in the message queue.
Also removing some excess comments.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004ak-Dy@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index dc962b5ac87..4cb14dbd7fa 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -580,23 +580,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	}
 	time1 = get_cycles();
 	do {
-		/*
-		 * Every message from any given cpu gets a unique message
-		 * sequence number. But retries use that same number.
-		 * Our message may have timed out at the destination because
-		 * all sw-ack resources are in use and there is a timeout
-		 * pending there.  In that case, our last send never got
-		 * placed into the queue and we need to persist until it
-		 * does.
-		 *
-		 * Make any retry a type MSG_RETRY so that the destination will
-		 * free any resource held by a previous message from this cpu.
-		 */
 		if (try == 0) {
-			/* use message type set by the caller the first time */
+			bau_desc->header.msg_type = MSG_REGULAR;
 			seq_number = bcp->message_number++;
 		} else {
-			/* use RETRY type on all the rest; same sequence */
 			bau_desc->header.msg_type = MSG_RETRY;
 			stat->s_retry_messages++;
 		}
-- 
cgit v1.2.3-70-g09d2


From 450a007eebaf430426ea8f89bbc3f287949905b2 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: BAU broadcast to the local hub

Make the Broadcast Assist Unit driver use the BAU for TLB
shootdowns of cpu's on the local uvhub.

It was previously thought that IPI might be faster to the cpu's
on the local hub.  But the IPI operation would have to follow
the completion of the BAU broadcast anyway.  So we broadcast to
the local uvhub in all cases except when the current cpu was the
only local cpu in the mask.

This simplifies uv_flush_send_and_wait() in that it returns
either all shootdowns complete, or none.

Adjust the statistics to account for shootdowns on the local
uvhub.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004aq-G7@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |   5 ++
 arch/x86/kernel/tlb_uv.c         | 138 +++++++++++++++------------------------
 2 files changed, 58 insertions(+), 85 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index c19b870ea58..7f6ea611cb7 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -346,6 +346,11 @@ struct ptc_stats {
 	unsigned long s_time; /* time spent in sending side */
 	unsigned long s_retriesok; /* successful retries */
 	unsigned long s_ntargcpu; /* total number of cpu's targeted */
+	unsigned long s_ntargself; /* times the sending cpu was targeted */
+	unsigned long s_ntarglocals; /* targets of cpus on the local blade */
+	unsigned long s_ntargremotes; /* targets of cpus on remote blades */
+	unsigned long s_ntarglocaluvhub; /* targets of the local hub */
+	unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
 	unsigned long s_ntarguvhub; /* total number of uvhubs targeted */
 	unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/
 	unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 4cb14dbd7fa..a1615058fad 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -400,10 +400,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
 	unsigned long mmr_offset, int right_shift, int this_cpu,
 	struct bau_control *bcp, struct bau_control *smaster, long try)
 {
-	int relaxes = 0;
 	unsigned long descriptor_status;
-	unsigned long mmr;
-	unsigned long mask;
 	cycles_t ttime;
 	struct ptc_stats *stat = bcp->statp;
 	struct bau_control *hmaster;
@@ -524,25 +521,19 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
  * The flush_mask contains the cpus the broadcast is to be sent to, plus
  * cpus that are on the local uvhub.
  *
- * Returns NULL if all flushing represented in the mask was done. The mask
- * is zeroed.
- * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set, representing any cpus on the local
- * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
+ * Returns 0 if all flushing represented in the mask was done.
+ * Returns 1 if it gives up entirely and the original cpu mask is to be
+ * returned to the kernel.
  */
-const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
-					     struct cpumask *flush_mask,
-					     struct bau_control *bcp)
+int uv_flush_send_and_wait(struct bau_desc *bau_desc,
+			   struct cpumask *flush_mask, struct bau_control *bcp)
 {
 	int right_shift;
-	int uvhub;
-	int bit;
 	int completion_status = 0;
 	int seq_number = 0;
 	long try = 0;
 	int cpu = bcp->uvhub_cpu;
 	int this_cpu = bcp->cpu;
-	int this_uvhub = bcp->uvhub;
 	unsigned long mmr_offset;
 	unsigned long index;
 	cycles_t time1;
@@ -552,10 +543,6 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	struct bau_control *smaster = bcp->socket_master;
 	struct bau_control *hmaster = bcp->uvhub_master;
 
-	/*
-	 * Spin here while there are hmaster->max_bau_concurrent or more active
-	 * descriptors. This is the per-uvhub 'throttle'.
-	 */
 	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 			&hmaster->active_descriptor_count,
 			hmaster->max_bau_concurrent)) {
@@ -591,9 +578,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
 			bcp->uvhub_cpu;
 		bcp->send_message = get_cycles();
-
 		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
-
 		try++;
 		completion_status = uv_wait_completion(bau_desc, mmr_offset,
 			right_shift, this_cpu, bcp, smaster, try);
@@ -652,16 +637,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	    (hmaster->max_bau_concurrent <
 					hmaster->max_bau_concurrent_constant))
 			hmaster->max_bau_concurrent++;
-
-	/*
-	 * hold any cpu not timing out here; no other cpu currently held by
-	 * the 'throttle' should enter the activation code
-	 */
 	while (hmaster->uvhub_quiesce)
 		cpu_relax();
 	atomic_dec(&hmaster->active_descriptor_count);
-
-	/* guard against cycles wrap */
 	if (time2 > time1) {
 		elapsed = time2 - time1;
 		stat->s_time += elapsed;
@@ -674,32 +652,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
 			}
 		}
 	} else
-		stat->s_requestor--; /* don't count this one */
+		stat->s_requestor--;
 	if (completion_status == FLUSH_COMPLETE && try > 1)
 		stat->s_retriesok++;
 	else if (completion_status == FLUSH_GIVEUP) {
-		/*
-		 * Cause the caller to do an IPI-style TLB shootdown on
-		 * the target cpu's, all of which are still in the mask.
-		 */
 		stat->s_giveup++;
-		return flush_mask;
+		return 1;
 	}
-
-	/*
-	 * Success, so clear the remote cpu's from the mask so we don't
-	 * use the IPI method of shootdown on them.
-	 */
-	for_each_cpu(bit, flush_mask) {
-		uvhub = uv_cpu_to_blade_id(bit);
-		if (uvhub == this_uvhub)
-			continue;
-		cpumask_clear_cpu(bit, flush_mask);
-	}
-	if (!cpumask_empty(flush_mask))
-		return flush_mask;
-
-	return NULL;
+	return 0;
 }
 
 /**
@@ -731,10 +691,11 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 					  struct mm_struct *mm,
 					  unsigned long va, unsigned int cpu)
 {
-	int remotes;
 	int tcpu;
 	int uvhub;
 	int locals = 0;
+	int remotes = 0;
+	int hubs = 0;
 	struct bau_desc *bau_desc;
 	struct cpumask *flush_mask;
 	struct ptc_stats *stat;
@@ -768,54 +729,52 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 
 	/*
 	 * Each sending cpu has a per-cpu mask which it fills from the caller's
-	 * cpu mask.  Only remote cpus are converted to uvhubs and copied.
+	 * cpu mask.  All cpus are converted to uvhubs and copied to the
+	 * activation descriptor.
 	 */
 	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
-	/*
-	 * copy cpumask to flush_mask, removing current cpu
-	 * (current cpu should already have been flushed by the caller and
-	 *  should never be returned if we return flush_mask)
-	 */
+	/* don't actually do a shootdown of the local cpu */
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 	if (cpu_isset(cpu, *cpumask))
-		locals++;  /* current cpu was targeted */
+		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
 	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
 
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
-	remotes = 0;
+
+	/* cpu statistics */
 	for_each_cpu(tcpu, flush_mask) {
 		uvhub = uv_cpu_to_blade_id(tcpu);
-		if (uvhub == bcp->uvhub) {
-			locals++;
-			continue;
-		}
 		bau_uvhub_set(uvhub, &bau_desc->distribution);
-		remotes++;
-	}
-	if (remotes == 0) {
-		/*
-		 * No off_hub flushing; return status for local hub.
-		 * Return the caller's mask if all were local (the current
-		 * cpu may be in that mask).
-		 */
-		if (locals)
-			return cpumask;
+		if (uvhub == bcp->uvhub)
+			locals++;
 		else
-			return NULL;
+			remotes++;
 	}
+	if ((locals + remotes) == 0)
+		return NULL;
 	stat->s_requestor++;
-	stat->s_ntargcpu += remotes;
+	stat->s_ntargcpu += remotes + locals;
+	stat->s_ntargremotes += remotes;
+	stat->s_ntarglocals += locals;
 	remotes = bau_uvhub_weight(&bau_desc->distribution);
-	stat->s_ntarguvhub += remotes;
-	if (remotes >= 16)
+
+	/* uvhub statistics */
+	hubs = bau_uvhub_weight(&bau_desc->distribution);
+	if (locals) {
+		stat->s_ntarglocaluvhub++;
+		stat->s_ntargremoteuvhub += (hubs - 1);
+	} else
+		stat->s_ntargremoteuvhub += hubs;
+	stat->s_ntarguvhub += hubs;
+	if (hubs >= 16)
 		stat->s_ntarguvhub16++;
-	else if (remotes >= 8)
+	else if (hubs >= 8)
 		stat->s_ntarguvhub8++;
-	else if (remotes >= 4)
+	else if (hubs >= 4)
 		stat->s_ntarguvhub4++;
-	else if (remotes >= 2)
+	else if (hubs >= 2)
 		stat->s_ntarguvhub2++;
 	else
 		stat->s_ntarguvhub1++;
@@ -824,10 +783,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	bau_desc->payload.sending_cpu = cpu;
 
 	/*
-	 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
-	 * the adjusted flush_mask if any cpu's were not messaged.
+	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
+	 * or 1 if it gave up and the original cpumask should be returned.
 	 */
-	return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
+	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+		return NULL;
+	else
+		return cpumask;
 }
 
 /*
@@ -976,9 +938,11 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 
 	if (!cpu) {
 		seq_printf(file,
-			"# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
+			"# cpu sent stime self locals remotes ncpus localhub ");
+		seq_printf(file,
+			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-			"numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+			"numuvhubs4 numuvhubs2 numuvhubs1 dto ");
 		seq_printf(file,
 			"retries rok resetp resett giveup sto bz throt ");
 		seq_printf(file,
@@ -994,10 +958,14 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 		seq_printf(file,
 			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
-			   stat->s_ntarguvhub, stat->s_ntarguvhub16,
+			   stat->s_ntargself, stat->s_ntarglocals,
+			   stat->s_ntargremotes, stat->s_ntargcpu,
+			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
+			   stat->s_ntarguvhub, stat->s_ntarguvhub16);
+		seq_printf(file, "%ld %ld %ld %ld %ld ",
 			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
 			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
-			   stat->s_ntargcpu, stat->s_dtimeout);
+			   stat->s_dtimeout);
 		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
 			   stat->s_retry_messages, stat->s_retriesok,
 			   stat->s_resets_plug, stat->s_resets_timeout,
-- 
cgit v1.2.3-70-g09d2


From f6d8a56693426b1f29ff5cafda8be0d65e4e1870 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 2 Jun 2010 16:22:02 -0500
Subject: x86, UV: Modularize BAU send and wait

Streamline the large uv_flush_send_and_wait() function by use of
a couple of helper functions.

And remove some excess comments.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004ay-IH@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  1 -
 arch/x86/kernel/tlb_uv.c         | 82 +++++++++++++++++++++-------------------
 2 files changed, 44 insertions(+), 39 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 7f6ea611cb7..42d412fd8b0 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -75,7 +75,6 @@
 #define DESC_STATUS_DESTINATION_TIMEOUT	2
 #define DESC_STATUS_SOURCE_TIMEOUT	3
 
-#define TIMEOUT_DELAY			10
 /*
  * delay for 'plugged' timeout retries, in microseconds
  */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index a1615058fad..abf3c31f14c 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -484,6 +484,47 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
 	return 1;
 }
 
+/*
+ * Our retries are blocked by all destination swack resources being
+ * in use, and a timeout is pending. In that case hardware immediately
+ * returns the ERROR that looks like a destination timeout.
+ */
+static void
+destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
+			struct bau_control *hmaster, struct ptc_stats *stat)
+{
+	udelay(bcp->plugged_delay);
+	bcp->plugged_tries++;
+	if (bcp->plugged_tries >= bcp->plugsb4reset) {
+		bcp->plugged_tries = 0;
+		quiesce_local_uvhub(hmaster);
+		spin_lock(&hmaster->queue_lock);
+		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+		spin_unlock(&hmaster->queue_lock);
+		end_uvhub_quiesce(hmaster);
+		bcp->ipi_attempts++;
+		stat->s_resets_plug++;
+	}
+}
+
+static void
+destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
+			struct bau_control *hmaster, struct ptc_stats *stat)
+{
+	hmaster->max_bau_concurrent = 1;
+	bcp->timeout_tries++;
+	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
+		bcp->timeout_tries = 0;
+		quiesce_local_uvhub(hmaster);
+		spin_lock(&hmaster->queue_lock);
+		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+		spin_unlock(&hmaster->queue_lock);
+		end_uvhub_quiesce(hmaster);
+		bcp->ipi_attempts++;
+		stat->s_resets_timeout++;
+	}
+}
+
 /*
  * Completions are taking a very long time due to a congested numalink
  * network.
@@ -518,7 +559,7 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
  *
  * Send a broadcast and wait for it to complete.
  *
- * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * The flush_mask contains the cpus the broadcast is to be sent to including
  * cpus that are on the local uvhub.
  *
  * Returns 0 if all flushing represented in the mask was done.
@@ -553,7 +594,6 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 			&hmaster->active_descriptor_count,
 			hmaster->max_bau_concurrent));
 	}
-
 	while (hmaster->uvhub_quiesce)
 		cpu_relax();
 
@@ -584,40 +624,9 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 			right_shift, this_cpu, bcp, smaster, try);
 
 		if (completion_status == FLUSH_RETRY_PLUGGED) {
-			/*
-			 * Our retries may be blocked by all destination swack
-			 * resources being consumed, and a timeout pending. In
-			 * that case hardware immediately returns the ERROR
-			 * that looks like a destination timeout.
-			 */
-			udelay(bcp->plugged_delay);
-			bcp->plugged_tries++;
-			if (bcp->plugged_tries >= bcp->plugsb4reset) {
-				bcp->plugged_tries = 0;
-				quiesce_local_uvhub(hmaster);
-				spin_lock(&hmaster->queue_lock);
-				uv_reset_with_ipi(&bau_desc->distribution,
-							this_cpu);
-				spin_unlock(&hmaster->queue_lock);
-				end_uvhub_quiesce(hmaster);
-				bcp->ipi_attempts++;
-				stat->s_resets_plug++;
-			}
+			destination_plugged(bau_desc, bcp, hmaster, stat);
 		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
-			hmaster->max_bau_concurrent = 1;
-			bcp->timeout_tries++;
-			udelay(TIMEOUT_DELAY);
-			if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
-				bcp->timeout_tries = 0;
-				quiesce_local_uvhub(hmaster);
-				spin_lock(&hmaster->queue_lock);
-				uv_reset_with_ipi(&bau_desc->distribution,
-								this_cpu);
-				spin_unlock(&hmaster->queue_lock);
-				end_uvhub_quiesce(hmaster);
-				bcp->ipi_attempts++;
-				stat->s_resets_timeout++;
-			}
+			destination_timeout(bau_desc, bcp, hmaster, stat);
 		}
 		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
 			bcp->ipi_attempts = 0;
@@ -628,10 +637,8 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
 		 (completion_status == FLUSH_RETRY_TIMEOUT));
 	time2 = get_cycles();
-
 	bcp->plugged_tries = 0;
 	bcp->timeout_tries = 0;
-
 	if ((completion_status == FLUSH_COMPLETE) &&
 	    (bcp->conseccompletes > bcp->complete_threshold) &&
 	    (hmaster->max_bau_concurrent <
@@ -740,7 +747,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 
 	bau_desc = bcp->descriptor_base;
 	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
-
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 
 	/* cpu statistics */
-- 
cgit v1.2.3-70-g09d2


From c9cf4dbb4d9ca715d8fedf13301a53296429abc6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 19 May 2010 21:35:17 +0200
Subject: x86: Unify dumpstack.h and stacktrace.h

arch/x86/include/asm/stacktrace.h and arch/x86/kernel/dumpstack.h
declare headers of objects that deal with the same topic.
Actually, most of the files that include stacktrace.h also include
dumpstack.h.

Although dumpstack.h seems more reserved for internals of stack
traces, those are quite often needed to define specialized stack
trace operations. And perf event arch headers are going to need
access to such low level operations anyway. So stop bothering
with a separate dumpstack.h, as it is no longer about isolated
deep internals.

v2: fix struct stack_frame definition conflict in sysprof

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Soeren Sandmann <sandmann@daimi.au.dk>
---
 arch/x86/include/asm/stacktrace.h | 52 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event.c  |  2 --
 arch/x86/kernel/dumpstack.c       |  1 -
 arch/x86/kernel/dumpstack.h       | 56 ---------------------------------------
 arch/x86/kernel/dumpstack_32.c    |  2 --
 arch/x86/kernel/dumpstack_64.c    |  1 -
 arch/x86/kernel/stacktrace.c      |  7 ++---
 kernel/trace/trace_sysprof.c      |  7 ++---
 8 files changed, 60 insertions(+), 68 deletions(-)
 delete mode 100644 arch/x86/kernel/dumpstack.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 4dab78edbad..a957463d3c7 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -1,6 +1,13 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
 #ifndef _ASM_X86_STACKTRACE_H
 #define _ASM_X86_STACKTRACE_H
 
+#include <linux/uaccess.h>
+
 extern int kstack_depth_to_print;
 
 struct thread_info;
@@ -42,4 +49,49 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data);
 
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+	struct stack_frame *next_frame;
+	unsigned long return_address;
+};
+
+struct stack_frame_ia32 {
+    u32 next_frame;
+    u32 return_address;
+};
+
+static inline unsigned long rewind_frame_pointer(int n)
+{
+	struct stack_frame *frame;
+
+	get_bp(frame);
+
+#ifdef CONFIG_FRAME_POINTER
+	while (n--) {
+		if (probe_kernel_address(&frame->next_frame, frame))
+			break;
+	}
+#endif
+
+	return (unsigned long)frame;
+}
+
 #endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c77586061bc..9632fb61e8f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1585,8 +1585,6 @@ static const struct stacktrace_ops backtrace_ops = {
 	.walk_stack		= print_context_stack_bp,
 };
 
-#include "../dumpstack.h"
-
 static void
 perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c89a386930b..6e8752c1bd5 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
 
 #include <asm/stacktrace.h>
 
-#include "dumpstack.h"
 
 int panic_on_unrecovered_nmi;
 int panic_on_io_nmi;
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd4..00000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- */
-
-#ifndef DUMPSTACK_H
-#define DUMPSTACK_H
-
-#ifdef CONFIG_X86_32
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-#else
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-#endif
-
-#include <linux/uaccess.h>
-
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp, char *log_lvl);
-
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *sp, unsigned long bp, char *log_lvl);
-
-extern unsigned int code_bytes;
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-	struct stack_frame *next_frame;
-	unsigned long return_address;
-};
-
-struct stack_frame_ia32 {
-    u32 next_frame;
-    u32 return_address;
-};
-
-static inline unsigned long rewind_frame_pointer(int n)
-{
-	struct stack_frame *frame;
-
-	get_bp(frame);
-
-#ifdef CONFIG_FRAME_POINTER
-	while (n--) {
-		if (probe_kernel_address(&frame->next_frame, frame))
-			break;
-	}
-#endif
-
-	return (unsigned long)frame;
-}
-
-#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d9..0f6376ffa2d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
 
 #include <asm/stacktrace.h>
 
-#include "dumpstack.h"
-
 
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f..57a21f11c79 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
 
 #include <asm/stacktrace.h>
 
-#include "dumpstack.h"
 
 #define N_EXCEPTION_STACKS_END \
 		(N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6..ea54d029fe2 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
 /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
 
-struct stack_frame {
+struct stack_frame_user {
 	const void __user	*next_fp;
 	unsigned long		ret_addr;
 };
 
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+static int
+copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
 {
 	int ret;
 
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
 		trace->entries[trace->nr_entries++] = regs->ip;
 
 	while (trace->nr_entries < trace->max_entries) {
-		struct stack_frame frame;
+		struct stack_frame_user frame;
 
 		frame.next_fp = NULL;
 		frame.ret_addr = 0;
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index a7974a552ca..c080956f4d8 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -33,12 +33,13 @@ static DEFINE_MUTEX(sample_timer_lock);
  */
 static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
 
-struct stack_frame {
+struct stack_frame_user {
 	const void __user	*next_fp;
 	unsigned long		return_address;
 };
 
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+static int
+copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
 {
 	int ret;
 
@@ -125,7 +126,7 @@ trace_kernel(struct pt_regs *regs, struct trace_array *tr,
 static void timer_notify(struct pt_regs *regs, int cpu)
 {
 	struct trace_array_cpu *data;
-	struct stack_frame frame;
+	struct stack_frame_user frame;
 	struct trace_array *tr;
 	const void __user *fp;
 	int is_user;
-- 
cgit v1.2.3-70-g09d2


From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 20 May 2010 07:47:21 +0200
Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller

Drop this argument now that we always want to rewind only to the
state of the first caller.
It means frame pointers are not necessary anymore to reliably get
the source of an event. But this also means we need this helper
to be a macro now, as an inline function is not an option since
we need to know when to provide a default implementation.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: David Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/powerpc/include/asm/perf_event.h | 12 ++++++++++++
 arch/powerpc/kernel/misc.S            | 26 --------------------------
 arch/sparc/include/asm/perf_event.h   |  8 ++++++++
 arch/sparc/kernel/helpers.S           |  6 +++---
 arch/x86/include/asm/perf_event.h     | 13 +++++++++++++
 arch/x86/include/asm/stacktrace.h     |  7 ++-----
 arch/x86/kernel/cpu/perf_event.c      | 16 ----------------
 include/linux/perf_event.h            | 32 +++++++-------------------------
 include/trace/ftrace.h                |  2 +-
 kernel/perf_event.c                   |  5 -----
 kernel/trace/trace_event_perf.c       |  2 --
 11 files changed, 46 insertions(+), 83 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
index e6d4ce69b12..5c16b891d50 100644
--- a/arch/powerpc/include/asm/perf_event.h
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -21,3 +21,15 @@
 #ifdef CONFIG_FSL_EMB_PERF_EVENT
 #include <asm/perf_event_fsl_emb.h>
 #endif
+
+#ifdef CONFIG_PERF_EVENTS
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+
+#define perf_arch_fetch_caller_regs(regs, __ip)			\
+	do {							\
+		(regs)->nip = __ip;				\
+		(regs)->gpr[1] = *(unsigned long *)__get_SP();	\
+		asm volatile("mfmsr %0" : "=r" ((regs)->msr));	\
+	} while (0)
+#endif
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 22e507c8a55..2d29752cbe1 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -127,29 +127,3 @@ _GLOBAL(__setup_cpu_power7)
 _GLOBAL(__restore_cpu_power7)
 	/* place holder */
 	blr
-
-/*
- * Get a minimal set of registers for our caller's nth caller.
- * r3 = regs pointer, r5 = n.
- *
- * We only get R1 (stack pointer), NIP (next instruction pointer)
- * and LR (link register).  These are all we can get in the
- * general case without doing complicated stack unwinding, but
- * fortunately they are enough to do a stack backtrace, which
- * is all we need them for.
- */
-_GLOBAL(perf_arch_fetch_caller_regs)
-	mr	r6,r1
-	cmpwi	r5,0
-	mflr	r4
-	ble	2f
-	mtctr	r5
-1:	PPC_LL	r6,0(r6)
-	bdnz	1b
-	PPC_LL	r4,PPC_LR_STKOFF(r6)
-2:	PPC_LL	r7,0(r6)
-	PPC_LL	r7,PPC_LR_STKOFF(r7)
-	PPC_STL	r6,GPR1-STACK_FRAME_OVERHEAD(r3)
-	PPC_STL	r4,_NIP-STACK_FRAME_OVERHEAD(r3)
-	PPC_STL	r7,_LINK-STACK_FRAME_OVERHEAD(r3)
-	blr
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h
index 7e2669894ce..74c4e0cd889 100644
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -6,7 +6,15 @@ extern void set_perf_event_pending(void);
 #define	PERF_EVENT_INDEX_OFFSET	0
 
 #ifdef CONFIG_PERF_EVENTS
+#include <asm/ptrace.h>
+
 extern void init_hw_perf_events(void);
+
+extern void
+__perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
+
+#define perf_arch_fetch_caller_regs(pt_regs, ip)	\
+	__perf_arch_fetch_caller_regs(pt_regs, ip, 1);
 #else
 static inline void init_hw_perf_events(void)	{ }
 #endif
diff --git a/arch/sparc/kernel/helpers.S b/arch/sparc/kernel/helpers.S
index 92090cc9e82..682fee06a16 100644
--- a/arch/sparc/kernel/helpers.S
+++ b/arch/sparc/kernel/helpers.S
@@ -47,9 +47,9 @@ stack_trace_flush:
 	.size		stack_trace_flush,.-stack_trace_flush
 
 #ifdef CONFIG_PERF_EVENTS
-	.globl		perf_arch_fetch_caller_regs
-	.type		perf_arch_fetch_caller_regs,#function
-perf_arch_fetch_caller_regs:
+	.globl		__perf_arch_fetch_caller_regs
+	.type		__perf_arch_fetch_caller_regs,#function
+__perf_arch_fetch_caller_regs:
 	/* We always read the %pstate into %o5 since we will use
 	 * that to construct a fake %tstate to store into the regs.
 	 */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 254883d0c7e..02de29830ff 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -140,6 +140,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)	perf_misc_flags(regs)
 
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip)		{	\
+	(regs)->ip = (__ip);					\
+	(regs)->bp = caller_frame_pointer();			\
+	(regs)->cs = __KERNEL_CS;				\
+	regs->flags = 0;					\
+}
+
 #else
 static inline void init_hw_perf_events(void)		{ }
 static inline void perf_events_lapic_init(void)	{ }
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index a957463d3c7..2b16a2ad23d 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -78,17 +78,14 @@ struct stack_frame_ia32 {
     u32 return_address;
 };
 
-static inline unsigned long rewind_frame_pointer(int n)
+static inline unsigned long caller_frame_pointer(void)
 {
 	struct stack_frame *frame;
 
 	get_bp(frame);
 
 #ifdef CONFIG_FRAME_POINTER
-	while (n--) {
-		if (probe_kernel_address(&frame->next_frame, frame))
-			break;
-	}
+	frame = frame->next_frame;
 #endif
 
 	return (unsigned long)frame;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9632fb61e8f..2c075fe573d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return entry;
 }
 
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-	regs->ip = ip;
-	/*
-	 * perf_arch_fetch_caller_regs adds another call, we need to increment
-	 * the skip level
-	 */
-	regs->bp = rewind_frame_pointer(skip + 1);
-	regs->cs = __KERNEL_CS;
-	/*
-	 * We abuse bit 3 to pass exact information, see perf_misc_flags
-	 * and the comment with PERF_EFLAGS_EXACT.
-	 */
-	regs->flags = 0;
-}
-
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
 	unsigned long ip;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb6c91eac7e..bea785cef49 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -905,8 +905,10 @@ extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
 
-extern void
-perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
+#ifndef perf_arch_fetch_caller_regs
+static inline void
+perf_arch_fetch_caller_regs(struct regs *regs, unsigned long ip) { }
+#endif
 
 /*
  * Take a snapshot of the regs. Skip ip and frame pointer to
@@ -916,31 +918,11 @@ perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
  * - bp for callchains
  * - eflags, for future purposes, just in case
  */
-static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip)
+static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-	unsigned long ip;
-
 	memset(regs, 0, sizeof(*regs));
 
-	switch (skip) {
-	case 1 :
-		ip = CALLER_ADDR0;
-		break;
-	case 2 :
-		ip = CALLER_ADDR1;
-		break;
-	case 3 :
-		ip = CALLER_ADDR2;
-		break;
-	case 4:
-		ip = CALLER_ADDR3;
-		break;
-	/* No need to support further for now */
-	default:
-		ip = 0;
-	}
-
-	return perf_arch_fetch_caller_regs(regs, ip, skip);
+	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
 static inline void
@@ -950,7 +932,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 		struct pt_regs hot_regs;
 
 		if (!regs) {
-			perf_fetch_caller_regs(&hot_regs, 1);
+			perf_fetch_caller_regs(&hot_regs);
 			regs = &hot_regs;
 		}
 		__perf_sw_event(event_id, nr, nmi, regs, addr);
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 3d685d1f2a0..8ee8b6e6b25 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -705,7 +705,7 @@ perf_trace_##call(void *__data, proto)					\
 	int __data_size;						\
 	int rctx;							\
 									\
-	perf_fetch_caller_regs(&__regs, 1);				\
+	perf_fetch_caller_regs(&__regs);				\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e099650cd24..9ae4dbcdf46 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2851,11 +2851,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return NULL;
 }
 
-__weak
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-}
-
 
 /*
  * We assume there is only KVM supporting the callbacks.
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cb6f365016e..21db1d3a48d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,8 +9,6 @@
 #include <linux/kprobes.h>
 #include "trace.h"
 
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
-
 static char *perf_trace_buf[4];
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 25 May 2010 17:49:05 +0200
Subject: perf: Cleanup {start,commit,cancel}_txn details

Clarify some of the transactional group scheduling API details
and change it so that a successful ->commit_txn also closes
the transaction.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1274803086.5882.1752.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_event.c |  7 ++++---
 arch/sparc/kernel/perf_event.c   |  7 ++++---
 arch/x86/kernel/cpu/perf_event.c | 14 +++++---------
 include/linux/perf_event.h       | 27 ++++++++++++++++++++++-----
 kernel/perf_event.c              |  9 +--------
 5 files changed, 36 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 43b83c35cf5..ac2a8c2554d 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -754,7 +754,7 @@ static int power_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuhw->group_flag & PERF_EVENT_TXN)
 		goto nocheck;
 
 	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
@@ -858,7 +858,7 @@ void power_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag |= PERF_EVENT_TXN;
 	cpuhw->n_txn_start = cpuhw->n_events;
 }
 
@@ -871,7 +871,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 }
 
 /*
@@ -897,6 +897,7 @@ int power_pmu_commit_txn(const struct pmu *pmu)
 	for (i = cpuhw->n_txn_start; i < n; ++i)
 		cpuhw->event[i]->hw.config = cpuhw->events[i];
 
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 	return 0;
 }
 
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 0ec92c8861d..beeb92fa3ac 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1005,7 +1005,7 @@ static int sparc_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto nocheck;
 
 	if (check_excludes(cpuc->event, n0, 1))
@@ -1102,7 +1102,7 @@ static void sparc_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag |= PERF_EVENT_TXN;
 }
 
 /*
@@ -1114,7 +1114,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 }
 
 /*
@@ -1137,6 +1137,7 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu)
 	if (sparc_check_constraints(cpuc->event, cpuc->events, n))
 		return -EAGAIN;
 
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d65a1..af04c6fa59c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto out;
 
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event)
 	 * The events never got scheduled and ->cancel_txn will truncate
 	 * the event_list.
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		return;
 
 	x86_pmu_stop(event);
@@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag |= PERF_EVENT_TXN;
 	cpuc->n_txn = 0;
 }
 
@@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 	/*
 	 * Truncate the collected events.
 	 */
@@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
-	/*
-	 * Clear out the txn count so that ->cancel_txn() which gets
-	 * run after ->commit_txn() doesn't undo things.
-	 */
-	cpuc->n_txn = 0;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 
 	return 0;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 36efad90cd4..f1b6ba0770e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -549,7 +549,10 @@ struct hw_perf_event {
 
 struct perf_event;
 
-#define PERF_EVENT_TXN_STARTED 1
+/*
+ * Common implementation detail of pmu::{start,commit,cancel}_txn
+ */
+#define PERF_EVENT_TXN 0x1
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -563,14 +566,28 @@ struct pmu {
 	void (*unthrottle)		(struct perf_event *event);
 
 	/*
-	 * group events scheduling is treated as a transaction,
-	 * add group events as a whole and perform one schedulability test.
-	 * If test fails, roll back the whole group
+	 * Group events scheduling is treated as a transaction, add group
+	 * events as a whole and perform one schedulability test. If the test
+	 * fails, roll back the whole group
 	 */
 
+	/*
+	 * Start the transaction, after this ->enable() doesn't need
+	 * to do schedulability tests.
+	 */
 	void (*start_txn)	(const struct pmu *pmu);
-	void (*cancel_txn)	(const struct pmu *pmu);
+	/*
+	 * If ->start_txn() disabled the ->enable() schedulability test
+	 * then ->commit_txn() is required to perform one. On success
+	 * the transaction is closed. On error the transaction is kept
+	 * open until ->cancel_txn() is called.
+	 */
 	int  (*commit_txn)	(const struct pmu *pmu);
+	/*
+	 * Will cancel the transaction, assumes ->disable() is called for
+	 * each successfull ->enable() during the transaction.
+	 */
+	void (*cancel_txn)	(const struct pmu *pmu);
 };
 
 /**
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 227ed9c8ec3..6f60920772b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event,
 	struct perf_event *event, *partial_group = NULL;
 	const struct pmu *pmu = group_event->pmu;
 	bool txn = false;
-	int ret;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
@@ -703,15 +702,9 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!txn)
+	if (!txn || !pmu->commit_txn(pmu))
 		return 0;
 
-	ret = pmu->commit_txn(pmu);
-	if (!ret) {
-		pmu->cancel_txn(pmu);
-		return 0;
-	}
-
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
-- 
cgit v1.2.3-70-g09d2


From 68aa00ac0a82e9a876c799bf6be7622b8f1c8517 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 3 Jun 2010 01:23:04 +0400
Subject: perf, x86: Make a second write to performance counter if needed

On Netburst PMU we need a second write to a performance counter
due to cpu erratum.

A simple flag test was chosen instead of alternative instructions
because wrmsrl is already a macro and, if virtualization is turned
on, would need an additional wrapper call, which is more expensive.

NB: we should probably switch to jump labels as soon as that facility
reaches mainline.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100602212304.GC5264@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c    | 12 +++++++++++-
 arch/x86/kernel/cpu/perf_event_p4.c |  9 +++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index af04c6fa59c..79e199843db 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -220,6 +220,7 @@ struct x86_pmu {
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
 	void		(*quirks)(void);
+	int		perfctr_second_write;
 
 	int		(*cpu_prepare)(int cpu);
 	void		(*cpu_starting)(int cpu);
@@ -925,8 +926,17 @@ x86_perf_event_set_period(struct perf_event *event)
 	 */
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
-	wrmsrl(hwc->event_base + idx,
+	wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
+
+	/*
+	 * Due to erratum on certan cpu we need
+	 * a second write to be sure the register
+	 * is updated properly
+	 */
+	if (x86_pmu.perfctr_second_write) {
+		wrmsrl(hwc->event_base + idx,
 			(u64)(-left) & x86_pmu.cntval_mask);
+	}
 
 	perf_event_update_userpage(event);
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ae85d69644d..9286e736a70 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -829,6 +829,15 @@ static __initconst const struct x86_pmu p4_pmu = {
 	.max_period		= (1ULL << 39) - 1,
 	.hw_config		= p4_hw_config,
 	.schedule_events	= p4_pmu_schedule_events,
+	/*
+	 * This handles erratum N15 in intel doc 249199-029,
+	 * the counter may not be updated correctly on write
+	 * so we need a second write operation to do the trick
+	 * (the official workaround didn't work)
+	 *
+	 * the former idea is taken from OProfile code
+	 */
+	.perfctr_second_write	= 1,
 };
 
 static __init int p4_pmu_init(void)
-- 
cgit v1.2.3-70-g09d2


From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 21 May 2010 14:43:08 +0200
Subject: perf: Convert perf_event to local_t

Since all modifications to event->count (and ->prev_count
and ->period_left) are now local to a cpu, change them to local64_t so we
avoid the LOCK'ed ops.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/perf_event.c     | 18 ++++++++---------
 arch/powerpc/kernel/perf_event.c | 34 ++++++++++++++++----------------
 arch/sh/kernel/perf_event.c      |  6 +++---
 arch/sparc/kernel/perf_event.c   | 18 ++++++++---------
 arch/x86/kernel/cpu/perf_event.c | 18 ++++++++---------
 include/linux/perf_event.h       |  7 ++++---
 kernel/perf_event.c              | 42 ++++++++++++++++++++--------------------
 7 files changed, 72 insertions(+), 71 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index c45768614c8..5b7cfafc072 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -164,20 +164,20 @@ armpmu_event_set_period(struct perf_event *event,
 			struct hw_perf_event *hwc,
 			int idx)
 {
-	s64 left = atomic64_read(&hwc->period_left);
+	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
 	int ret = 0;
 
 	if (unlikely(left <= -period)) {
 		left = period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
@@ -185,7 +185,7 @@ armpmu_event_set_period(struct perf_event *event,
 	if (left > (s64)armpmu->max_period)
 		left = armpmu->max_period;
 
-	atomic64_set(&hwc->prev_count, (u64)-left);
+	local64_set(&hwc->prev_count, (u64)-left);
 
 	armpmu->write_counter(idx, (u64)(-left) & 0xffffffff);
 
@@ -204,18 +204,18 @@ armpmu_event_update(struct perf_event *event,
 	s64 delta;
 
 again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
+	prev_raw_count = local64_read(&hwc->prev_count);
 	new_raw_count = armpmu->read_counter(idx);
 
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	atomic64_add(delta, &event->count);
-	atomic64_sub(delta, &hwc->period_left);
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
 
 	return new_raw_count;
 }
@@ -478,7 +478,7 @@ __hw_perf_event_init(struct perf_event *event)
 	if (!hwc->sample_period) {
 		hwc->sample_period  = armpmu->max_period;
 		hwc->last_period    = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
+		local64_set(&hwc->period_left, hwc->sample_period);
 	}
 
 	err = 0;
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index ac2a8c2554d..af1d9a7c65d 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -410,15 +410,15 @@ static void power_pmu_read(struct perf_event *event)
 	 * Therefore we treat them like NMIs.
 	 */
 	do {
-		prev = atomic64_read(&event->hw.prev_count);
+		prev = local64_read(&event->hw.prev_count);
 		barrier();
 		val = read_pmc(event->hw.idx);
-	} while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
+	} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
 
 	/* The counters are only 32 bits wide */
 	delta = (val - prev) & 0xfffffffful;
-	atomic64_add(delta, &event->count);
-	atomic64_sub(delta, &event->hw.period_left);
+	local64_add(delta, &event->count);
+	local64_sub(delta, &event->hw.period_left);
 }
 
 /*
@@ -444,10 +444,10 @@ static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
 		if (!event->hw.idx)
 			continue;
 		val = (event->hw.idx == 5) ? pmc5 : pmc6;
-		prev = atomic64_read(&event->hw.prev_count);
+		prev = local64_read(&event->hw.prev_count);
 		event->hw.idx = 0;
 		delta = (val - prev) & 0xfffffffful;
-		atomic64_add(delta, &event->count);
+		local64_add(delta, &event->count);
 	}
 }
 
@@ -462,7 +462,7 @@ static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
 		event = cpuhw->limited_counter[i];
 		event->hw.idx = cpuhw->limited_hwidx[i];
 		val = (event->hw.idx == 5) ? pmc5 : pmc6;
-		atomic64_set(&event->hw.prev_count, val);
+		local64_set(&event->hw.prev_count, val);
 		perf_event_update_userpage(event);
 	}
 }
@@ -666,11 +666,11 @@ void hw_perf_enable(void)
 		}
 		val = 0;
 		if (event->hw.sample_period) {
-			left = atomic64_read(&event->hw.period_left);
+			left = local64_read(&event->hw.period_left);
 			if (left < 0x80000000L)
 				val = 0x80000000L - left;
 		}
-		atomic64_set(&event->hw.prev_count, val);
+		local64_set(&event->hw.prev_count, val);
 		event->hw.idx = idx;
 		write_pmc(idx, val);
 		perf_event_update_userpage(event);
@@ -842,8 +842,8 @@ static void power_pmu_unthrottle(struct perf_event *event)
 	if (left < 0x80000000L)
 		val = 0x80000000L - left;
 	write_pmc(event->hw.idx, val);
-	atomic64_set(&event->hw.prev_count, val);
-	atomic64_set(&event->hw.period_left, left);
+	local64_set(&event->hw.prev_count, val);
+	local64_set(&event->hw.period_left, left);
 	perf_event_update_userpage(event);
 	perf_enable();
 	local_irq_restore(flags);
@@ -1109,7 +1109,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	event->hw.config = events[n];
 	event->hw.event_base = cflags[n];
 	event->hw.last_period = event->hw.sample_period;
-	atomic64_set(&event->hw.period_left, event->hw.last_period);
+	local64_set(&event->hw.period_left, event->hw.last_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -1147,16 +1147,16 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	int record = 0;
 
 	/* we don't have to worry about interrupts here */
-	prev = atomic64_read(&event->hw.prev_count);
+	prev = local64_read(&event->hw.prev_count);
 	delta = (val - prev) & 0xfffffffful;
-	atomic64_add(delta, &event->count);
+	local64_add(delta, &event->count);
 
 	/*
 	 * See if the total period for this event has expired,
 	 * and update for the next period.
 	 */
 	val = 0;
-	left = atomic64_read(&event->hw.period_left) - delta;
+	left = local64_read(&event->hw.period_left) - delta;
 	if (period) {
 		if (left <= 0) {
 			left += period;
@@ -1194,8 +1194,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	}
 
 	write_pmc(event->hw.idx, val);
-	atomic64_set(&event->hw.prev_count, val);
-	atomic64_set(&event->hw.period_left, left);
+	local64_set(&event->hw.prev_count, val);
+	local64_set(&event->hw.period_left, left);
 	perf_event_update_userpage(event);
 }
 
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 81b6de41ae5..7a3dc356725 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -185,10 +185,10 @@ static void sh_perf_event_update(struct perf_event *event,
 	 * this is the simplest approach for maintaining consistency.
 	 */
 again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
+	prev_raw_count = local64_read(&hwc->prev_count);
 	new_raw_count = sh_pmu->read(idx);
 
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
@@ -203,7 +203,7 @@ again:
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	atomic64_add(delta, &event->count);
+	local64_add(delta, &event->count);
 }
 
 static void sh_pmu_disable(struct perf_event *event)
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index beeb92fa3ac..8a6660da8e0 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -572,18 +572,18 @@ static u64 sparc_perf_event_update(struct perf_event *event,
 	s64 delta;
 
 again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
+	prev_raw_count = local64_read(&hwc->prev_count);
 	new_raw_count = read_pmc(idx);
 
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	atomic64_add(delta, &event->count);
-	atomic64_sub(delta, &hwc->period_left);
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
 
 	return new_raw_count;
 }
@@ -591,27 +591,27 @@ again:
 static int sparc_perf_event_set_period(struct perf_event *event,
 				       struct hw_perf_event *hwc, int idx)
 {
-	s64 left = atomic64_read(&hwc->period_left);
+	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
 	int ret = 0;
 
 	if (unlikely(left <= -period)) {
 		left = period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
 	if (left > MAX_PERIOD)
 		left = MAX_PERIOD;
 
-	atomic64_set(&hwc->prev_count, (u64)-left);
+	local64_set(&hwc->prev_count, (u64)-left);
 
 	write_pmc(idx, (u64)(-left) & 0xffffffff);
 
@@ -1087,7 +1087,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if (!hwc->sample_period) {
 		hwc->sample_period = MAX_PERIOD;
 		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
+		local64_set(&hwc->period_left, hwc->sample_period);
 	}
 
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 79e199843db..2d0d2906927 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event)
 	 * count to the generic event atomically:
 	 */
 again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
+	prev_raw_count = local64_read(&hwc->prev_count);
 	rdmsrl(hwc->event_base + idx, new_raw_count);
 
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 					new_raw_count) != prev_raw_count)
 		goto again;
 
@@ -314,8 +314,8 @@ again:
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	atomic64_add(delta, &event->count);
-	atomic64_sub(delta, &hwc->period_left);
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
 
 	return new_raw_count;
 }
@@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event)
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
+		local64_set(&hwc->period_left, hwc->sample_period);
 	} else {
 		/*
 		 * If we have a PMU initialized but no APIC
@@ -886,7 +886,7 @@ static int
 x86_perf_event_set_period(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	s64 left = atomic64_read(&hwc->period_left);
+	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
 	int ret = 0, idx = hwc->idx;
 
@@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event)
 	 */
 	if (unlikely(left <= -period)) {
 		left = period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
@@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	 * The hw event starts counting from this event offset,
 	 * mark it to be able to extra future deltas:
 	 */
-	atomic64_set(&hwc->prev_count, (u64)-left);
+	local64_set(&hwc->prev_count, (u64)-left);
 
 	wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f34dab9b275..7342979f95f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -487,6 +487,7 @@ struct perf_guest_info_callbacks {
 #include <linux/cpu.h>
 #include <asm/atomic.h>
 #include <asm/local.h>
+#include <asm/local64.h>
 
 #define PERF_MAX_STACK_DEPTH		255
 
@@ -536,10 +537,10 @@ struct hw_perf_event {
 		struct arch_hw_breakpoint	info;
 #endif
 	};
-	atomic64_t			prev_count;
+	local64_t			prev_count;
 	u64				sample_period;
 	u64				last_period;
-	atomic64_t			period_left;
+	local64_t			period_left;
 	u64				interrupts;
 
 	u64				freq_time_stamp;
@@ -670,7 +671,7 @@ struct perf_event {
 
 	enum perf_event_active_state	state;
 	unsigned int			attach_state;
-	atomic64_t			count;
+	local64_t			count;
 	atomic64_t			child_count;
 
 	/*
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a395fda2d94..97c73018592 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1148,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
 	 * In order to keep per-task stats reliable we need to flip the event
 	 * values when we flip the contexts.
 	 */
-	value = atomic64_read(&next_event->count);
-	value = atomic64_xchg(&event->count, value);
-	atomic64_set(&next_event->count, value);
+	value = local64_read(&next_event->count);
+	value = local64_xchg(&event->count, value);
+	local64_set(&next_event->count, value);
 
 	swap(event->total_time_enabled, next_event->total_time_enabled);
 	swap(event->total_time_running, next_event->total_time_running);
@@ -1540,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 
 	hwc->sample_period = sample_period;
 
-	if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+	if (local64_read(&hwc->period_left) > 8*sample_period) {
 		perf_disable();
 		perf_event_stop(event);
-		atomic64_set(&hwc->period_left, 0);
+		local64_set(&hwc->period_left, 0);
 		perf_event_start(event);
 		perf_enable();
 	}
@@ -1584,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 
 		perf_disable();
 		event->pmu->read(event);
-		now = atomic64_read(&event->count);
+		now = local64_read(&event->count);
 		delta = now - hwc->freq_count_stamp;
 		hwc->freq_count_stamp = now;
 
@@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info)
 
 static inline u64 perf_event_count(struct perf_event *event)
 {
-	return atomic64_read(&event->count) + atomic64_read(&event->child_count);
+	return local64_read(&event->count) + atomic64_read(&event->child_count);
 }
 
 static u64 perf_event_read(struct perf_event *event)
@@ -2141,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 static void perf_event_reset(struct perf_event *event)
 {
 	(void)perf_event_read(event);
-	atomic64_set(&event->count, 0);
+	local64_set(&event->count, 0);
 	perf_event_update_userpage(event);
 }
 
@@ -2359,7 +2359,7 @@ void perf_event_update_userpage(struct perf_event *event)
 	userpg->index = perf_event_index(event);
 	userpg->offset = perf_event_count(event);
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		userpg->offset -= atomic64_read(&event->hw.prev_count);
+		userpg->offset -= local64_read(&event->hw.prev_count);
 
 	userpg->time_enabled = event->total_time_enabled +
 			atomic64_read(&event->child_total_time_enabled);
@@ -4035,14 +4035,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
 	hwc->last_period = hwc->sample_period;
 
 again:
-	old = val = atomic64_read(&hwc->period_left);
+	old = val = local64_read(&hwc->period_left);
 	if (val < 0)
 		return 0;
 
 	nr = div64_u64(period + val, period);
 	offset = nr * period;
 	val -= offset;
-	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
 		goto again;
 
 	return nr;
@@ -4081,7 +4081,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	atomic64_add(nr, &event->count);
+	local64_add(nr, &event->count);
 
 	if (!regs)
 		return;
@@ -4092,7 +4092,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
 		return perf_swevent_overflow(event, 1, nmi, data, regs);
 
-	if (atomic64_add_negative(nr, &hwc->period_left))
+	if (local64_add_negative(nr, &hwc->period_left))
 		return;
 
 	perf_swevent_overflow(event, 0, nmi, data, regs);
@@ -4383,8 +4383,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
 	u64 now;
 
 	now = cpu_clock(cpu);
-	prev = atomic64_xchg(&event->hw.prev_count, now);
-	atomic64_add(now - prev, &event->count);
+	prev = local64_xchg(&event->hw.prev_count, now);
+	local64_add(now - prev, &event->count);
 }
 
 static int cpu_clock_perf_event_enable(struct perf_event *event)
@@ -4392,7 +4392,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	int cpu = raw_smp_processor_id();
 
-	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+	local64_set(&hwc->prev_count, cpu_clock(cpu));
 	perf_swevent_start_hrtimer(event);
 
 	return 0;
@@ -4424,9 +4424,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now)
 	u64 prev;
 	s64 delta;
 
-	prev = atomic64_xchg(&event->hw.prev_count, now);
+	prev = local64_xchg(&event->hw.prev_count, now);
 	delta = now - prev;
-	atomic64_add(delta, &event->count);
+	local64_add(delta, &event->count);
 }
 
 static int task_clock_perf_event_enable(struct perf_event *event)
@@ -4436,7 +4436,7 @@ static int task_clock_perf_event_enable(struct perf_event *event)
 
 	now = event->ctx->time;
 
-	atomic64_set(&hwc->prev_count, now);
+	local64_set(&hwc->prev_count, now);
 
 	perf_swevent_start_hrtimer(event);
 
@@ -4879,7 +4879,7 @@ perf_event_alloc(struct perf_event_attr *attr,
 		hwc->sample_period = 1;
 	hwc->last_period = hwc->sample_period;
 
-	atomic64_set(&hwc->period_left, hwc->sample_period);
+	local64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
 	 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -5313,7 +5313,7 @@ inherit_event(struct perf_event *parent_event,
 		hwc->sample_period = sample_period;
 		hwc->last_period   = sample_period;
 
-		atomic64_set(&hwc->period_left, sample_period);
+		local64_set(&hwc->period_left, sample_period);
 	}
 
 	child_event->overflow_handler = parent_event->overflow_handler;
-- 
cgit v1.2.3-70-g09d2


From 147ec4d2361e355ab32499f739cc24845ceb89da Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 3 Jun 2010 21:32:39 +0200
Subject: x86: Make save_stack_address() !CONFIG_FRAME_POINTER friendly

If CONFIG_FRAME_POINTER=n, print_context_stack() shouldn't neglect the
non-reliable addresses on the stack: they are all we have if
dump_trace(bp) is called with a wrong or zero bp.

For example, /proc/pid/stack doesn't work if CONFIG_FRAME_POINTER=n.

This patch obviously has no effect if CONFIG_FRAME_POINTER=y; otherwise
it reverts 1650743c "x86: don't save unreliable stack trace entries".

Also, remove the unnecessary type-cast.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20100603193239.GA31530@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/stacktrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index ea54d029fe2..abc321d5587 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -26,8 +26,10 @@ static int save_stack_stack(void *data, char *name)
 static void save_stack_address(void *data, unsigned long addr, int reliable)
 {
 	struct stack_trace *trace = data;
+#ifdef CONFIG_FRAME_POINTER
 	if (!reliable)
 		return;
+#endif
 	if (trace->skip > 0) {
 		trace->skip--;
 		return;
@@ -39,9 +41,11 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
 static void
 save_stack_address_nosched(void *data, unsigned long addr, int reliable)
 {
-	struct stack_trace *trace = (struct stack_trace *)data;
+	struct stack_trace *trace = data;
+#ifdef CONFIG_FRAME_POINTER
 	if (!reliable)
 		return;
+#endif
 	if (in_sched_functions(addr))
 		return;
 	if (trace->skip > 0) {
-- 
cgit v1.2.3-70-g09d2


From 018378c55b03f88ff513aba4e0e93b8d4a9cf241 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 3 Jun 2010 21:32:43 +0200
Subject: x86: Unify save_stack_address() and save_stack_address_nosched()

Cleanup. Factor out the common code of save_stack_address() and
save_stack_address_nosched() into __save_stack_address().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20100603193243.GA31534@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/stacktrace.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index abc321d5587..b53c525368a 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,13 +23,16 @@ static int save_stack_stack(void *data, char *name)
 	return 0;
 }
 
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static void
+__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
 {
 	struct stack_trace *trace = data;
 #ifdef CONFIG_FRAME_POINTER
 	if (!reliable)
 		return;
 #endif
+	if (nosched && in_sched_functions(addr))
+		return;
 	if (trace->skip > 0) {
 		trace->skip--;
 		return;
@@ -38,22 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
 		trace->entries[trace->nr_entries++] = addr;
 }
 
+static void save_stack_address(void *data, unsigned long addr, int reliable)
+{
+	return __save_stack_address(data, addr, reliable, false);
+}
+
 static void
 save_stack_address_nosched(void *data, unsigned long addr, int reliable)
 {
-	struct stack_trace *trace = data;
-#ifdef CONFIG_FRAME_POINTER
-	if (!reliable)
-		return;
-#endif
-	if (in_sched_functions(addr))
-		return;
-	if (trace->skip > 0) {
-		trace->skip--;
-		return;
-	}
-	if (trace->nr_entries < trace->max_entries)
-		trace->entries[trace->nr_entries++] = addr;
+	return __save_stack_address(data, addr, reliable, true);
 }
 
 static const struct stacktrace_ops save_stack_ops = {
-- 
cgit v1.2.3-70-g09d2


From d6d4d4205cf4ce4ba13bc320305afbda25303496 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 3 Jun 2010 12:07:46 +0200
Subject: x86, xsave: Cleanup return codes in check_for_xstate()

The places which call check_for_xstate() only care about zero or
non-zero so this patch doesn't change how the code runs, but it's a
cleanup.  The main reason for this patch is that I'm looking for places
which don't return -EFAULT for copy_from_user() failures.

Signed-off-by: Dan Carpenter <error27@gmail.com>
LKML-Reference: <20100603100746.GU5483@bicker>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/kernel/xsave.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 37e68fc5e24..980149867a1 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -36,15 +36,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
 
 	err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
 			       sizeof(struct _fpx_sw_bytes));
-
 	if (err)
-		return err;
+		return -EFAULT;
 
 	/*
 	 * First Magic check failed.
 	 */
 	if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
-		return -1;
+		return -EINVAL;
 
 	/*
 	 * Check for error scenarios.
@@ -52,19 +51,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
 	if (fx_sw_user->xstate_size < min_xstate_size ||
 	    fx_sw_user->xstate_size > xstate_size ||
 	    fx_sw_user->xstate_size > fx_sw_user->extended_size)
-		return -1;
+		return -EINVAL;
 
 	err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
 					    fx_sw_user->extended_size -
 					    FP_XSTATE_MAGIC2_SIZE));
+	if (err)
+		return err;
 	/*
 	 * Check for the presence of second magic word at the end of memory
 	 * layout. This detects the case where the user just copied the legacy
 	 * fpstate layout with out copying the extended state information
 	 * in the memory layout.
 	 */
-	if (err || magic2 != FP_XSTATE_MAGIC2)
-		return -1;
+	if (magic2 != FP_XSTATE_MAGIC2)
+		return -EFAULT;
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 8cc1176e5de534d55cb26ff0cef3fd0d6ad8c3c0 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@amd64.org>
Date: Wed, 2 Jun 2010 18:18:40 +0200
Subject: x86, cacheinfo: Carve out L3 cache slot accessors

This is in preparation for disabling L3 cache indices after having
received correctable ECCs in the L3 cache. Now we allow for initial
setting of a disabled index slot (write once) and deny writing new
indices to it after it has been disabled. Also, we deny using both slots
to disable one and the same index.

Userspace can restore the previously disabled indices by rewriting those
sysfs entries when booting.

Cleanup and reorganize code while at it.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100602161840.GI18327@aftab>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 108 ++++++++++++++++++++++++++--------
 1 file changed, 82 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf..898c2f4eab8 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
 	return l3;
 }
 
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
+					   int index)
 {
 	int node;
 
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	this_leaf->l3 = l3_caches[node];
 }
 
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
+{
+	unsigned int reg = 0;
+
+	pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+
+	/* check whether this slot is activated already */
+	if (reg & (3UL << 30))
+		return reg & 0xfff;
+
+	return -1;
+}
+
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 				  unsigned int slot)
 {
-	struct pci_dev *dev = this_leaf->l3->dev;
-	unsigned int reg = 0;
+	int index;
 
 	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
-	if (!dev)
-		return -EINVAL;
+	index = amd_get_l3_disable_slot(this_leaf->l3, slot);
+	if (index >= 0)
+		return sprintf(buf, "%d\n", index);
 
-	pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
-	return sprintf(buf, "0x%08x\n", reg);
+	return sprintf(buf, "FREE\n");
 }
 
 #define SHOW_CACHE_DISABLE(slot)					\
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
 	}
 }
 
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-				   const char *buf, size_t count,
-				   unsigned int slot)
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3:    L3 cache descriptor
+ * @cpu:   A CPU on the node containing the L3 cache
+ * @slot:  slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
+			    unsigned long index)
 {
-	struct pci_dev *dev = this_leaf->l3->dev;
-	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	unsigned long val = 0;
+	int ret = 0;
 
 #define SUBCACHE_MASK	(3UL << 20)
 #define SUBCACHE_INDEX	0xfff
 
-	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+	/*
+	 * check whether this slot is already used or
+	 * the index is already disabled
+	 */
+	ret = amd_get_l3_disable_slot(l3, slot);
+	if (ret >= 0)
 		return -EINVAL;
 
+	/*
+	 * check whether the other slot has disabled the
+	 * same index already
+	 */
+	if (index == amd_get_l3_disable_slot(l3, !slot))
+		return -EINVAL;
+
+	/* do not allow writes outside of allowed bits */
+	if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+	    ((index & SUBCACHE_INDEX) > l3->indices))
+		return -EINVAL;
+
+	amd_l3_disable_index(l3, cpu, slot, index);
+
+	return 0;
+}
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+				  const char *buf, size_t count,
+				  unsigned int slot)
+{
+	unsigned long val = 0;
+	int cpu, err = 0;
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!dev)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
-	if (strict_strtoul(buf, 10, &val) < 0)
-		return -EINVAL;
+	cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
 
-	/* do not allow writes outside of allowed bits */
-	if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
+	if (strict_strtoul(buf, 10, &val) < 0)
 		return -EINVAL;
 
-	amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
-
+	err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
+	if (err) {
+		if (err == -EEXIST)
+			printk(KERN_WARNING "L3 disable slot %d in use!\n",
+					    slot);
+		return err;
+	}
 	return count;
 }
 
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
 
 #else	/* CONFIG_CPU_SUP_AMD */
 static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
 {
 };
 #endif /* CONFIG_CPU_SUP_AMD */
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-		amd_check_l3_disable(index, this_leaf);
+		amd_check_l3_disable(this_leaf, index);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
-- 
cgit v1.2.3-70-g09d2


From 12d8a961289644d265d8b3e88201878837c3b814 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 2 Jun 2010 20:29:21 +0200
Subject: x86, AMD: Extend support to future families

Extend support to future families, and in particular:

* extend direct mapping split of Tseg SMM area.
* extend K8 flavored alternatives (NOPS).
* rep movs* prefix is fast in ucode.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100602182921.GA21557@aftab>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/amd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d..12b9cff047c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		}
 
 	}
-	if (c->x86 == 0x10 || c->x86 == 0x11)
+	if (c->x86 >= 0x10)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
 	/* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 			num_cache_leaves = 3;
 	}
 
-	if (c->x86 >= 0xf && c->x86 <= 0x11)
+	if (c->x86 >= 0xf)
 		set_cpu_cap(c, X86_FEATURE_K8);
 
 	if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		fam10h_check_enable_mmcfg();
 	}
 
-	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+	if (c == &boot_cpu_data && c->x86 >= 0xf) {
 		unsigned long long tseg;
 
 		/*
-- 
cgit v1.2.3-70-g09d2


From 1f9a0bd4989fd16842ad71fc89240b48ab191446 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 8 Jun 2010 14:09:08 +0800
Subject: x86, mce: Rename MSR_IA32_MCx_CTL2 value

Rename CMCI_EN to MCI_CTL2_CMCI_EN and CMCI_THRESHOLD_MASK to
MCI_CTL2_CMCI_THRESHOLD_MASK to make naming consistent.

Signed-off-by: Huang Ying <ying.huang@intel.com>
LKML-Reference: <1275977348.3444.659.camel@yhuang-dev.sh.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h             | 4 ++++
 arch/x86/include/asm/msr-index.h       | 3 ---
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 8 ++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f32a4301c4d..82db1d8f064 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -38,6 +38,10 @@
 #define MCM_ADDR_MEM	 3	/* memory address */
 #define MCM_ADDR_GENERIC 7	/* generic */
 
+/* CTL2 register defines */
+#define MCI_CTL2_CMCI_EN		(1ULL << 30)
+#define MCI_CTL2_CMCI_THRESHOLD_MASK	0xffffULL
+
 #define MCJ_CTX_MASK		3
 #define MCJ_CTX(flags)		((flags) & MCJ_CTX_MASK)
 #define MCJ_CTX_RANDOM		0    /* inject context: random */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b49d8ca228f..38f66eb5854 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -94,9 +94,6 @@
 #define MSR_IA32_MC0_CTL2		0x00000280
 #define MSR_IA32_MCx_CTL2(x)		(MSR_IA32_MC0_CTL2 + (x))
 
-#define CMCI_EN			(1ULL << 30)
-#define CMCI_THRESHOLD_MASK		0xffffULL
-
 #define MSR_P6_PERFCTR0			0x000000c1
 #define MSR_P6_PERFCTR1			0x000000c2
 #define MSR_P6_EVNTSEL0			0x00000186
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920..faf7b2919a8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,19 @@ static void cmci_discover(int banks, int boot)
 		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 
 		/* Already owned by someone else? */
-		if (val & CMCI_EN) {
+		if (val & MCI_CTL2_CMCI_EN) {
 			if (test_and_clear_bit(i, owned) && !boot)
 				print_update("SHD", &hdr, i);
 			__clear_bit(i, __get_cpu_var(mce_poll_banks));
 			continue;
 		}
 
-		val |= CMCI_EN | CMCI_THRESHOLD;
+		val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
 		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
 		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 
 		/* Did the enable bit stick? -- the bank supports CMCI */
-		if (val & CMCI_EN) {
+		if (val & MCI_CTL2_CMCI_EN) {
 			if (!test_and_set_bit(i, owned) && !boot)
 				print_update("CMCI", &hdr, i);
 			__clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +155,7 @@ void cmci_clear(void)
 			continue;
 		/* Disable CMCI */
 		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-		val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+		val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
 		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
 		__clear_bit(i, __get_cpu_var(mce_banks_owned));
 	}
-- 
cgit v1.2.3-70-g09d2


From 3c417588603e5411f29d22a40f3b5ff71529a4f0 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 8 Jun 2010 14:09:10 +0800
Subject: x86, mce: Fix MSR_IA32_MCI_CTL2 CMCI threshold setup

It was reported that CMCI is not raised when the number of corrected
errors reaches the preset threshold. Inspection showed that the
MSR_IA32_MCI_CTL2 threshold field is not set up properly: the old
threshold bits are not cleared before the new threshold is ORed in.
This patch fixes it.

The value of MCI_CTL2_CMCI_THRESHOLD_MASK is also corrected (0xffff ->
0x7fff) according to the x86_64 Software Developer's Manual.

Reported-by: Shaohui Zheng <shaohui.zheng@intel.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
LKML-Reference: <1275977350.3444.660.camel@yhuang-dev.sh.intel.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h             | 2 +-
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 82db1d8f064..c62c13cb978 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,7 +40,7 @@
 
 /* CTL2 register defines */
 #define MCI_CTL2_CMCI_EN		(1ULL << 30)
-#define MCI_CTL2_CMCI_THRESHOLD_MASK	0xffffULL
+#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7fffULL
 
 #define MCJ_CTX_MASK		3
 #define MCJ_CTX(flags)		((flags) & MCJ_CTX_MASK)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index faf7b2919a8..6fcd0936194 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -102,6 +102,7 @@ static void cmci_discover(int banks, int boot)
 			continue;
 		}
 
+		val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
 		val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
 		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
 		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-- 
cgit v1.2.3-70-g09d2


From a2d7b0d4852536273b65d16fe179c65184fe5e2d Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 8 Jun 2010 14:35:39 +0800
Subject: x86, mce: Use HW_ERR in MCE handler

Use the HW_ERR printk prefix in the MCE handler to make it more explicit
that this is a hardware error rather than a software error.

Signed-off-by: Huang Ying <ying.huang@intel.com>
LKML-Reference: <1275978939.3444.668.camel@yhuang-dev.sh.intel.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc4256225..094b228c8b0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
 static int default_decode_mce(struct notifier_block *nb, unsigned long val,
 			       void *data)
 {
-	pr_emerg("No human readable MCE decoding support on this CPU type.\n");
-	pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+	pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
+	pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
 
 	return NOTIFY_STOP;
 }
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce)
 
 static void print_mce(struct mce *m)
 {
-	pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 	       m->extcpu, m->mcgstatus, m->bank, m->status);
 
 	if (m->ip) {
-		pr_emerg("RIP%s %02x:<%016Lx> ",
+		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 				m->cs, m->ip);
 
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m)
 		pr_cont("\n");
 	}
 
-	pr_emerg("TSC %llx ", m->tsc);
+	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 	if (m->addr)
 		pr_cont("ADDR %llx ", m->addr);
 	if (m->misc)
 		pr_cont("MISC %llx ", m->misc);
 
 	pr_cont("\n");
-	pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
 		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
 
 	/*
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m)
 	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 }
 
-static void print_mce_head(void)
-{
-	pr_emerg("\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
-	pr_emerg("This is not a software problem!\n");
-}
-
 #define PANIC_TIMEOUT 5 /* 5 seconds */
 
 static atomic_t mce_paniced;
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 		if (atomic_inc_return(&mce_fake_paniced) > 1)
 			return;
 	}
-	print_mce_head();
 	/* First print corrected ones that are still unlogged */
 	for (i = 0; i < MCE_LOG_LEN; i++) {
 		struct mce *m = &mcelog.entry[i];
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 			apei_err = apei_write_mce(final);
 	}
 	if (cpu_missing)
-		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
-	print_mce_tail();
+		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 	if (exp)
-		printk(KERN_EMERG "Machine check: %s\n", exp);
+		pr_emerg(HW_ERR "Machine check: %s\n", exp);
 	if (!fake_panic) {
 		if (panic_timeout == 0)
 			panic_timeout = mce_panic_timeout;
 		panic(msg);
 	} else
-		printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
+		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 }
 
 /* Support code for software error injection */
@@ -1220,7 +1208,7 @@ int mce_notify_irq(void)
 			schedule_work(&mce_trigger_work);
 
 		if (__ratelimit(&ratelimit))
-			printk(KERN_INFO "Machine check events logged\n");
+			pr_info(HW_ERR "Machine check events logged\n");
 
 		return 1;
 	}
-- 
cgit v1.2.3-70-g09d2


From ec8c27e04f89a7575ca2c4facb99152e03d6a99c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 30 Apr 2010 06:45:36 -0700
Subject: mce: convert to rcu_dereference_index_check()

The mce processing applies rcu_dereference_check() to integers used as
array indices.  This patch therefore moves mce to the new RCU API
rcu_dereference_index_check() that avoids the sparse processing that
would otherwise result in compiler errors.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc4256225..0e78657e29c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
 static DEFINE_MUTEX(mce_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
-	rcu_dereference_check((p), \
+	rcu_dereference_index_check((p), \
 			      rcu_read_lock_sched_held() || \
 			      lockdep_is_held(&mce_read_mutex))
 
-- 
cgit v1.2.3-70-g09d2


From 421f91d21ad6f799dc7b489bb33cc560ccc56f98 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 11 Jun 2010 12:17:00 +0200
Subject: fix typos concerning "initiali[zs]e"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/arm/mach-msm/acpuclock-arm11.c      |  4 ++--
 arch/arm/mach-u300/gpio.c                |  2 +-
 arch/arm/plat-s3c24xx/clock.c            |  2 +-
 arch/arm/plat-samsung/clock.c            |  2 +-
 arch/h8300/kernel/timer/itu.c            |  2 +-
 arch/h8300/kernel/timer/timer16.c        |  2 +-
 arch/h8300/kernel/timer/timer8.c         |  2 +-
 arch/ia64/kvm/kvm-ia64.c                 |  6 +++---
 arch/ia64/sn/kernel/setup.c              |  2 +-
 arch/sparc/boot/btfixupprep.c            |  4 ++--
 arch/x86/kernel/apic/apic.c              |  2 +-
 arch/x86/kernel/head32.c                 |  2 +-
 drivers/crypto/amcc/crypto4xx_reg_def.h  |  2 +-
 drivers/dma/at_hdmac.c                   |  2 +-
 drivers/gpu/drm/savage/savage_bci.c      |  2 +-
 drivers/ide/ide-gd.c                     |  2 +-
 drivers/infiniband/hw/ehca/hcp_if.h      |  2 +-
 drivers/input/misc/ad714x.c              |  2 +-
 drivers/media/video/ov511.c              |  2 +-
 drivers/media/video/zoran/zoran.h        |  2 +-
 drivers/media/video/zoran/zr36050.c      |  2 +-
 drivers/media/video/zoran/zr36060.c      |  2 +-
 drivers/message/fusion/mptbase.c         |  4 ++--
 drivers/mtd/nand/denali.c                |  2 +-
 drivers/net/3c527.c                      |  4 ++--
 drivers/net/appletalk/ipddp.c            |  2 +-
 drivers/net/hp100.c                      |  2 +-
 drivers/net/ibm_newemac/core.c           |  2 +-
 drivers/net/ksz884x.c                    |  2 +-
 drivers/net/ll_temac_main.c              |  2 +-
 drivers/net/tulip/dmfe.c                 | 20 ++++++++++----------
 drivers/net/wimax/i2400m/control.c       |  2 +-
 drivers/parisc/ccio-dma.c                |  4 ++--
 drivers/pcmcia/sa11xx_base.c             |  2 +-
 drivers/scsi/advansys.c                  |  2 +-
 drivers/scsi/aic94xx/aic94xx_seq.c       |  4 ++--
 drivers/scsi/bfa/vport.c                 |  2 +-
 drivers/scsi/pm8001/pm8001_hwi.c         |  2 +-
 drivers/scsi/qla4xxx/ql4_init.c          |  2 +-
 drivers/serial/sn_console.c              |  6 +++---
 drivers/staging/comedi/drivers/usbdux.c  |  2 +-
 drivers/staging/octeon/cvmx-cmd-queue.c  |  6 +++---
 drivers/staging/pohmelfs/inode.c         |  2 +-
 drivers/staging/rt2860/common/cmm_wpa.c  |  4 ++--
 drivers/staging/rtl8192e/r8190_rtl8256.c |  6 +++---
 drivers/usb/serial/kl5kusb105.c          |  2 +-
 drivers/usb/wusbcore/wusbhc.c            |  2 +-
 drivers/uwb/wlp/wss-lc.c                 |  2 +-
 drivers/video/carminefb.c                |  2 +-
 drivers/video/tgafb.c                    |  2 +-
 fs/befs/linuxvfs.c                       |  2 +-
 fs/ecryptfs/crypto.c                     |  2 +-
 fs/ext4/extents.c                        |  2 +-
 fs/ext4/super.c                          |  2 +-
 fs/freevxfs/vxfs_super.c                 |  2 +-
 fs/ocfs2/super.c                         |  2 +-
 fs/reiserfs/inode.c                      |  2 +-
 lib/random32.c                           |  2 +-
 net/netfilter/ipvs/ip_vs_lblc.c          |  2 +-
 net/netfilter/ipvs/ip_vs_lblcr.c         |  2 +-
 net/sctp/associola.c                     |  2 +-
 net/sctp/protocol.c                      |  2 +-
 security/smack/smack_lsm.c               |  2 +-
 sound/pci/trident/trident_main.c         |  2 +-
 sound/soc/fsl/mpc8610_hpcd.c             |  2 +-
 sound/soc/soc-core.c                     |  2 +-
 66 files changed, 90 insertions(+), 90 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/arm/mach-msm/acpuclock-arm11.c b/arch/arm/mach-msm/acpuclock-arm11.c
index af5e85b91d0..f060a3959a7 100644
--- a/arch/arm/mach-msm/acpuclock-arm11.c
+++ b/arch/arm/mach-msm/acpuclock-arm11.c
@@ -98,7 +98,7 @@ struct clkctl_acpu_speed {
 
 /*
  * ACPU speed table. Complete table is shown but certain speeds are commented
- * out to optimized speed switching. Initalize loops_per_jiffy to 0.
+ * out to optimized speed switching. Initialize loops_per_jiffy to 0.
  *
  * Table stepping up/down is optimized for 256mhz jumps while staying on the
  * same PLL.
@@ -494,7 +494,7 @@ uint32_t acpuclk_get_switch_time(void)
  * Clock driver initialization
  *---------------------------------------------------------------------------*/
 
-/* Initalize the lpj field in the acpu_freq_tbl. */
+/* Initialize the lpj field in the acpu_freq_tbl. */
 static void __init lpj_init(void)
 {
 	int i;
diff --git a/arch/arm/mach-u300/gpio.c b/arch/arm/mach-u300/gpio.c
index 5f61fd45a0c..d92790140fe 100644
--- a/arch/arm/mach-u300/gpio.c
+++ b/arch/arm/mach-u300/gpio.c
@@ -523,7 +523,7 @@ static void gpio_set_initial_values(void)
 
 	/*
 	 * Put all pins that are set to either 'GPIO_OUT' or 'GPIO_NOT_USED'
-	 * to output and 'GPIO_IN' to input for each port. And initalize
+	 * to output and 'GPIO_IN' to input for each port. And initialize
 	 * default value on outputs.
 	 */
 	for (i = 0; i < U300_GPIO_NUM_PORTS; i++) {
diff --git a/arch/arm/plat-s3c24xx/clock.c b/arch/arm/plat-s3c24xx/clock.c
index 8474d05274b..931d26d1a54 100644
--- a/arch/arm/plat-s3c24xx/clock.c
+++ b/arch/arm/plat-s3c24xx/clock.c
@@ -43,7 +43,7 @@
 #include <plat/cpu.h>
 #include <plat/pll.h>
 
-/* initalise all the clocks */
+/* initialise all the clocks */
 
 void __init_or_cpufreq s3c24xx_setup_clocks(unsigned long fclk,
 					   unsigned long hclk,
diff --git a/arch/arm/plat-samsung/clock.c b/arch/arm/plat-samsung/clock.c
index 8bf79f3efdf..90a20512d68 100644
--- a/arch/arm/plat-samsung/clock.c
+++ b/arch/arm/plat-samsung/clock.c
@@ -391,7 +391,7 @@ void __init s3c_disable_clocks(struct clk *clkp, int nr_clks)
 		(clkp->enable)(clkp, 0);
 }
 
-/* initalise all the clocks */
+/* initialise all the clocks */
 
 int __init s3c24xx_register_baseclocks(unsigned long xtal)
 {
diff --git a/arch/h8300/kernel/timer/itu.c b/arch/h8300/kernel/timer/itu.c
index 4883ba7103a..a2ae5e95213 100644
--- a/arch/h8300/kernel/timer/itu.c
+++ b/arch/h8300/kernel/timer/itu.c
@@ -73,7 +73,7 @@ void __init h8300_timer_setup(void)
 
 	setup_irq(ITUIRQ, &itu_irq);
 
-	/* initalize timer */
+	/* initialize timer */
 	ctrl_outb(0, TSTR);
 	ctrl_outb(CCLR0 | div, ITUBASE + TCR);
 	ctrl_outb(0x01, ITUBASE + TIER);
diff --git a/arch/h8300/kernel/timer/timer16.c b/arch/h8300/kernel/timer/timer16.c
index 042dbb53f3f..ae0d3816113 100644
--- a/arch/h8300/kernel/timer/timer16.c
+++ b/arch/h8300/kernel/timer/timer16.c
@@ -68,7 +68,7 @@ void __init h8300_timer_setup(void)
 
 	setup_irq(_16IRQ, &timer16_irq);
 
-	/* initalize timer */
+	/* initialize timer */
 	ctrl_outb(0, TSTR);
 	ctrl_outb(CCLR0 | div, _16BASE + TCR);
 	ctrl_outw(cnt, _16BASE + GRA);
diff --git a/arch/h8300/kernel/timer/timer8.c b/arch/h8300/kernel/timer/timer8.c
index 38be0cabef0..3946c0fa837 100644
--- a/arch/h8300/kernel/timer/timer8.c
+++ b/arch/h8300/kernel/timer/timer8.c
@@ -94,7 +94,7 @@ void __init h8300_timer_setup(void)
 	ctrl_bclr(0, MSTPCRL)
 #endif
 
-	/* initalize timer */
+	/* initialize timer */
 	ctrl_outw(cnt, _8BASE + TCORA);
 	ctrl_outw(0x0000, _8BASE + _8TCSR);
 	ctrl_outw((CMIEA|CCLR_CMA|CKS2) << 8 | div,
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 7f3c0a2e60c..29afd9a252f 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1234,7 +1234,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	p_ctx->cr[2] = (unsigned long)kvm_vmm_info->vmm_ivt;
 	p_ctx->cr[8] = 0x3c;
 
-	/*Initilize region register*/
+	/*Initialize region register*/
 	p_ctx->rr[0] = 0x30;
 	p_ctx->rr[1] = 0x30;
 	p_ctx->rr[2] = 0x30;
@@ -1243,7 +1243,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	p_ctx->rr[5] = 0x30;
 	p_ctx->rr[7] = 0x30;
 
-	/*Initilize branch register 0*/
+	/*Initialize branch register 0*/
 	p_ctx->br[0] = *(unsigned long *)kvm_vmm_info->vmm_entry;
 
 	vcpu->arch.vmm_rr = kvm->arch.vmm_init_rr;
@@ -1702,7 +1702,7 @@ static int kvm_relocate_vmm(struct kvm_vmm_info *vmm_info,
 	BUG_ON(!module);
 
 	if (!kvm_vmm_base) {
-		printk("kvm: kvm area hasn't been initilized yet!!\n");
+		printk("kvm: kvm area hasn't been initialized yet!!\n");
 		return -EFAULT;
 	}
 
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index d00dfc18002..dbc4cbecb5e 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -507,7 +507,7 @@ static void __init sn_init_pdas(char **cmdline_p)
 	cnodeid_t cnode;
 
 	/*
-	 * Allocate & initalize the nodepda for each node.
+	 * Allocate & initialize the nodepda for each node.
 	 */
 	for_each_online_node(cnode) {
 		nodepdaindr[cnode] =
diff --git a/arch/sparc/boot/btfixupprep.c b/arch/sparc/boot/btfixupprep.c
index bbf91b9c3d3..b6049110223 100644
--- a/arch/sparc/boot/btfixupprep.c
+++ b/arch/sparc/boot/btfixupprep.c
@@ -216,7 +216,7 @@ main1:
 		switch (buffer[nbase+3]) {
 		case 'f':
 			if (initval) {
-				fprintf(stderr, "Cannot use pre-initalized fixups for calls\n%s\n", buffer);
+				fprintf(stderr, "Cannot use pre-initialized fixups for calls\n%s\n", buffer);
 				exit(1);
 			}
 			if (!strcmp (sect, "__ksymtab")) {
@@ -273,7 +273,7 @@ main1:
 			break;
 		case 'i':
 			if (initval) {
-				fprintf(stderr, "Cannot use pre-initalized fixups for INT\n%s\n", buffer);
+				fprintf(stderr, "Cannot use pre-initialized fixups for INT\n%s\n", buffer);
 				exit(1);
 			}
 			if (strncmp (buffer + mode+9, "HI22      ", 10) && strncmp (buffer + mode+9, "LO10      ", 10)) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e5a4a1e0161..192cd7ee35c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -459,7 +459,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
 }
 
 /*
- * Setup the local APIC timer for this CPU. Copy the initilized values
+ * Setup the local APIC timer for this CPU. Copy the initialized values
  * of the boot CPU and register the clock event in the framework.
  */
 static void __cpuinit setup_APIC_timer(void)
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e24603739..784360c0625 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -20,7 +20,7 @@
 
 static void __init i386_default_early_setup(void)
 {
-	/* Initilize 32bit specific setup functions */
+	/* Initialize 32bit specific setup functions */
 	x86_init.resources.probe_roms = probe_roms;
 	x86_init.resources.reserve_resources = i386_reserve_resources;
 	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/drivers/crypto/amcc/crypto4xx_reg_def.h b/drivers/crypto/amcc/crypto4xx_reg_def.h
index 7d4edb00261..5f5fbc0716f 100644
--- a/drivers/crypto/amcc/crypto4xx_reg_def.h
+++ b/drivers/crypto/amcc/crypto4xx_reg_def.h
@@ -113,7 +113,7 @@
 #define CRYPTO4XX_PRNG_LFSR_H			0x00070034
 
 /**
- * Initilize CRYPTO ENGINE registers, and memory bases.
+ * Initialize CRYPTO ENGINE registers, and memory bases.
  */
 #define PPC4XX_PDR_POLL				0x3ff
 #define PPC4XX_OUTPUT_THRESHOLD			2
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 278cf5bceef..308ab320e20 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -69,7 +69,7 @@ static struct at_desc *atc_first_queued(struct at_dma_chan *atchan)
 }
 
 /**
- * atc_alloc_descriptor - allocate and return an initilized descriptor
+ * atc_alloc_descriptor - allocate and return an initialized descriptor
  * @chan: the channel to allocate descriptors for
  * @gfp_flags: GFP allocation flags
  *
diff --git a/drivers/gpu/drm/savage/savage_bci.c b/drivers/gpu/drm/savage/savage_bci.c
index 2d0c9ca484c..fa05cda8c98 100644
--- a/drivers/gpu/drm/savage/savage_bci.c
+++ b/drivers/gpu/drm/savage/savage_bci.c
@@ -552,7 +552,7 @@ int savage_driver_load(struct drm_device *dev, unsigned long chipset)
 
 
 /*
- * Initalize mappings. On Savage4 and SavageIX the alignment
+ * Initialize mappings. On Savage4 and SavageIX the alignment
  * and size of the aperture is not suitable for automatic MTRR setup
  * in drm_addmap. Therefore we add them manually before the maps are
  * initialized, and tear them down on last close.
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index c32d83996ae..27d9fe08d80 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -92,7 +92,7 @@ static void ide_disk_release(struct device *dev)
 
 /*
  * On HPA drives the capacity needs to be
- * reinitilized on resume otherwise the disk
+ * reinitialized on resume otherwise the disk
  * can not be used and a hard reset is required
  */
 static void ide_gd_resume(ide_drive_t *drive)
diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
index 39c1c3618ec..a46e514c367 100644
--- a/drivers/infiniband/hw/ehca/hcp_if.h
+++ b/drivers/infiniband/hw/ehca/hcp_if.h
@@ -49,7 +49,7 @@
 #include "hipz_hw.h"
 
 /*
- * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initalize
+ * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize
  * resources, create the empty EQPT (ring).
  */
 u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
diff --git a/drivers/input/misc/ad714x.c b/drivers/input/misc/ad714x.c
index 0fe27baf5e7..c431d09e401 100644
--- a/drivers/input/misc/ad714x.c
+++ b/drivers/input/misc/ad714x.c
@@ -1118,7 +1118,7 @@ struct ad714x_chip *ad714x_probe(struct device *dev, u16 bus_type, int irq,
 	if (error)
 		goto err_free_mem;
 
-	/* initilize and request sw/hw resources */
+	/* initialize and request sw/hw resources */
 
 	ad714x_hw_init(ad714x);
 	mutex_init(&ad714x->mutex);
diff --git a/drivers/media/video/ov511.c b/drivers/media/video/ov511.c
index a10912097b7..78a6eb698b0 100644
--- a/drivers/media/video/ov511.c
+++ b/drivers/media/video/ov511.c
@@ -4808,7 +4808,7 @@ ov7xx0_configure(struct usb_ov511 *ov)
 		return -1;
 
 	if (init_ov_sensor(ov) >= 0) {
-		PDEBUG(1, "OV7xx0 sensor initalized (method 1)");
+		PDEBUG(1, "OV7xx0 sensor initialized (method 1)");
 	} else {
 		/* Reset the 76xx */
 		if (i2c_w(ov, 0x12, 0x80) < 0)
diff --git a/drivers/media/video/zoran/zoran.h b/drivers/media/video/zoran/zoran.h
index 8997add1248..307e847fe1c 100644
--- a/drivers/media/video/zoran/zoran.h
+++ b/drivers/media/video/zoran/zoran.h
@@ -391,7 +391,7 @@ struct zoran {
 
 	struct mutex resource_lock;	/* prevent evil stuff */
 
-	u8 initialized;		/* flag if zoran has been correctly initalized */
+	u8 initialized;		/* flag if zoran has been correctly initialized */
 	int user;		/* number of current users */
 	struct card_info card;
 	struct tvnorm *timing;
diff --git a/drivers/media/video/zoran/zr36050.c b/drivers/media/video/zoran/zr36050.c
index 639dd87c663..e1985609af4 100644
--- a/drivers/media/video/zoran/zr36050.c
+++ b/drivers/media/video/zoran/zr36050.c
@@ -236,7 +236,7 @@ zr36050_pushit (struct zr36050 *ptr,
 
    Could be variable, but until it's not needed it they are just fixed to save
    memory. Otherwise expand zr36050 structure with arrays, push the values to
-   it and initalize from there, as e.g. the linux zr36057/60 driver does it.
+   it and initialize from there, as e.g. the linux zr36057/60 driver does it.
    ========================================================================= */
 
 static const char zr36050_dqt[0x86] = {
diff --git a/drivers/media/video/zoran/zr36060.c b/drivers/media/video/zoran/zr36060.c
index 008746ff774..5e4f57cbf31 100644
--- a/drivers/media/video/zoran/zr36060.c
+++ b/drivers/media/video/zoran/zr36060.c
@@ -227,7 +227,7 @@ zr36060_pushit (struct zr36060 *ptr,
 
    Could be variable, but until it's not needed it they are just fixed to save
    memory. Otherwise expand zr36060 structure with arrays, push the values to
-   it and initalize from there, as e.g. the linux zr36057/60 driver does it.
+   it and initialize from there, as e.g. the linux zr36057/60 driver does it.
    ========================================================================= */
 
 static const char zr36060_dqt[0x86] = {
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index a6a57011ba6..14d162fb8a2 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -1794,7 +1794,7 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
 	ioc->sh = NULL;
 	ioc->cached_fw = NULL;
 
-	/* Initilize SCSI Config Data structure
+	/* Initialize SCSI Config Data structure
 	 */
 	memset(&ioc->spi_data, 0, sizeof(SpiCfgData));
 
@@ -2471,7 +2471,7 @@ mpt_do_ioc_recovery(MPT_ADAPTER *ioc, u32 reason, int sleepFlag)
 	if ((ret == 0) && (reason == MPT_HOSTEVENT_IOC_BRINGUP)) {
 
 		/*
-		 * Initalize link list for inactive raid volumes.
+		 * Initialize link list for inactive raid volumes.
 		 */
 		mutex_init(&ioc->raid_data.inactive_list_mutex);
 		INIT_LIST_HEAD(&ioc->raid_data.inactive_list);
diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c
index ca03428b59c..3dfda9cc677 100644
--- a/drivers/mtd/nand/denali.c
+++ b/drivers/mtd/nand/denali.c
@@ -1836,7 +1836,7 @@ static struct nand_bbt_descr bbt_mirror_descr = {
 	.pattern = mirror_pattern,
 };
 
-/* initalize driver data structures */
+/* initialize driver data structures */
 void denali_drv_init(struct denali_nand_info *denali)
 {
 	denali->idx = 0;
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c
index 38395dfa496..70705d1306b 100644
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -729,14 +729,14 @@ static void mc32_halt_transceiver(struct net_device *dev)
  *	mc32_load_rx_ring	-	load the ring of receive buffers
  *	@dev: 3c527 to build the ring for
  *
- *	This initalises the on-card and driver datastructures to
+ *	This initialises the on-card and driver datastructures to
  *	the point where mc32_start_transceiver() can be called.
  *
  *	The card sets up the receive ring for us. We are required to use the
  *	ring it provides, although the size of the ring is configurable.
  *
  * 	We allocate an sk_buff for each ring entry in turn and
- * 	initalise its house-keeping info. At the same time, we read
+ * 	initialise its house-keeping info. At the same time, we read
  * 	each 'next' pointer in our rx_ring array. This reduces slow
  * 	shared-memory reads and makes it easy to access predecessor
  * 	descriptors.
diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c
index 79636ee3582..0362c8d31a0 100644
--- a/drivers/net/appletalk/ipddp.c
+++ b/drivers/net/appletalk/ipddp.c
@@ -80,7 +80,7 @@ static struct net_device * __init ipddp_init(void)
 	if (version_printed++ == 0)
                 printk(version);
 
-	/* Initalize the device structure. */
+	/* Initialize the device structure. */
 	dev->netdev_ops = &ipddp_netdev_ops;
 
         dev->type = ARPHRD_IPDDP;       	/* IP over DDP tunnel */
diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c
index 68e5ac8832a..dfc787fa8b1 100644
--- a/drivers/net/hp100.c
+++ b/drivers/net/hp100.c
@@ -1071,7 +1071,7 @@ static void hp100_mmuinit(struct net_device *dev)
 	if (lp->mode == 1)
 		hp100_init_pdls(dev);
 
-	/* Go to performance page and initalize isr and imr registers */
+	/* Go to performance page and initialize isr and imr registers */
 	hp100_page(PERFORMANCE);
 	hp100_outw(0xfefe, IRQ_MASK);	/* mask off all ints */
 	hp100_outw(0xffff, IRQ_STATUS);	/* ack IRQ */
diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index 2484e9e6c1e..6a45f8f3a0c 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -1044,7 +1044,7 @@ static int emac_change_mtu(struct net_device *ndev, int new_mtu)
 	DBG(dev, "change_mtu(%d)" NL, new_mtu);
 
 	if (netif_running(ndev)) {
-		/* Check if we really need to reinitalize RX ring */
+		/* Check if we really need to reinitialize RX ring */
 		if (emac_rx_skb_size(ndev->mtu) != emac_rx_skb_size(new_mtu))
 			ret = emac_resize_rx_ring(dev, new_mtu);
 	}
diff --git a/drivers/net/ksz884x.c b/drivers/net/ksz884x.c
index c80ca64277b..c02ce1ab657 100644
--- a/drivers/net/ksz884x.c
+++ b/drivers/net/ksz884x.c
@@ -6812,7 +6812,7 @@ static int stp;
 static int fast_aging;
 
 /**
- * netdev_init - initalize network device.
+ * netdev_init - initialize network device.
  * @dev:	Network device.
  *
  * This function initializes the network device.
diff --git a/drivers/net/ll_temac_main.c b/drivers/net/ll_temac_main.c
index b59b24d667f..0ace2a46d31 100644
--- a/drivers/net/ll_temac_main.c
+++ b/drivers/net/ll_temac_main.c
@@ -449,7 +449,7 @@ static u32 temac_setoptions(struct net_device *ndev, u32 options)
 	return (0);
 }
 
-/* Initilize temac */
+/* Initialize temac */
 static void temac_device_reset(struct net_device *ndev)
 {
 	struct temac_local *lp = netdev_priv(ndev);
diff --git a/drivers/net/tulip/dmfe.c b/drivers/net/tulip/dmfe.c
index 29e6c63d39f..0bc4f3030a8 100644
--- a/drivers/net/tulip/dmfe.c
+++ b/drivers/net/tulip/dmfe.c
@@ -589,7 +589,7 @@ static int dmfe_open(struct DEVICE *dev)
 		db->dm910x_chk_mode = 1;	/* Enter the check mode */
 	}
 
-	/* Initilize DM910X board */
+	/* Initialize DM910X board */
 	dmfe_init_dm910x(dev);
 
 	/* Active System Interface */
@@ -606,9 +606,9 @@ static int dmfe_open(struct DEVICE *dev)
 }
 
 
-/*	Initilize DM910X board
+/*	Initialize DM910X board
  *	Reset DM910X board
- *	Initilize TX/Rx descriptor chain structure
+ *	Initialize TX/Rx descriptor chain structure
  *	Send the set-up frame
  *	Enable Tx/Rx machine
  */
@@ -649,7 +649,7 @@ static void dmfe_init_dm910x(struct DEVICE *dev)
 	if ( !(db->media_mode & DMFE_AUTO) )
 		db->op_mode = db->media_mode; 	/* Force Mode */
 
-	/* Initiliaze Transmit/Receive decriptor and CR3/4 */
+	/* Initialize Transmit/Receive decriptor and CR3/4 */
 	dmfe_descriptor_init(db, ioaddr);
 
 	/* Init CR6 to program DM910x operation */
@@ -1288,7 +1288,7 @@ static void dmfe_timer(unsigned long data)
  *	Stop DM910X board
  *	Free Tx/Rx allocated memory
  *	Reset DM910X board
- *	Re-initilize DM910X board
+ *	Re-initialize DM910X board
  */
 
 static void dmfe_dynamic_reset(struct DEVICE *dev)
@@ -1316,7 +1316,7 @@ static void dmfe_dynamic_reset(struct DEVICE *dev)
 	netif_carrier_off(dev);
 	db->wait_reset = 0;
 
-	/* Re-initilize DM910X board */
+	/* Re-initialize DM910X board */
 	dmfe_init_dm910x(dev);
 
 	/* Restart upper layer interface */
@@ -1447,7 +1447,7 @@ static void update_cr6(u32 cr6_data, unsigned long ioaddr)
 
 /*
  *	Send a setup frame for DM9132
- *	This setup frame initilize DM910X address filter mode
+ *	This setup frame initialize DM910X address filter mode
 */
 
 static void dm9132_id_table(struct DEVICE *dev)
@@ -1489,7 +1489,7 @@ static void dm9132_id_table(struct DEVICE *dev)
 
 /*
  *	Send a setup frame for DM9102/DM9102A
- *	This setup frame initilize DM910X address filter mode
+ *	This setup frame initialize DM910X address filter mode
  */
 
 static void send_filter_frame(struct DEVICE *dev)
@@ -2142,7 +2142,7 @@ static int dmfe_resume(struct pci_dev *pci_dev)
 	pci_set_power_state(pci_dev, PCI_D0);
 	pci_restore_state(pci_dev);
 
-	/* Re-initilize DM910X board */
+	/* Re-initialize DM910X board */
 	dmfe_init_dm910x(dev);
 
 	/* Disable WOL */
@@ -2196,7 +2196,7 @@ MODULE_PARM_DESC(SF_mode, "Davicom DM9xxx special function "
 
 /*	Description:
  *	when user used insmod to add module, system invoked init_module()
- *	to initilize and register.
+ *	to initialize and register.
  */
 
 static int __init dmfe_init_module(void)
diff --git a/drivers/net/wimax/i2400m/control.c b/drivers/net/wimax/i2400m/control.c
index d86e8f31e7f..7f48e040c3b 100644
--- a/drivers/net/wimax/i2400m/control.c
+++ b/drivers/net/wimax/i2400m/control.c
@@ -50,7 +50,7 @@
  *
  * ROADMAP
  *
- * i2400m_dev_initalize()       Called by i2400m_dev_start()
+ * i2400m_dev_initialize()       Called by i2400m_dev_start()
  *   i2400m_set_init_config()
  *   i2400m_cmd_get_state()
  * i2400m_dev_shutdown()        Called by i2400m_dev_stop()
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index f511e70d454..75a80e46b39 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1241,10 +1241,10 @@ static struct parisc_driver ccio_driver = {
 };
 
 /**
- * ccio_ioc_init - Initalize the I/O Controller
+ * ccio_ioc_init - Initialize the I/O Controller
  * @ioc: The I/O Controller.
  *
- * Initalize the I/O Controller which includes setting up the
+ * Initialize the I/O Controller which includes setting up the
  * I/O Page Directory, the resource map, and initalizing the
  * U2/Uturn chip into virtual mode.
  */
diff --git a/drivers/pcmcia/sa11xx_base.c b/drivers/pcmcia/sa11xx_base.c
index fa28d8911b0..0c62fe31a40 100644
--- a/drivers/pcmcia/sa11xx_base.c
+++ b/drivers/pcmcia/sa11xx_base.c
@@ -231,7 +231,7 @@ int sa11xx_drv_pcmcia_probe(struct device *dev, struct pcmcia_low_level *ops,
 
 	sinfo->nskt = nr;
 
-	/* Initiliaze processor specific parameters */
+	/* Initialize processor specific parameters */
 	for (i = 0; i < nr; i++) {
 		skt = &sinfo->skt[i];
 
diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c
index 7f87979da22..0ec3da6f3e1 100644
--- a/drivers/scsi/advansys.c
+++ b/drivers/scsi/advansys.c
@@ -9717,7 +9717,7 @@ static ushort __devinit AscInitAscDvcVar(ASC_DVC_VAR *asc_dvc)
 	asc_dvc->bug_fix_cntl = 0;
 	asc_dvc->pci_fix_asyn_xfer = 0;
 	asc_dvc->pci_fix_asyn_xfer_always = 0;
-	/* asc_dvc->init_state initalized in AscInitGetConfig(). */
+	/* asc_dvc->init_state initialized in AscInitGetConfig(). */
 	asc_dvc->sdtr_done = 0;
 	asc_dvc->cur_total_qng = 0;
 	asc_dvc->is_in_int = 0;
diff --git a/drivers/scsi/aic94xx/aic94xx_seq.c b/drivers/scsi/aic94xx/aic94xx_seq.c
index d01dcc62b39..74374618010 100644
--- a/drivers/scsi/aic94xx/aic94xx_seq.c
+++ b/drivers/scsi/aic94xx/aic94xx_seq.c
@@ -588,7 +588,7 @@ static void asd_init_cseq_mdp(struct asd_ha_struct *asd_ha)
  * asd_init_cseq_scratch -- setup and init CSEQ
  * @asd_ha: pointer to host adapter structure
  *
- * Setup and initialize Central sequencers. Initialiaze the mode
+ * Setup and initialize Central sequencers. Initialize the mode
  * independent and dependent scratch page to the default settings.
  */
 static void asd_init_cseq_scratch(struct asd_ha_struct *asd_ha)
@@ -782,7 +782,7 @@ static void asd_init_lseq_mdp(struct asd_ha_struct *asd_ha,  int lseq)
 	asd_write_reg_word(asd_ha, LmSEQ_OOB_INT_ENABLES(lseq), 0);
 	/*
 	 * Set the desired interval between transmissions of the NOTIFY
-	 * (ENABLE SPINUP) primitive.  Must be initilized to val - 1.
+	 * (ENABLE SPINUP) primitive.  Must be initialized to val - 1.
 	 */
 	asd_write_reg_word(asd_ha, LmSEQ_NOTIFY_TIMER_TIMEOUT(lseq),
 			   ASD_NOTIFY_TIMEOUT - 1);
diff --git a/drivers/scsi/bfa/vport.c b/drivers/scsi/bfa/vport.c
index 27cd619a227..e2720c8a666 100644
--- a/drivers/scsi/bfa/vport.c
+++ b/drivers/scsi/bfa/vport.c
@@ -789,7 +789,7 @@ bfa_cb_lps_fdisc_comp(void *bfad, void *uarg, bfa_status_t status)
 	switch (status) {
 	case BFA_STATUS_OK:
 		/*
-		 * Initialiaze the V-Port fields
+		 * Initialize the V-Port fields
 		 */
 		__vport_fcid(vport) = bfa_lps_get_pid(vport->lps);
 		vport->vport_stats.fdisc_accepts++;
diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c
index 0e05e8a2216..e81efac25fa 100644
--- a/drivers/scsi/pm8001/pm8001_hwi.c
+++ b/drivers/scsi/pm8001/pm8001_hwi.c
@@ -1082,7 +1082,7 @@ static void pm8001_hw_chip_rst(struct pm8001_hba_info *pm8001_ha)
 }
 
 /**
- * pm8001_chip_iounmap - which maped when initilized.
+ * pm8001_chip_iounmap - which maped when initialized.
  * @pm8001_ha: our hba card information
  */
 static void pm8001_chip_iounmap(struct pm8001_hba_info *pm8001_ha)
diff --git a/drivers/scsi/qla4xxx/ql4_init.c b/drivers/scsi/qla4xxx/ql4_init.c
index 5510df8a7fa..cd3043265a6 100644
--- a/drivers/scsi/qla4xxx/ql4_init.c
+++ b/drivers/scsi/qla4xxx/ql4_init.c
@@ -183,7 +183,7 @@ static int qla4xxx_validate_mac_address(struct scsi_qla_host *ha)
  **/
 static int qla4xxx_init_local_data(struct scsi_qla_host *ha)
 {
-	/* Initilize aen queue */
+	/* Initialize aen queue */
 	ha->aen_q_count = MAX_AEN_ENTRIES;
 
 	return qla4xxx_get_firmware_status(ha);
diff --git a/drivers/serial/sn_console.c b/drivers/serial/sn_console.c
index 9794e0cd3dc..7e5e5efea4e 100644
--- a/drivers/serial/sn_console.c
+++ b/drivers/serial/sn_console.c
@@ -470,7 +470,7 @@ sn_receive_chars(struct sn_cons_port *port, unsigned long flags)
 	}
 
 	if (port->sc_port.state) {
-		/* The serial_core stuffs are initilized, use them */
+		/* The serial_core stuffs are initialized, use them */
 		tty = port->sc_port.state->port.tty;
 	}
 	else {
@@ -551,11 +551,11 @@ static void sn_transmit_chars(struct sn_cons_port *port, int raw)
 	BUG_ON(!port->sc_is_asynch);
 
 	if (port->sc_port.state) {
-		/* We're initilized, using serial core infrastructure */
+		/* We're initialized, using serial core infrastructure */
 		xmit = &port->sc_port.state->xmit;
 	} else {
 		/* Probably sn_sal_switch_to_asynch has been run but serial core isn't
-		 * initilized yet.  Just return.  Writes are going through
+		 * initialized yet.  Just return.  Writes are going through
 		 * sn_sal_console_write (due to register_console) at this time.
 		 */
 		return;
diff --git a/drivers/staging/comedi/drivers/usbdux.c b/drivers/staging/comedi/drivers/usbdux.c
index 8942ae45708..e7271685f23 100644
--- a/drivers/staging/comedi/drivers/usbdux.c
+++ b/drivers/staging/comedi/drivers/usbdux.c
@@ -2087,7 +2087,7 @@ static int usbdux_pwm_start(struct comedi_device *dev,
 	if (ret < 0)
 		return ret;
 
-	/* initalise the buffer */
+	/* initialise the buffer */
 	for (i = 0; i < this_usbduxsub->sizePwmBuf; i++)
 		((char *)(this_usbduxsub->urbPwm->transfer_buffer))[i] = 0;
 
diff --git a/drivers/staging/octeon/cvmx-cmd-queue.c b/drivers/staging/octeon/cvmx-cmd-queue.c
index 976227b0127..e9809d37516 100644
--- a/drivers/staging/octeon/cvmx-cmd-queue.c
+++ b/drivers/staging/octeon/cvmx-cmd-queue.c
@@ -140,21 +140,21 @@ cvmx_cmd_queue_result_t cvmx_cmd_queue_initialize(cvmx_cmd_queue_id_t queue_id,
 	if (qstate->base_ptr_div128) {
 		if (max_depth != (int)qstate->max_depth) {
 			cvmx_dprintf("ERROR: cvmx_cmd_queue_initialize: "
-				"Queue already initalized with different "
+				"Queue already initialized with different "
 				"max_depth (%d).\n",
 			     (int)qstate->max_depth);
 			return CVMX_CMD_QUEUE_INVALID_PARAM;
 		}
 		if (fpa_pool != qstate->fpa_pool) {
 			cvmx_dprintf("ERROR: cvmx_cmd_queue_initialize: "
-				"Queue already initalized with different "
+				"Queue already initialized with different "
 				"FPA pool (%u).\n",
 			     qstate->fpa_pool);
 			return CVMX_CMD_QUEUE_INVALID_PARAM;
 		}
 		if ((pool_size >> 3) - 1 != qstate->pool_size_m1) {
 			cvmx_dprintf("ERROR: cvmx_cmd_queue_initialize: "
-				"Queue already initalized with different "
+				"Queue already initialized with different "
 				"FPA pool size (%u).\n",
 			     (qstate->pool_size_m1 + 1) << 3);
 			return CVMX_CMD_QUEUE_INVALID_PARAM;
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 63275529ff5..fe8b093fb61 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -848,7 +848,7 @@ static void pohmelfs_destroy_inode(struct inode *inode)
 }
 
 /*
- * ->alloc_inode() callback. Allocates inode and initilizes private data.
+ * ->alloc_inode() callback. Allocates inode and initializes private data.
  */
 static struct inode *pohmelfs_alloc_inode(struct super_block *sb)
 {
diff --git a/drivers/staging/rt2860/common/cmm_wpa.c b/drivers/staging/rt2860/common/cmm_wpa.c
index 94e119faaa7..e1ead76b907 100644
--- a/drivers/staging/rt2860/common/cmm_wpa.c
+++ b/drivers/staging/rt2860/common/cmm_wpa.c
@@ -427,7 +427,7 @@ void RTMPToWirelessSta(struct rt_rtmp_adapter *pAd,
 /*
     ==========================================================================
     Description:
-        This is a function to initilize 4-way handshake
+        This is a function to initialize 4-way handshake
 
     Return:
 
@@ -867,7 +867,7 @@ void PeerPairMsg3Action(struct rt_rtmp_adapter *pAd,
     ==========================================================================
     Description:
         When receiving the last packet of 4-way pairwisekey handshake.
-        Initilize 2-way groupkey handshake following.
+        Initialize 2-way groupkey handshake following.
     Return:
     ==========================================================================
 */
diff --git a/drivers/staging/rtl8192e/r8190_rtl8256.c b/drivers/staging/rtl8192e/r8190_rtl8256.c
index 1bd054d42f2..eff47f9cddb 100644
--- a/drivers/staging/rtl8192e/r8190_rtl8256.c
+++ b/drivers/staging/rtl8192e/r8190_rtl8256.c
@@ -501,13 +501,13 @@ SetRFPowerState8190(
 				if((priv->ieee80211->eRFPowerState == eRfOff) && RT_IN_PS_LEVEL(pPSC, RT_RF_OFF_LEVL_HALT_NIC))
 				{ // The current RF state is OFF and the RF OFF level is halting the NIC, re-initialize the NIC.
 					bool rtstatus = true;
-					u32 InitilizeCount = 3;
+					u32 InitializeCount = 3;
 					do
 					{
-						InitilizeCount--;
+						InitializeCount--;
 						priv->RegRfOff = false;
 						rtstatus = NicIFEnableNIC(dev);
-					}while( (rtstatus != true) &&(InitilizeCount >0) );
+					}while( (rtstatus != true) &&(InitializeCount >0) );
 
 					if(rtstatus != true)
 					{
diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c
index cdbe8bf7f67..e8a65ce45a2 100644
--- a/drivers/usb/serial/kl5kusb105.c
+++ b/drivers/usb/serial/kl5kusb105.c
@@ -261,7 +261,7 @@ static int klsi_105_startup(struct usb_serial *serial)
 
 		spin_lock_init(&priv->lock);
 
-		/* priv->termios is left uninitalized until port opening */
+		/* priv->termios is left uninitialized until port opening */
 		init_waitqueue_head(&serial->port[i]->write_wait);
 	}
 
diff --git a/drivers/usb/wusbcore/wusbhc.c b/drivers/usb/wusbcore/wusbhc.c
index eab86e4bc77..2054d4ee977 100644
--- a/drivers/usb/wusbcore/wusbhc.c
+++ b/drivers/usb/wusbcore/wusbhc.c
@@ -26,7 +26,7 @@
  * the one that requires (phase B, wusbhc_b_{create,destroy}).
  *
  * This is so because usb_add_hcd() will start the HC, and thus, all
- * the HC specific stuff has to be already initialiazed (like sysfs
+ * the HC specific stuff has to be already initialized (like sysfs
  * thingies).
  */
 #include <linux/device.h>
diff --git a/drivers/uwb/wlp/wss-lc.c b/drivers/uwb/wlp/wss-lc.c
index 90accdd54c0..a005d2a03b5 100644
--- a/drivers/uwb/wlp/wss-lc.c
+++ b/drivers/uwb/wlp/wss-lc.c
@@ -180,7 +180,7 @@ error_kobject_register:
  * If memory was allocated for the kobject's name then it will
  * be freed by the kobject system during this time.
  *
- * The EDA cache is removed and reinitilized when the WSS is removed. We
+ * The EDA cache is removed and reinitialized when the WSS is removed. We
  * thus loose knowledge of members of this WSS at that time and need not do
  * it here.
  */
diff --git a/drivers/video/carminefb.c b/drivers/video/carminefb.c
index d8345fcc4fe..6b19136aa18 100644
--- a/drivers/video/carminefb.c
+++ b/drivers/video/carminefb.c
@@ -432,7 +432,7 @@ static int init_hardware(struct carmine_hw *hw)
 	u32 loops;
 	u32 ret;
 
-	/* Initalize Carmine */
+	/* Initialize Carmine */
 	/* Sets internal clock */
 	c_set_hw_reg(hw, CARMINE_CTL_REG + CARMINE_CTL_REG_CLOCK_ENABLE,
 			CARMINE_DFLT_IP_CLOCK_ENABLE);
diff --git a/drivers/video/tgafb.c b/drivers/video/tgafb.c
index 1b3b1c718e8..aba7686b1a3 100644
--- a/drivers/video/tgafb.c
+++ b/drivers/video/tgafb.c
@@ -305,7 +305,7 @@ tgafb_set_par(struct fb_info *info)
 	TGA_WRITE_REG(par, htimings, TGA_HORIZ_REG);
 	TGA_WRITE_REG(par, vtimings, TGA_VERT_REG);
 
-	/* Initalise RAMDAC. */
+	/* Initialise RAMDAC. */
 	if (tga_type == TGA_TYPE_8PLANE && tga_bus_pci) {
 
 		/* Init BT485 RAMDAC registers.  */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 34ddda888e6..dc39d282488 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -436,7 +436,7 @@ befs_init_inodecache(void)
 					      init_once);
 	if (befs_inode_cachep == NULL) {
 		printk(KERN_ERR "befs_init_inodecache: "
-		       "Couldn't initalize inode slabcache\n");
+		       "Couldn't initialize inode slabcache\n");
 		return -ENOMEM;
 	}
 
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 1cc087635a5..a2e3b562e65 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -762,7 +762,7 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
 
 /**
  * ecryptfs_init_crypt_ctx
- * @crypt_stat: Uninitilized crypt stats structure
+ * @crypt_stat: Uninitialized crypt stats structure
  *
  * Initialize the crypto context.
  *
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca..146f1f6a920 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2918,7 +2918,7 @@ fix_extent_len:
  * One of more index blocks maybe needed if the extent tree grow after
  * the unintialized extent split. To prevent ENOSPC occur at the IO
  * complete, we need to split the uninitialized extent before DIO submit
- * the IO. The uninitilized extent called at this time will be split
+ * the IO. The uninitialized extent called at this time will be split
  * into three uninitialized extent(at most). After IO complete, the part
  * being filled will be convert to initialized by the end_io callback function
  * via ext4_convert_unwritten_extents().
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d..8d7539c9d77 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3008,7 +3008,7 @@ no_journal:
 	ext4_ext_init(sb);
 	err = ext4_mb_init(sb, needs_recovery);
 	if (err) {
-		ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
 			 err);
 		goto failed_mount4;
 	}
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1e8af939b3e..5132c99b1ca 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -135,7 +135,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 /**
- * vxfs_read_super - read superblock into memory and initalize filesystem
+ * vxfs_read_super - read superblock into memory and initialize filesystem
  * @sbp:		VFS superblock (to fill)
  * @dp:			fs private mount data
  * @silent:		do not complain loudly when sth is wrong
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2c26ce251cb..812f10233b1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2476,7 +2476,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	kfree(osb->slot_recovery_generations);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
-	 * allocate osb->journal at the start of ocfs2_initalize_osb(),
+	 * allocate osb->journal at the start of ocfs2_initialize_osb(),
 	 * we free it here.
 	 */
 	kfree(osb->journal);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0f22fdaf54a..29db72203bd 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1221,7 +1221,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
 		inode_set_bytes(inode,
 				to_real_used_space(inode, inode->i_blocks,
 						   SD_V2_SIZE));
-		/* read persistent inode attributes from sd and initalise
+		/* read persistent inode attributes from sd and initialise
 		   generic inode flags from them */
 		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
 		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
diff --git a/lib/random32.c b/lib/random32.c
index 217d5c4b666..556d5ffe110 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -131,7 +131,7 @@ core_initcall(random32_init);
 
 /*
  *	Generate better values after random number generator
- *	is fully initalized.
+ *	is fully initialized.
  */
 static int __init random32_reseed(void)
 {
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 94a45213faa..9323f894419 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -11,7 +11,7 @@
  * Changes:
  *     Martin Hamilton         :    fixed the terrible locking bugs
  *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
- *     Wensong Zhang           :    fixed the uninitilized tbl->lock bug
+ *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
  *     Wensong Zhang           :    added doing full expiration check to
  *                                   collect stale entries of 24+ hours when
  *                                   no partial expire check in a half hour
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 535dc2b419d..dbeed8ea421 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -386,7 +386,7 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
 		ip_vs_addr_copy(dest->af, &en->addr, daddr);
 		en->lastuse = jiffies;
 
-		/* initilize its dest set */
+		/* initialize its dest set */
 		atomic_set(&(en->set.size), 0);
 		INIT_LIST_HEAD(&en->set.list);
 		rwlock_init(&en->set.lock);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index e41feff19e4..0b85e525643 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -172,7 +172,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
 		(unsigned long)sp->autoclose * HZ;
 
-	/* Initilizes the timers */
+	/* Initializes the timers */
 	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
 		setup_timer(&asoc->timers[i], sctp_timer_events[i],
 				(unsigned long)asoc);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 182749867c7..0f41b05bd4d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1162,7 +1162,7 @@ SCTP_STATIC __init int sctp_init(void)
 	/* Set the pressure threshold to be a fraction of global memory that
 	 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
 	 * memory, with a floor of 128 pages.
-	 * Note this initalizes the data in sctpv6_prot too
+	 * Note this initializes the data in sctpv6_prot too
 	 * Unabashedly stolen from tcp_init
 	 */
 	nr_pages = totalram_pages - totalhigh_pages;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 0f2fc480fc6..276bdc7325e 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3227,7 +3227,7 @@ static __init int smack_init(void)
 	cred = (struct cred *) current->cred;
 	cred->security = &smack_known_floor.smk_known;
 
-	/* initilize the smack_know_list */
+	/* initialize the smack_know_list */
 	init_smack_know_list();
 	/*
 	 * Initialize locks
diff --git a/sound/pci/trident/trident_main.c b/sound/pci/trident/trident_main.c
index 6d943f6f6b7..2870a4fdc13 100644
--- a/sound/pci/trident/trident_main.c
+++ b/sound/pci/trident/trident_main.c
@@ -1055,7 +1055,7 @@ static int snd_trident_capture_prepare(struct snd_pcm_substream *substream)
 
 	spin_lock_irq(&trident->reg_lock);
 
-	// Initilize the channel and set channel Mode
+	// Initialize the channel and set channel Mode
 	outb(0, TRID_REG(trident, LEGACY_DMAR15));
 
 	// Set DMA channel operation mode register
diff --git a/sound/soc/fsl/mpc8610_hpcd.c b/sound/soc/fsl/mpc8610_hpcd.c
index 83de1c81c8c..604a91fa31b 100644
--- a/sound/soc/fsl/mpc8610_hpcd.c
+++ b/sound/soc/fsl/mpc8610_hpcd.c
@@ -46,7 +46,7 @@ struct mpc8610_hpcd_data {
 };
 
 /**
- * mpc8610_hpcd_machine_probe: initalize the board
+ * mpc8610_hpcd_machine_probe: initialize the board
  *
  * This function is called when platform_device_add() is called.  It is used
  * to initialize the board-specific hardware.
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 998569d6033..e048e091009 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -1307,7 +1307,7 @@ cpu_dai_err:
 }
 
 /*
- * Attempt to initialise any uninitalised cards.  Must be called with
+ * Attempt to initialise any uninitialised cards.  Must be called with
  * client_mutex.
  */
 static void snd_soc_instantiate_cards(void)
-- 
cgit v1.2.3-70-g09d2


From 23016bf0d25d62c45d8b8f61d55b290d704f7a79 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Thu, 3 Jun 2010 23:22:28 -0400
Subject: x86: Look for IA32_ENERGY_PERF_BIAS support

The new IA32_ENERGY_PERF_BIAS MSR allows system software to give
hardware a hint whether OS policy favors more power saving,
or more performance.  This allows the OS to have some influence
on internal hardware power/performance tradeoffs where the OS
has previously had no influence.

The support for this feature is indicated by CPUID.06H.ECX.bit3,
as documented in the Intel Architectures Software Developer's Manual.

This patch discovers support of this feature and displays it
as "epb" in /proc/cpuinfo.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
LKML-Reference: <alpine.LFD.2.00.1006032310160.6669@localhost.localdomain>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/cpufeature.h          | 1 +
 arch/x86/include/asm/msr-index.h           | 2 ++
 arch/x86/kernel/cpu/addon_cpuid_features.c | 1 +
 3 files changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 46814591438..2a904f4071f 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -162,6 +162,7 @@
 #define X86_FEATURE_IDA		(7*32+ 0) /* Intel Dynamic Acceleration */
 #define X86_FEATURE_ARAT	(7*32+ 1) /* Always Running APIC Timer */
 #define X86_FEATURE_CPB		(7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB		(7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
 
 /* Virtualization flags: Linux defined */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b49d8ca228f..e57bc20683d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -238,6 +238,8 @@
 
 #define MSR_IA32_TEMPERATURE_TARGET	0x000001a2
 
+#define MSR_IA32_ENERGY_PERF_BIAS	0x000001b0
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING	(1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC		(1ULL << 1)
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 10fa5684a66..7369b4c2c55 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -33,6 +33,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		{ X86_FEATURE_IDA,   		CR_EAX, 1, 0x00000006 },
 		{ X86_FEATURE_ARAT,  		CR_EAX, 2, 0x00000006 },
 		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006 },
+		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006 },
 		{ X86_FEATURE_CPB,   		CR_EDX, 9, 0x80000007 },
 		{ X86_FEATURE_NPT,   		CR_EDX, 0, 0x8000000a },
 		{ X86_FEATURE_LBRV,  		CR_EDX, 1, 0x8000000a },
-- 
cgit v1.2.3-70-g09d2


From c882e0feb937af4e5b991cbd1c81536f37053e86 Mon Sep 17 00:00:00 2001
From: Robert Schöne <robert.schoene@tu-dresden.de>
Date: Mon, 14 Jun 2010 13:37:20 +0200
Subject: x86, perf: Add power_end event to process_*.c cpu_idle routine

Systems using the idle thread from process_32.c and process_64.c
do not generate power_end events which could be traced using
perf. This patch adds the event generation for such systems.

Signed-off-by: Robert Schoene <robert.schoene@tu-dresden.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1276515440.5441.45.camel@localhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process_32.c | 4 ++++
 arch/x86/kernel/process_64.c | 5 +++++
 2 files changed, 9 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af4..96586c3cbbb 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,8 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 
+#include <trace/events/power.h>
+
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
 /*
@@ -111,6 +113,8 @@ void cpu_idle(void)
 			stop_critical_timings();
 			pm_idle();
 			start_critical_timings();
+
+			trace_power_end(smp_processor_id());
 		}
 		tick_nohz_restart_sched_tick();
 		preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a99f1..3d9ea531ddd 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,8 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 
+#include <trace/events/power.h>
+
 asmlinkage extern void ret_from_fork(void);
 
 DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -138,6 +140,9 @@ void cpu_idle(void)
 			stop_critical_timings();
 			pm_idle();
 			start_critical_timings();
+
+			trace_power_end(smp_processor_id());
+
 			/* In many cases the interrupt that ended idle
 			   has already called exit_idle. But some idle
 			   loops can be woken up without interrupt. */
-- 
cgit v1.2.3-70-g09d2


From fd699c76552bbfa66631f019be415a87dbb08237 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Fri, 18 Jun 2010 17:46:53 -0400
Subject: x86, olpc: Add support for calling into OpenFirmware

Add support for saving OFW's cif, and later calling into it to run OFW
commands.  OFW remains resident in memory, living within virtual range
0xff800000 - 0xffc00000.  A single page directory entry points to the
pgdir that OFW actually uses, so rather than saving the entire page
table, we grab and install that one entry permanently in the kernel's
page table.

This is currently only used by the OLPC XO.  Note that this particular
calling convention breaks PAE and PAT, and so cannot be used on newer
x86 hardware.

Signed-off-by: Andres Salomon <dilinger@queued.net>
LKML-Reference: <20100618174653.7755a39a@dev.queued.net>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 Documentation/x86/zero-page.txt  |   1 +
 arch/x86/Kconfig                 |   9 ++++
 arch/x86/include/asm/bootparam.h |  11 ++++-
 arch/x86/include/asm/olpc_ofw.h  |  31 ++++++++++++
 arch/x86/kernel/Makefile         |   1 +
 arch/x86/kernel/head_32.S        |   6 +++
 arch/x86/kernel/olpc.c           |  12 ++---
 arch/x86/kernel/olpc_ofw.c       | 104 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup.c          |   6 +++
 9 files changed, 173 insertions(+), 8 deletions(-)
 create mode 100644 arch/x86/include/asm/olpc_ofw.h
 create mode 100644 arch/x86/kernel/olpc_ofw.c

(limited to 'arch/x86/kernel')

diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index feb37e17701..cf5437deda8 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -18,6 +18,7 @@ Offset	Proto	Name		Meaning
 080/010	ALL	hd0_info	hd0 disk parameter, OBSOLETE!!
 090/010	ALL	hd1_info	hd1 disk parameter, OBSOLETE!!
 0A0/010	ALL	sys_desc_table	System description table (struct sys_desc_table)
+0B0/010	ALL	olpc_ofw_header	OLPC's OpenFirmware CIF and friends
 140/080	ALL	edid_info	Video mode setup (struct edid_info)
 1C0/020	ALL	efi_info	EFI 32 information (struct efi_info)
 1E0/004	ALL	alk_mem_k	Alternative mem check, in KB
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dcb0593b4a6..71c194db2e0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2062,6 +2062,15 @@ config OLPC
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
 
+config OLPC_OPENFIRMWARE
+	bool "Support for OLPC's Open Firmware"
+	depends on !X86_64 && !X86_PAE
+	default y if OLPC
+	help
+	  This option adds support for the implementation of Open Firmware
+	  that is used on the OLPC XO-1 Children's Machine.
+	  If unsure, say N here.
+
 endif # X86_32
 
 config K8_NB
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 6be33d83c71..8e6218550e7 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -70,6 +70,14 @@ struct sys_desc_table {
 	__u8  table[14];
 };
 
+/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */
+struct olpc_ofw_header {
+	__u32 ofw_magic;	/* OFW signature */
+	__u32 ofw_version;
+	__u32 cif_handler;	/* callback into OFW */
+	__u32 irq_desc_table;
+} __attribute__((packed));
+
 struct efi_info {
 	__u32 efi_loader_signature;
 	__u32 efi_systab;
@@ -92,7 +100,8 @@ struct boot_params {
 	__u8  hd0_info[16];	/* obsolete! */		/* 0x080 */
 	__u8  hd1_info[16];	/* obsolete! */		/* 0x090 */
 	struct sys_desc_table sys_desc_table;		/* 0x0a0 */
-	__u8  _pad4[144];				/* 0x0b0 */
+	struct olpc_ofw_header olpc_ofw_header;		/* 0x0b0 */
+	__u8  _pad4[128];				/* 0x0c0 */
 	struct edid_info edid_info;			/* 0x140 */
 	struct efi_info efi_info;			/* 0x1c0 */
 	__u32 alt_mem_k;				/* 0x1e0 */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
new file mode 100644
index 00000000000..3e63d857c48
--- /dev/null
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_OLPC_OFW_H
+#define _ASM_X86_OLPC_OFW_H
+
+/* index into the page table containing the entry OFW occupies */
+#define OLPC_OFW_PDE_NR 1022
+
+#define OLPC_OFW_SIG 0x2057464F	/* aka "OFW " */
+
+#ifdef CONFIG_OLPC_OPENFIRMWARE
+
+/* run an OFW command by calling into the firmware */
+#define olpc_ofw(name, args, res) \
+	__olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
+
+extern int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res,
+		void **res);
+
+/* determine whether OFW is available and lives in the proper memory */
+extern void olpc_ofw_detect(void);
+
+/* install OFW's pde permanently into the kernel's pgtable */
+extern void setup_olpc_ofw_pgd(void);
+
+#else /* !CONFIG_OLPC_OPENFIRMWARE */
+
+static inline void olpc_ofw_detect(void) { }
+static inline void setup_olpc_ofw_pgd(void) { }
+
+#endif /* !CONFIG_OLPC_OPENFIRMWARE */
+
+#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b2208372..0925676266b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200)		+= scx200.o
 scx200-y			+= scx200_32.o
 
 obj-$(CONFIG_OLPC)		+= olpc.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE)	+= olpc_ofw.o
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 
 microcode-y				:= microcode_core.o
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d8..ff4c453e13f 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
 	movsl
 1:
 
+#ifdef CONFIG_OLPC_OPENFIRMWARE
+	/* save OFW's pgdir table for later use when calling into OFW */
+	movl %cr3, %eax
+	movl %eax, pa(olpc_ofw_pgd)
+#endif
+
 #ifdef CONFIG_PARAVIRT
 	/* This is can only trip for a broken bootloader... */
 	cmpw $0x207, pa(boot_params + BP_version)
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 8297160c41b..156605281f5 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -21,10 +21,7 @@
 #include <asm/geode.h>
 #include <asm/setup.h>
 #include <asm/olpc.h>
-
-#ifdef CONFIG_OPEN_FIRMWARE
-#include <asm/ofw.h>
-#endif
+#include <asm/olpc_ofw.h>
 
 struct olpc_platform_t olpc_platform_info;
 EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -188,14 +185,15 @@ err:
 }
 EXPORT_SYMBOL_GPL(olpc_ec_cmd);
 
-#ifdef CONFIG_OPEN_FIRMWARE
+#ifdef CONFIG_OLPC_OPENFIRMWARE
 static void __init platform_detect(void)
 {
 	size_t propsize;
 	__be32 rev;
+	void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
+	void *res[] = { &propsize };
 
-	if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
-			&propsize) || propsize != 4) {
+	if (olpc_ofw("getprop", args, res) || propsize != 4) {
 		printk(KERN_ERR "ofw: getprop call failed!\n");
 		rev = cpu_to_be32(0);
 	}
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
new file mode 100644
index 00000000000..469ee438429
--- /dev/null
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -0,0 +1,104 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/page.h>
+#include <asm/setup.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/olpc_ofw.h>
+
+/* address of OFW callback interface; will be NULL if OFW isn't found */
+static int (*olpc_ofw_cif)(int *);
+
+/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
+u32 olpc_ofw_pgd __initdata;
+
+static DEFINE_SPINLOCK(ofw_lock);
+
+#define MAXARGS 10
+
+void __init setup_olpc_ofw_pgd(void)
+{
+	pgd_t *base, *ofw_pde;
+
+	if (!olpc_ofw_cif)
+		return;
+
+	/* fetch OFW's PDE */
+	base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+	if (!base) {
+		printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
+		olpc_ofw_cif = NULL;
+		return;
+	}
+	ofw_pde = &base[OLPC_OFW_PDE_NR];
+
+	/* install OFW's PDE permanently into the kernel's pgtable */
+	set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
+	early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+}
+
+int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res,
+		void **res)
+{
+	int ofw_args[MAXARGS + 3];
+	unsigned long flags;
+	int ret, i, *p;
+
+	BUG_ON(nr_args + nr_res > MAXARGS);
+
+	if (!olpc_ofw_cif)
+		return -EIO;
+
+	ofw_args[0] = (int)name;
+	ofw_args[1] = nr_args;
+	ofw_args[2] = nr_res;
+
+	p = &ofw_args[3];
+	for (i = 0; i < nr_args; i++, p++)
+		*p = (int)args[i];
+
+	/* call into ofw */
+	spin_lock_irqsave(&ofw_lock, flags);
+	ret = olpc_ofw_cif(ofw_args);
+	spin_unlock_irqrestore(&ofw_lock, flags);
+
+	if (!ret) {
+		for (i = 0; i < nr_res; i++, p++)
+			*((int *)res[i]) = *p;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__olpc_ofw);
+
+/* OFW cif _should_ be above this address */
+#define OFW_MIN 0xff000000
+
+/* OFW starts on a 1MB boundary */
+#define OFW_BOUND (1<<20)
+
+void __init olpc_ofw_detect(void)
+{
+	struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
+	unsigned long start;
+
+	/* ensure OFW booted us by checking for "OFW " string */
+	if (hdr->ofw_magic != OLPC_OFW_SIG)
+		return;
+
+	olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
+
+	if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
+		printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
+				(unsigned long)olpc_ofw_cif);
+		olpc_ofw_cif = NULL;
+		return;
+	}
+
+	/* determine where OFW starts in memory */
+	start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
+	printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
+			(unsigned long)olpc_ofw_cif, (-start) >> 20);
+	reserve_top_address(-start);
+}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4ae4acbd03..b008e788320 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -102,6 +102,7 @@
 
 #include <asm/paravirt.h>
 #include <asm/hypervisor.h>
+#include <asm/olpc_ofw.h>
 
 #include <asm/percpu.h>
 #include <asm/topology.h>
@@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p)
 	/* VMI may relocate the fixmap; do this before touching ioremap area */
 	vmi_init();
 
+	/* OFW also may relocate the fixmap */
+	olpc_ofw_detect();
+
 	early_trap_init();
 	early_cpu_init();
 	early_ioremap_init();
 
+	setup_olpc_ofw_pgd();
+
 	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
 	screen_info = boot_params.screen_info;
 	edid_info = boot_params.edid_info;
-- 
cgit v1.2.3-70-g09d2


From 75a9cac430a1bd2a5219c74508ca01b0ddfddc9a Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Wed, 23 Jun 2010 20:27:00 -0400
Subject: x86, olpc: Add comment about implicit optimization barrier

Signed-off-by: Andres Salomon <dilinger@queued.net>
Cc: H. Peter Anvin <hpa@linux.intel.com>
LKML-Reference: <20100618174653.7755a39a@dev.queued.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/olpc_ofw.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
index 469ee438429..f5d499fbe74 100644
--- a/arch/x86/kernel/olpc_ofw.c
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -35,6 +35,8 @@ void __init setup_olpc_ofw_pgd(void)
 
 	/* install OFW's PDE permanently into the kernel's pgtable */
 	set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
+	/* implicit optimization barrier here due to uninline function return */
+
 	early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 0c4519e825c9e2b6a8310deff8582f8c35bfbba9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 24 Jun 2010 21:21:27 +0200
Subject: x86: Set resume bit before returning from breakpoint exception

Instruction breakpoints trigger before the instruction executes,
and returning from the breakpoint handler brings us back to the
instruction that triggered the breakpoint. This naturally leads
to breakpoint recursion.

To solve this, x86 has the Resume Bit trick. When the cpu flags
have the RF flag set, the next instruction won't trigger any
instruction breakpoint, and once this instruction is executed,
RF is cleared back.

This lets us jump back to the instruction that triggered the
breakpoint without recursion.

Use this when an instruction breakpoint triggers.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/hw_breakpoint.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a8f1b803d2f..eaa6ae2a010 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -466,6 +466,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 
 		perf_bp_event(bp, args->regs);
 
+		/*
+		 * Set up resume flag to avoid breakpoint recursion when
+		 * returning back to origin.
+		 */
+		if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
+			args->regs->flags |= X86_EFLAGS_RF;
+
 		rcu_read_unlock();
 	}
 	/*
-- 
cgit v1.2.3-70-g09d2


From f7809daf64bf119fef70af172db6a0636fa51f92 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 24 Jun 2010 10:00:24 +0200
Subject: x86: Support for instruction breakpoints

Instruction breakpoints need to have a specific length of 0 in
order to work. Add this support, but also make sure the user is
not trying to set an unsupported length, such as a range
breakpoint.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/include/asm/hw_breakpoint.h |  2 +-
 arch/x86/kernel/hw_breakpoint.c      | 44 ++++++++++++++++++++++++------------
 2 files changed, 30 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 942255310e6..528a11e8d3e 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -20,10 +20,10 @@ struct arch_hw_breakpoint {
 #include <linux/list.h>
 
 /* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_X		0x00
 #define X86_BREAKPOINT_LEN_1		0x40
 #define X86_BREAKPOINT_LEN_2		0x44
 #define X86_BREAKPOINT_LEN_4		0x4c
-#define X86_BREAKPOINT_LEN_EXECUTE	0x40
 
 #ifdef CONFIG_X86_64
 #define X86_BREAKPOINT_LEN_8		0x48
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index eaa6ae2a010..a474ec37c32 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
 {
 	/* Len */
 	switch (x86_len) {
+	case X86_BREAKPOINT_LEN_X:
+		*gen_len = sizeof(long);
+		break;
 	case X86_BREAKPOINT_LEN_1:
 		*gen_len = HW_BREAKPOINT_LEN_1;
 		break;
@@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp)
 
 	info->address = bp->attr.bp_addr;
 
+	/* Type */
+	switch (bp->attr.bp_type) {
+	case HW_BREAKPOINT_W:
+		info->type = X86_BREAKPOINT_WRITE;
+		break;
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		info->type = X86_BREAKPOINT_RW;
+		break;
+	case HW_BREAKPOINT_X:
+		info->type = X86_BREAKPOINT_EXECUTE;
+		/*
+		 * x86 inst breakpoints need to have a specific undefined len.
+		 * But we still need to check userspace is not trying to setup
+		 * an unsupported length, to get a range breakpoint for example.
+		 */
+		if (bp->attr.bp_len == sizeof(long)) {
+			info->len = X86_BREAKPOINT_LEN_X;
+			return 0;
+		}
+	default:
+		return -EINVAL;
+	}
+
 	/* Len */
 	switch (bp->attr.bp_len) {
 	case HW_BREAKPOINT_LEN_1:
@@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp)
 		return -EINVAL;
 	}
 
-	/* Type */
-	switch (bp->attr.bp_type) {
-	case HW_BREAKPOINT_W:
-		info->type = X86_BREAKPOINT_WRITE;
-		break;
-	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
-		info->type = X86_BREAKPOINT_RW;
-		break;
-	case HW_BREAKPOINT_X:
-		info->type = X86_BREAKPOINT_EXECUTE;
-		break;
-	default:
-		return -EINVAL;
-	}
-
 	return 0;
 }
 /*
@@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 	ret = -EINVAL;
 
 	switch (info->len) {
+	case X86_BREAKPOINT_LEN_X:
+		align = sizeof(long) -1;
+		break;
 	case X86_BREAKPOINT_LEN_1:
 		align = 0;
 		break;
-- 
cgit v1.2.3-70-g09d2


From b71ab8c2025caef8db719aa41af0ed735dc543cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:14 +0200
Subject: workqueue: increase max_active of keventd and kill
 current_is_keventd()

Define WQ_MAX_ACTIVE and create keventd with max_active set to half of
it which means that keventd now can process up to WQ_MAX_ACTIVE / 2 - 1
works concurrently.  Unless some combination can result in a dependency
loop longer than max_active, deadlock won't happen and thus it's
unnecessary to check whether current_is_keventd() before trying to
schedule a work.  Kill current_is_keventd().

(Lockdep annotations are broken.  We need lock_map_acquire_read_norecurse())

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 arch/ia64/kernel/smpboot.c |  2 +-
 arch/x86/kernel/smpboot.c  |  2 +-
 include/linux/workqueue.h  |  4 ++-
 kernel/workqueue.c         | 63 ++++++++++------------------------------------
 4 files changed, 18 insertions(+), 53 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 6a1380e90f8..99dcc85193c 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -519,7 +519,7 @@ do_boot_cpu (int sapicid, int cpu)
 	/*
 	 * We can't use kernel_thread since we must avoid to reschedule the child.
 	 */
-	if (!keventd_up() || current_is_keventd())
+	if (!keventd_up())
 		c_idle.work.func(&c_idle.work);
 	else {
 		schedule_work(&c_idle.work);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d..4d90f376e98 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -735,7 +735,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 		goto do_rest;
 	}
 
-	if (!keventd_up() || current_is_keventd())
+	if (!keventd_up())
 		c_idle.work.func(&c_idle.work);
 	else {
 		schedule_work(&c_idle.work);
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index b8f4ec45c40..33e24e734d5 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -227,6 +227,9 @@ enum {
 	WQ_SINGLE_CPU		= 1 << 1, /* only single cpu at a time */
 	WQ_NON_REENTRANT	= 1 << 2, /* guarantee non-reentrance */
 	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
+
+	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
+	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
 };
 
 extern struct workqueue_struct *
@@ -280,7 +283,6 @@ extern int schedule_delayed_work(struct delayed_work *work, unsigned long delay)
 extern int schedule_delayed_work_on(int cpu, struct delayed_work *work,
 					unsigned long delay);
 extern int schedule_on_each_cpu(work_func_t func);
-extern int current_is_keventd(void);
 extern int keventd_up(void);
 
 extern void init_workqueues(void);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ad46523b42..4190e84cf99 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2398,7 +2398,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
 	int cpu;
-	int orig = -1;
 	struct work_struct *works;
 
 	works = alloc_percpu(struct work_struct);
@@ -2407,23 +2406,12 @@ int schedule_on_each_cpu(work_func_t func)
 
 	get_online_cpus();
 
-	/*
-	 * When running in keventd don't schedule a work item on
-	 * itself.  Can just call directly because the work queue is
-	 * already bound.  This also is faster.
-	 */
-	if (current_is_keventd())
-		orig = raw_smp_processor_id();
-
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
-		if (cpu != orig)
-			schedule_work_on(cpu, work);
+		schedule_work_on(cpu, work);
 	}
-	if (orig >= 0)
-		func(per_cpu_ptr(works, orig));
 
 	for_each_online_cpu(cpu)
 		flush_work(per_cpu_ptr(works, cpu));
@@ -2494,41 +2482,6 @@ int keventd_up(void)
 	return keventd_wq != NULL;
 }
 
-int current_is_keventd(void)
-{
-	bool found = false;
-	unsigned int cpu;
-
-	/*
-	 * There no longer is one-to-one relation between worker and
-	 * work queue and a worker task might be unbound from its cpu
-	 * if the cpu was offlined.  Match all busy workers.  This
-	 * function will go away once dynamic pool is implemented.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker *worker;
-		struct hlist_node *pos;
-		unsigned long flags;
-		int i;
-
-		spin_lock_irqsave(&gcwq->lock, flags);
-
-		for_each_busy_worker(worker, i, pos, gcwq) {
-			if (worker->task == current) {
-				found = true;
-				break;
-			}
-		}
-
-		spin_unlock_irqrestore(&gcwq->lock, flags);
-		if (found)
-			break;
-	}
-
-	return found;
-}
-
 static struct cpu_workqueue_struct *alloc_cwqs(void)
 {
 	/*
@@ -2576,6 +2529,16 @@ static void free_cwqs(struct cpu_workqueue_struct *cwqs)
 #endif
 }
 
+static int wq_clamp_max_active(int max_active, const char *name)
+{
+	if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
+		printk(KERN_WARNING "workqueue: max_active %d requested for %s "
+		       "is out of range, clamping between %d and %d\n",
+		       max_active, name, 1, WQ_MAX_ACTIVE);
+
+	return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
+}
+
 struct workqueue_struct *__create_workqueue_key(const char *name,
 						unsigned int flags,
 						int max_active,
@@ -2585,7 +2548,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	struct workqueue_struct *wq;
 	unsigned int cpu;
 
-	max_active = clamp_val(max_active, 1, INT_MAX);
+	max_active = wq_clamp_max_active(max_active, name);
 
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
@@ -3324,6 +3287,6 @@ void __init init_workqueues(void)
 		spin_unlock_irq(&gcwq->lock);
 	}
 
-	keventd_wq = create_workqueue("events");
+	keventd_wq = __create_workqueue("events", 0, WQ_DFL_ACTIVE);
 	BUG_ON(!keventd_wq);
 }
-- 
cgit v1.2.3-70-g09d2


From 567a9fd86735ccdc897768ed2dacdd5e83a13509 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Tue, 29 Jun 2010 14:53:50 +0900
Subject: kprobes/x86: Fix kprobes to skip prefixes correctly

Fix resume_execution() and is_IF_modifier() to skip x86
instruction prefixes correctly by using x86 instruction
attribute.

Without this fix, resume_execution() can't handle instructions
which have non-REX prefixes (REX prefixes are skipped). This
will cause unexpected kernel panic by hitting bad address when a
kprobe hits on two-byte ret (e.g. "repz ret" generated for
Athlon/K8 optimization), because it just checks "repz" and can't
recognize the "ret" instruction.

These prefixes can be found easily with x86 instruction
attribute. This patch introduces skip_prefixes() and uses it in
resume_execution() and is_IF_modifier() to skip prefixes.

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
LKML-Reference: <4C298A6E.8070609@hitachi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 345a4b1fe14..175f85ceace 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
 }
 
 /*
- * Check for the REX prefix which can only exist on X86_64
- * X86_32 always returns 0
+ * Skip the prefixes of the instruction.
  */
-static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
+static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
 {
+	insn_attr_t attr;
+
+	attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+	while (inat_is_legacy_prefix(attr)) {
+		insn++;
+		attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+	}
 #ifdef CONFIG_X86_64
-	if ((*insn & 0xf0) == 0x40)
-		return 1;
+	if (inat_is_rex_prefix(attr))
+		insn++;
 #endif
-	return 0;
+	return insn;
 }
 
 /*
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr)
  */
 static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 {
+	/* Skip prefixes */
+	insn = skip_prefixes(insn);
+
 	switch (*insn) {
 	case 0xfa:		/* cli */
 	case 0xfb:		/* sti */
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 		return 1;
 	}
 
-	/*
-	 * on X86_64, 0x40-0x4f are REX prefixes so we need to look
-	 * at the next byte instead.. but of course not recurse infinitely
-	 */
-	if (is_REX_prefix(insn))
-		return is_IF_modifier(++insn);
-
 	return 0;
 }
 
@@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p,
 	unsigned long orig_ip = (unsigned long)p->addr;
 	kprobe_opcode_t *insn = p->ainsn.insn;
 
-	/*skip the REX prefix*/
-	if (is_REX_prefix(insn))
-		insn++;
+	/* Skip prefixes */
+	insn = skip_prefixes(insn);
 
 	regs->flags &= ~X86_EFLAGS_TF;
 	switch (*insn) {
-- 
cgit v1.2.3-70-g09d2


From 39ef13a4ac28aa64cfe1bc36e6e00f1096707a28 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 5 Jul 2010 10:09:29 +0800
Subject: perf, x86: P4 PMU -- redesign cache events

To support cache events we have reserved the low 6 bits in
hw_perf_event::config (which is a part of CCCR register
configuration actually).

These bits represent the Replay Event metric enumerated in
enum P4_PEBS_METRIC. The caller should not care about
which exact bits should be set and how -- the caller
just chooses one P4_PEBS_METRIC entity and puts it into
the config. The kernel will track it and set the appropriate
additional MSR registers (metrics) when needed.

The reason for this redesign was the PEBS enable bit, which
should not be set until DS (and PEBS sampling) support is
implemented properly.

TODO
====

 - PEBS sampling (note it's tricky and works with _one_ counter only
   so for HT machines it will be not that easy to handle both threads)

 - tracking of PEBS registers state, a user might need to turn
   PEBS off completely (ie no PEBS enable, no UOP_tag) but some
   other event may need it; such events clash and should not
   run simultaneously, at the moment we just don't support
   such events

 - eventually export user space bits in separate header which will
   allow user apps to configure raw events more conveniently.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1278295769.9540.15.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h |  99 ++++++++++++-----------
 arch/x86/kernel/cpu/perf_event_p4.c  | 147 ++++++++++++++++++++++++++---------
 2 files changed, 163 insertions(+), 83 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 64a8ebff06f..def500776b1 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -19,7 +19,6 @@
 #define ARCH_P4_RESERVED_ESCR	(2) /* IQ_ESCR(0,1) not always present */
 #define ARCH_P4_MAX_ESCR	(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
 #define ARCH_P4_MAX_CCCR	(18)
-#define ARCH_P4_MAX_COUNTER	(ARCH_P4_MAX_CCCR / 2)
 
 #define P4_ESCR_EVENT_MASK	0x7e000000U
 #define P4_ESCR_EVENT_SHIFT	25
@@ -71,10 +70,6 @@
 #define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
 #define P4_CCCR_ESEL(v)			((v) << P4_CCCR_ESCR_SELECT_SHIFT)
 
-/* Custom bits in reerved CCCR area */
-#define P4_CCCR_CACHE_OPS_MASK		0x0000003fU
-
-
 /* Non HT mask */
 #define P4_CCCR_MASK				\
 	(P4_CCCR_OVF			|	\
@@ -106,8 +101,7 @@
  * ESCR and CCCR but rather an only packed value should
  * be unpacked and written to a proper addresses
  *
- * the base idea is to pack as much info as
- * possible
+ * the base idea is to pack as much info as possible
  */
 #define p4_config_pack_escr(v)		(((u64)(v)) << 32)
 #define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
@@ -130,8 +124,6 @@
 		t;					\
 	})
 
-#define p4_config_unpack_cache_event(v)	(((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
-
 #define P4_CONFIG_HT_SHIFT		63
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
@@ -214,6 +206,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 	return escr;
 }
 
+/*
+ * This are the events which should be used in "Event Select"
+ * field of ESCR register, they are like unique keys which allow
+ * the kernel to determinate which CCCR and COUNTER should be
+ * used to track an event
+ */
 enum P4_EVENTS {
 	P4_EVENT_TC_DELIVER_MODE,
 	P4_EVENT_BPU_FETCH_REQUEST,
@@ -561,7 +559,7 @@ enum P4_EVENT_OPCODES {
  * a caller should use P4_ESCR_EMASK_NAME helper to
  * pick the EventMask needed, for example
  *
- *	P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
+ *	P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)
  */
 enum P4_ESCR_EMASKS {
 	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
@@ -753,43 +751,50 @@ enum P4_ESCR_EMASKS {
 	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
 };
 
-/* P4 PEBS: stale for a while */
-#define P4_PEBS_METRIC_MASK	0x00001fffU
-#define P4_PEBS_UOB_TAG		0x01000000U
-#define P4_PEBS_ENABLE		0x02000000U
-
-/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
-#define P4_PEBS__1stl_cache_load_miss_retired	0x3000001
-#define P4_PEBS__2ndl_cache_load_miss_retired	0x3000002
-#define P4_PEBS__dtlb_load_miss_retired		0x3000004
-#define P4_PEBS__dtlb_store_miss_retired	0x3000004
-#define P4_PEBS__dtlb_all_miss_retired		0x3000004
-#define P4_PEBS__tagged_mispred_branch		0x3018000
-#define P4_PEBS__mob_load_replay_retired	0x3000200
-#define P4_PEBS__split_load_retired		0x3000400
-#define P4_PEBS__split_store_retired		0x3000400
-
-#define P4_VERT__1stl_cache_load_miss_retired	0x0000001
-#define P4_VERT__2ndl_cache_load_miss_retired	0x0000001
-#define P4_VERT__dtlb_load_miss_retired		0x0000001
-#define P4_VERT__dtlb_store_miss_retired	0x0000002
-#define P4_VERT__dtlb_all_miss_retired		0x0000003
-#define P4_VERT__tagged_mispred_branch		0x0000010
-#define P4_VERT__mob_load_replay_retired	0x0000001
-#define P4_VERT__split_load_retired		0x0000001
-#define P4_VERT__split_store_retired		0x0000002
-
-enum P4_CACHE_EVENTS {
-	P4_CACHE__NONE,
-
-	P4_CACHE__1stl_cache_load_miss_retired,
-	P4_CACHE__2ndl_cache_load_miss_retired,
-	P4_CACHE__dtlb_load_miss_retired,
-	P4_CACHE__dtlb_store_miss_retired,
-	P4_CACHE__itlb_reference_hit,
-	P4_CACHE__itlb_reference_miss,
-
-	P4_CACHE__MAX
+/*
+ * P4 PEBS specifics (Replay Event only)
+ *
+ * Format (bits):
+ *   0-6: metric from P4_PEBS_METRIC enum
+ *    7 : reserved
+ *    8 : reserved
+ * 9-11 : reserved
+ *
+ * Note we have UOP and PEBS bits reserved for now
+ * just in case if we will need them once
+ */
+#define P4_PEBS_CONFIG_ENABLE		(1 << 7)
+#define P4_PEBS_CONFIG_UOP_TAG		(1 << 8)
+#define P4_PEBS_CONFIG_METRIC_MASK	0x3f
+#define P4_PEBS_CONFIG_MASK		0xff
+
+/*
+ * mem: Only counters MSR_IQ_COUNTER4 (16) and
+ * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling
+ */
+#define P4_PEBS_ENABLE			0x02000000U
+#define P4_PEBS_ENABLE_UOP_TAG		0x01000000U
+
+#define p4_config_unpack_metric(v)	(((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK)
+#define p4_config_unpack_pebs(v)	(((u64)(v)) & P4_PEBS_CONFIG_MASK)
+
+#define p4_config_pebs_has(v, mask)	(p4_config_unpack_pebs(v) & (mask))
+
+enum P4_PEBS_METRIC {
+	P4_PEBS_METRIC__none,
+
+	P4_PEBS_METRIC__1stl_cache_load_miss_retired,
+	P4_PEBS_METRIC__2ndl_cache_load_miss_retired,
+	P4_PEBS_METRIC__dtlb_load_miss_retired,
+	P4_PEBS_METRIC__dtlb_store_miss_retired,
+	P4_PEBS_METRIC__dtlb_all_miss_retired,
+	P4_PEBS_METRIC__tagged_mispred_branch,
+	P4_PEBS_METRIC__mob_load_replay_retired,
+	P4_PEBS_METRIC__split_load_retired,
+	P4_PEBS_METRIC__split_store_retired,
+
+	P4_PEBS_METRIC__max
 };
 
 #endif /* PERF_EVENT_P4_H */
+
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 9286e736a70..107711bf0ee 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -21,22 +21,36 @@ struct p4_event_bind {
 	char cntr[2][P4_CNTR_LIMIT];		/* counter index (offset), -1 on abscence */
 };
 
-struct p4_cache_event_bind {
+struct p4_pebs_bind {
 	unsigned int metric_pebs;
 	unsigned int metric_vert;
 };
 
-#define P4_GEN_CACHE_EVENT_BIND(name)		\
-	[P4_CACHE__##name] = {			\
-		.metric_pebs = P4_PEBS__##name,	\
-		.metric_vert = P4_VERT__##name,	\
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert)			\
+	[P4_PEBS_METRIC__##name] = {				\
+		.metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG,	\
+		.metric_vert = vert,				\
 	}
 
-static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
-	P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
-	P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
-	P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
-	P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
+/*
+ * note we have P4_PEBS_ENABLE_UOP_TAG always set here
+ *
+ * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
+ * event configuration to find out which values are to be
+ * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * resgisters
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+	P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired,	0x0000001, 0x0000001),
+	P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired,	0x0000002, 0x0000001),
+	P4_GEN_PEBS_BIND(dtlb_load_miss_retired,	0x0000004, 0x0000001),
+	P4_GEN_PEBS_BIND(dtlb_store_miss_retired,	0x0000004, 0x0000002),
+	P4_GEN_PEBS_BIND(dtlb_all_miss_retired,		0x0000004, 0x0000003),
+	P4_GEN_PEBS_BIND(tagged_mispred_branch,		0x0018000, 0x0000010),
+	P4_GEN_PEBS_BIND(mob_load_replay_retired,	0x0000200, 0x0000001),
+	P4_GEN_PEBS_BIND(split_load_retired,		0x0000400, 0x0000001),
+	P4_GEN_PEBS_BIND(split_store_retired,		0x0000400, 0x0000002),
 };
 
 /*
@@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = {
 	},
 };
 
-#define P4_GEN_CACHE_EVENT(event, bit, cache_event)			  \
+#define P4_GEN_CACHE_EVENT(event, bit, metric)				  \
 	p4_config_pack_escr(P4_ESCR_EVENT(event)			| \
 			    P4_ESCR_EMASK_BIT(event, bit))		| \
-	p4_config_pack_cccr(cache_event					| \
+	p4_config_pack_cccr(metric					| \
 			    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
 
 static __initconst const u64 p4_hw_cache_event_ids
@@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
 		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-						P4_CACHE__1stl_cache_load_miss_retired),
+						P4_PEBS_METRIC__1stl_cache_load_miss_retired),
 	},
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
 		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-						P4_CACHE__2ndl_cache_load_miss_retired),
+						P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
 	},
 },
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
 		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-						P4_CACHE__dtlb_load_miss_retired),
+						P4_PEBS_METRIC__dtlb_load_miss_retired),
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
 		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-						P4_CACHE__dtlb_store_miss_retired),
+						P4_PEBS_METRIC__dtlb_store_miss_retired),
 	},
  },
  [ C(ITLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
-						P4_CACHE__itlb_reference_hit),
+						P4_PEBS_METRIC__none),
 		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
-						P4_CACHE__itlb_reference_miss),
+						P4_PEBS_METRIC__none),
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event)
 	return config;
 }
 
+static int p4_validate_raw_event(struct perf_event *event)
+{
+	unsigned int v;
+
+	/* user data may have out-of-bound event index */
+	v = p4_config_unpack_event(event->attr.config);
+	if (v >= ARRAY_SIZE(p4_event_bind_map)) {
+		pr_warning("P4 PMU: Unknown event code: %d\n", v);
+		return -EINVAL;
+	}
+
+	/*
+	 * it may have some screwed PEBS bits
+	 */
+	if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
+		pr_warning("P4 PMU: PEBS are not supported yet\n");
+		return -EINVAL;
+	}
+	v = p4_config_unpack_metric(event->attr.config);
+	if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
+		pr_warning("P4 PMU: Unknown metric code: %d\n", v);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = get_cpu();
 	int rc = 0;
-	unsigned int evnt;
 	u32 escr, cccr;
 
 	/*
@@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event)
 
 	if (event->attr.type == PERF_TYPE_RAW) {
 
-		/* user data may have out-of-bound event index */
-		evnt = p4_config_unpack_event(event->attr.config);
-		if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
-			rc = -EINVAL;
+		rc = p4_validate_raw_event(event);
+		if (rc)
 			goto out;
-		}
 
 		/*
 		 * We don't control raw events so it's up to the caller
@@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event)
 		 * on HT machine but allow HT-compatible specifics to be
 		 * passed on)
 		 *
+		 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
+		 * bits since we keep additional info here (for cache events and etc)
+		 *
 		 * XXX: HT wide things should check perf_paranoid_cpu() &&
 		 *      CAP_SYS_ADMIN
 		 */
 		event->hw.config |= event->attr.config &
 			(p4_config_pack_escr(P4_ESCR_MASK_HT) |
-			 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+			 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
 	}
 
 	rc = x86_setup_perfctr(event);
@@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
 	return overflow;
 }
 
+static void p4_pmu_disable_pebs(void)
+{
+	/*
+	 * FIXME
+	 *
+	 * It's still allowed that two threads setup same cache
+	 * events so we can't simply clear metrics until we knew
+	 * noone is depending on us, so we need kind of counter
+	 * for "ReplayEvent" users.
+	 *
+	 * What is more complex -- RAW events, if user (for some
+	 * reason) will pass some cache event metric with improper
+	 * event opcode -- it's fine from hardware point of view
+	 * but completely nonsence from "meaning" of such action.
+	 *
+	 * So at moment let leave metrics turned on forever -- it's
+	 * ok for now but need to be revisited!
+	 *
+	 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
+	 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
+	 */
+}
+
 static inline void p4_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void)
 			continue;
 		p4_pmu_disable_event(event);
 	}
+
+	p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+	struct p4_pebs_bind *bind;
+	unsigned int idx;
+
+	BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+	idx = p4_config_unpack_metric(config);
+	if (idx == P4_PEBS_METRIC__none)
+		return;
+
+	bind = &p4_pebs_bind_map[idx];
+
+	(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE,	(u64)bind->metric_pebs);
+	(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT,	(u64)bind->metric_vert);
 }
 
 static void p4_pmu_enable_event(struct perf_event *event)
@@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	int thread = p4_ht_config_thread(hwc->config);
 	u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
 	unsigned int idx = p4_config_unpack_event(hwc->config);
-	unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
 	struct p4_event_bind *bind;
-	struct p4_cache_event_bind *bind_cache;
 	u64 escr_addr, cccr;
 
 	bind = &p4_event_bind_map[idx];
@@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	cccr = p4_config_unpack_cccr(hwc->config);
 
 	/*
-	 * it could be Cache event so that we need to
-	 * set metrics into additional MSRs
+	 * it could be Cache event so we need to write metrics
+	 * into additional MSRs
 	 */
-	BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
-	if (idx_cache > P4_CACHE__NONE &&
-		idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
-		bind_cache = &p4_cache_event_bind_map[idx_cache];
-		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
-		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
-	}
+	p4_pmu_enable_pebs(hwc->config);
 
 	(void)checking_wrmsrl(escr_addr, escr_conf);
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
-- 
cgit v1.2.3-70-g09d2


From 8e221b6db4477643fefc885a97ea9889ac733140 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Tue, 22 Jun 2010 16:23:37 -0700
Subject: x86: Avoid unnecessary __clear_user() and xrstor in signal handling

fxsave/xsave don't touch all the bytes in the memory layout used by
these instructions. Specifically, they leave untouched the SW reserved
fields (bytes 464..511) in the fxsave frame and the reserved fields in
the xsave header.

To present a clean context for the signal handling, just clear these fields
instead of clearing the complete fxsave/xsave memory layout, when we dump these
registers directly to the user signal frame.

Also avoid the call to the second xrstor (which inits the state not
passed in the signal frame) in restore_user_xstate() if all the state
has already been restored by the first xrstor.

These changes improve the performance of signal handling (by ~3-5%, as
measured by lat_sig).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1277249017.2847.85.camel@sbs-t61.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/i387.h  |  9 +++++++++
 arch/x86/include/asm/xsave.h | 10 ++++++++++
 arch/x86/kernel/xsave.c      | 12 ++----------
 3 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b90..0f1cf5d53dd 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -127,6 +127,15 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 {
 	int err;
 
+	/*
+	 * Clear the bytes not touched by the fxsave and reserved
+	 * for the SW usage.
+	 */
+	err = __clear_user(&fx->sw_reserved,
+			   sizeof(struct _fpx_sw_bytes));
+	if (unlikely(err))
+		return -EFAULT;
+
 	asm volatile("1:  rex64/fxsave (%[fx])\n\t"
 		     "2:\n"
 		     ".section .fixup,\"ax\"\n"
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae22..30dfc81804d 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -59,6 +59,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu)
 static inline int xsave_user(struct xsave_struct __user *buf)
 {
 	int err;
+
+	/*
+	 * Clear the xsave header first, so that reserved fields are
+	 * initialized to zero.
+	 */
+	err = __clear_user(&buf->xsave_hdr,
+			   sizeof(struct xsave_hdr_struct));
+	if (unlikely(err))
+		return -EFAULT;
+
 	__asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
 			     "2:\n"
 			     ".section .fixup,\"ax\"\n"
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 37e68fc5e24..6e73db1b7b4 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -91,14 +91,6 @@ int save_i387_xstate(void __user *buf)
 		return 0;
 
 	if (task_thread_info(tsk)->status & TS_USEDFPU) {
-		/*
-	 	 * Start with clearing the user buffer. This will present a
-	 	 * clean context for the bytes not touched by the fxsave/xsave.
-		 */
-		err = __clear_user(buf, sig_xstate_size);
-		if (err)
-			return err;
-
 		if (use_xsave())
 			err = xsave_user(buf);
 		else
@@ -184,8 +176,8 @@ static int restore_user_xstate(void __user *buf)
 	 * init the state skipped by the user.
 	 */
 	mask = pcntxt_mask & ~mask;
-
-	xrstor_state(init_xstate_buf, mask);
+	if (unlikely(mask))
+		xrstor_state(init_xstate_buf, mask);
 
 	return 0;
 
-- 
cgit v1.2.3-70-g09d2


From 83a7a2ad2a9173dcabc05df0f01d1d85b7ba1c2c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 10 Jun 2010 00:10:43 +0000
Subject: x86, alternatives: Use 16-bit numbers for cpufeature index

We already have cpufeature indices above 255, so use a 16-bit number
for the alternatives index.  This consumes a padding field and so
doesn't add any size, but it means that abusing the padding field to
create assembly errors on overflow no longer works.  We can retain the
test simply by redirecting it to the .discard section, however.

[ v3: updated to include open-coded locations ]

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
LKML-Reference: <tip-f88731e3068f9d1392ba71cc9f50f035d26a0d4f@git.kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/alternative.h |  7 ++++---
 arch/x86/include/asm/cpufeature.h  | 14 ++++++++------
 arch/x86/kernel/entry_32.S         |  2 +-
 arch/x86/lib/clear_page_64.S       |  2 +-
 arch/x86/lib/copy_page_64.S        |  2 +-
 arch/x86/lib/memcpy_64.S           |  2 +-
 arch/x86/lib/memset_64.S           |  2 +-
 7 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 03b6bb5394a..bc6abb7bc7e 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -45,10 +45,9 @@
 struct alt_instr {
 	u8 *instr;		/* original instruction */
 	u8 *replacement;
-	u8  cpuid;		/* cpuid bit set for replacement */
+	u16 cpuid;		/* cpuid bit set for replacement */
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen;	/* length of new instruction, <= instrlen */
-	u8  pad1;
 #ifdef CONFIG_X86_64
 	u32 pad2;
 #endif
@@ -86,9 +85,11 @@ static inline int alternatives_text_reserved(void *start, void *end)
       _ASM_ALIGN "\n"							\
       _ASM_PTR "661b\n"				/* label           */	\
       _ASM_PTR "663f\n"				/* new instruction */	\
-      "	 .byte " __stringify(feature) "\n"	/* feature bit     */	\
+      "	 .word " __stringify(feature) "\n"	/* feature bit     */	\
       "	 .byte 662b-661b\n"			/* sourcelen       */	\
       "	 .byte 664f-663f\n"			/* replacementlen  */	\
+      ".previous\n"							\
+      ".section .discard,\"aw\",@progbits\n"				\
       "	 .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */	\
       ".previous\n"							\
       ".section .altinstr_replacement, \"ax\"\n"			\
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 46814591438..e8b88967de3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -291,7 +291,7 @@ extern const char * const x86_power_flags[32];
  * patch the target code for additional performance.
  *
  */
-static __always_inline __pure bool __static_cpu_has(u8 bit)
+static __always_inline __pure bool __static_cpu_has(u16 bit)
 {
 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
 		asm goto("1: jmp %l[t_no]\n"
@@ -300,11 +300,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 			 _ASM_ALIGN "\n"
 			 _ASM_PTR "1b\n"
 			 _ASM_PTR "0\n" 	/* no replacement */
-			 " .byte %P0\n"		/* feature bit */
+			 " .word %P0\n"		/* feature bit */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 0\n"		/* replacement len */
-			 " .byte 0xff + 0 - (2b-1b)\n"	/* padding */
 			 ".previous\n"
+			 /* skipping size check since replacement size = 0 */
 			 : : "i" (bit) : : t_no);
 		return true;
 	t_no:
@@ -318,10 +318,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 			     _ASM_ALIGN "\n"
 			     _ASM_PTR "1b\n"
 			     _ASM_PTR "3f\n"
-			     " .byte %P1\n"		/* feature bit */
+			     " .word %P1\n"		/* feature bit */
 			     " .byte 2b - 1b\n"		/* source len */
 			     " .byte 4f - 3f\n"		/* replacement len */
-			     " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */
+			     ".previous\n"
+			     ".section .discard,\"aw\",@progbits\n"
+			     " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
 			     ".previous\n"
 			     ".section .altinstr_replacement,\"ax\"\n"
 			     "3: movb $1,%0\n"
@@ -337,7 +339,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 (								\
 	__builtin_constant_p(boot_cpu_has(bit)) ?		\
 		boot_cpu_has(bit) :				\
-	(__builtin_constant_p(bit) && !((bit) & ~0xff)) ?	\
+	__builtin_constant_p(bit) ?				\
 		__static_cpu_has(bit) :				\
 		boot_cpu_has(bit)				\
 )
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf15..7862cf510ea 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -914,7 +914,7 @@ ENTRY(simd_coprocessor_error)
 	.balign 4
 	.long 661b
 	.long 663f
-	.byte X86_FEATURE_XMM
+	.word X86_FEATURE_XMM
 	.byte 662b-661b
 	.byte 664f-663f
 .previous
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index ebeafcce04a..aa4326bfb24 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -52,7 +52,7 @@ ENDPROC(clear_page)
 	.align 8
 	.quad clear_page
 	.quad 1b
-	.byte X86_FEATURE_REP_GOOD
+	.word X86_FEATURE_REP_GOOD
 	.byte .Lclear_page_end - clear_page
 	.byte 2b - 1b
 	.previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 727a5d46d2f..6fec2d1cebe 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -113,7 +113,7 @@ ENDPROC(copy_page)
 	.align 8
 	.quad copy_page
 	.quad 1b
-	.byte X86_FEATURE_REP_GOOD
+	.word X86_FEATURE_REP_GOOD
 	.byte .Lcopy_page_end - copy_page
 	.byte 2b - 1b
 	.previous
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f82e884928a..bcbcd1e0f7d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -131,7 +131,7 @@ ENDPROC(__memcpy)
 	.align 8
 	.quad memcpy
 	.quad .Lmemcpy_c
-	.byte X86_FEATURE_REP_GOOD
+	.word X86_FEATURE_REP_GOOD
 
 	/*
 	 * Replace only beginning, memcpy is used to apply alternatives,
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index e88d3b81644..09d34426965 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -121,7 +121,7 @@ ENDPROC(__memset)
 	.align 8
 	.quad memset
 	.quad .Lmemset_c
-	.byte X86_FEATURE_REP_GOOD
+	.word X86_FEATURE_REP_GOOD
 	.byte .Lfinal - memset
 	.byte .Lmemset_e - .Lmemset_c
 	.previous
-- 
cgit v1.2.3-70-g09d2


From bdc802dcca1709b01988d57e91f9f35ce1609fcc Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 7 Jul 2010 17:29:18 -0700
Subject: x86, cpu: Support the features flags in new CPUID leaf 7

Intel has defined CPUID leaf 7 as the next set of feature flags (see
the AVX specification, version 007).  Add support for this new feature
flags word.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <tip-*@vger.kernel.org>
---
 arch/x86/include/asm/cpufeature.h        | 13 +++++++++----
 arch/x86/include/asm/required-features.h |  2 ++
 arch/x86/kernel/cpu/common.c             | 10 ++++++++++
 3 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index aeb6f3f9b2c..3ec9275cea4 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,7 +6,7 @@
 
 #include <asm/required-features.h>
 
-#define NCAPINTS	9	/* N 32-bit words worth of info */
+#define NCAPINTS	10	/* N 32-bit words worth of info */
 
 /*
  * Note: If the comment begins with a quoted string, that string is used
@@ -159,14 +159,14 @@
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc
+ * CPUID levels like 0x6, 0xA etc, word 7
  */
 #define X86_FEATURE_IDA		(7*32+ 0) /* Intel Dynamic Acceleration */
 #define X86_FEATURE_ARAT	(7*32+ 1) /* Always Running APIC Timer */
 #define X86_FEATURE_CPB		(7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		(7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
 
-/* Virtualization flags: Linux defined */
+/* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
 #define X86_FEATURE_VNMI        (8*32+ 1) /* Intel Virtual NMI */
 #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
@@ -177,6 +177,9 @@
 #define X86_FEATURE_SVML	(8*32+7)  /* "svm_lock" AMD SVM locking MSR */
 #define X86_FEATURE_NRIPS	(8*32+8)  /* "nrip_save" AMD SVM next_rip save */
 
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+#define X86_FEATURE_FSGSBASE	(9*32+0)  /* {RD/WR}{FS/GS}BASE instructions*/
+
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
 #include <asm/asm.h>
@@ -197,7 +200,9 @@ extern const char * const x86_power_flags[32];
 	   (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) ||	\
 	   (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) ||	\
 	   (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) ||	\
-	   (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) )	\
+	   (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ||	\
+	   (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) ||	\
+	   (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )	\
 	  ? 1 :								\
 	 test_cpu_cap(c, bit))
 
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 64cf2d24fad..6c7fc25f2c3 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -84,5 +84,7 @@
 #define REQUIRED_MASK5	0
 #define REQUIRED_MASK6	0
 #define REQUIRED_MASK7	0
+#define REQUIRED_MASK8	0
+#define REQUIRED_MASK9	0
 
 #endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211..c7358303d8c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -551,6 +551,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[4] = excap;
 	}
 
+	/* Additional Intel-defined flags: level 0x00000007 */
+	if (c->cpuid_level >= 0x00000007) {
+		u32 eax, ebx, ecx, edx;
+
+		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+
+		if (eax > 0)
+			c->x86_capability[9] = ebx;
+	}
+
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
-- 
cgit v1.2.3-70-g09d2


From 5bbd4a336c81d32df71642abf310cf3d0c98dc9b Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <martinez.javier@gmail.com>
Date: Wed, 7 Jul 2010 19:51:59 -0400
Subject: x86/apic/es7000_32: Remove unused variable

In today's linux-next I got this compile warning:

 arch/x86/kernel/apic/es7000_32.c:132: warning: 'base' defined but not used

This patch solves the issue by removing the unused variable.

Signed-off-by: Javier Martinez Canillas <martinez.javier@gmail.com>
Cc: Rakib Mullick <rakib.mullick@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Tejun Heo <tj@kernel.org>
LKML-Reference: <1278546719.9020.4.camel@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/es7000_32.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87fe..8593582d802 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int					es7000_plat;
  * GSI override for ES7000 platforms.
  */
 
-static unsigned int			base;
 
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
-- 
cgit v1.2.3-70-g09d2


From 9279aa55061a280b826bdf9ba5ab5f6a566c1dfb Mon Sep 17 00:00:00 2001
From: Ky Srinivasan <ksrinivasan@novell.com>
Date: Mon, 28 Jun 2010 08:48:55 -0600
Subject: x86: Export the symbol ms_hyperv

This is needed so that the staging hyperv can properly access this
symbol.

Signed-off-by: K. Y. Srinivasan <ksrinivasan@novell.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/kernel/cpu/mshyperv.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b..d944bf6c50e 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
 #include <asm/mshyperv.h>
 
 struct ms_hyperv_info ms_hyperv;
+EXPORT_SYMBOL_GPL(ms_hyperv);
 
 static bool __init ms_hyperv_platform(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 3b770a2128423a687e6e9c57184a584fb4ba4c77 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 13 Jul 2010 14:57:50 -0700
Subject: x86, alternatives: BUG on encountering an invalid CPU feature number

Make the alternatives-patching code BUG on encountering an invalid CPU
feature number.  Should have done this a long time ago.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Yinghai Lu <yinhai@kernel.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <tip-df378ccfc4dd04e263426ad805516915874774aa@git.kernel.org>
---
 arch/x86/kernel/alternative.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c..f65ab8b014c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 		u8 *instr = a->instr;
 		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
+		BUG_ON(a->cpuid >= NCAPINTS*32);
 		if (!boot_cpu_has(a->cpuid))
 			continue;
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3-70-g09d2


From b2691085d1f3ccce641dcfdd02722ba5d34db6ba Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 28 Jun 2010 16:46:48 -0700
Subject: x86: Clean up arch/x86/kernel/cpu/mtrr/cleanup.c: use ";" not "," to
 terminate statements

Also needed if pr_<level> becomes a bit more space efficient.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
LKML-Reference: <1277768808.29157.280.camel@Joe-Laptop.home>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/cleanup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f01..c5f59d07142 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
 	unsigned long gran_base, chunk_base, lose_base;
 	char gran_factor, chunk_factor, lose_factor;
 
-	gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
-	chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
-	lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+	gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
+	chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
+	lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
 
 	pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
 		result[i].bad ? "*BAD*" : " ",
-- 
cgit v1.2.3-70-g09d2


From f33ebbe9da2c3c24664a0ad4f8fd83f293547e63 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 6 May 2010 20:17:00 +0200
Subject: unistd: add __NR_prlimit64 syscall numbers

Add __NR_prlimit64 syscall numbers to asm-generic. Add them also to
asm-x86, both 32 and 64-bit.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/ia32/ia32entry.S          | 1 +
 arch/x86/include/asm/unistd_32.h   | 3 ++-
 arch/x86/include/asm/unistd_64.h   | 2 ++
 arch/x86/kernel/syscall_table_32.S | 1 +
 include/asm-generic/unistd.h       | 4 +++-
 5 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e790bc1fbfa..a88e31d1836 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,5 @@ ia32_sys_call_table:
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_event_open
 	.quad compat_sys_recvmmsg
+	.quad sys_prlimit64
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f8f8a..35e0cb151b6 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,11 @@
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
+#define __NR_prlimit64		338
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 338
+#define NR_syscalls 339
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b0e81..570bf5eae56 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_recvmmsg				299
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_prlimit64				300
+__SYSCALL(__NR_prlimit64, sys_prlimit64)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b372934121..eca1d7d23ab 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,4 @@ ENTRY(sys_call_table)
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
 	.long sys_recvmmsg
+	.long sys_prlimit64
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index 6a0b30f78a6..0dfd517e5ec 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -626,9 +626,11 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 __SYSCALL(__NR_accept4, sys_accept4)
 #define __NR_recvmmsg 243
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_prlimit64 244
+__SYSCALL(__NR_prlimit64, sys_prlimit64)
 
 #undef __NR_syscalls
-#define __NR_syscalls 244
+#define __NR_syscalls 245
 
 /*
  * All syscalls below here should go away really,
-- 
cgit v1.2.3-70-g09d2


From 93a7ca0c3ebe5d931126f1fb732cb9c4518383d4 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Fri, 16 Jul 2010 10:11:21 -0500
Subject: x86, UV: Initialize BAU MMRs only on hubs with cpus

Remove the initialization of MMRs
UVH_LB_BAU_SB_ACTIVATION_CONTROL and UVH_BAU_DATA_BROADCAST on
UV hubs that have no active cpus. Such initialization on hubs
with no active cpus would result in a kernel page fault.

This is not a high priority, because we don't yet have any such
systems (with UV hubs that have no active cpus).  But they will be
coming.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1OZmZN-0006cW-RC@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index abf3c31f14c..59efb5390b3 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1635,12 +1635,16 @@ static int __init uv_bau_init(void)
 	alloc_intr_gate(vector, uv_bau_message_intr1);
 
 	for_each_possible_blade(uvhub) {
-		pnode = uv_blade_to_pnode(uvhub);
-		/* INIT the bau */
-		uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
-				      ((unsigned long)1 << 63));
-		mmr = 1; /* should be 1 to broadcast to both sockets */
-		uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+		if (uv_blade_nr_possible_cpus(uvhub)) {
+			pnode = uv_blade_to_pnode(uvhub);
+			/* INIT the bau */
+			uv_write_global_mmr64(pnode,
+					UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+					((unsigned long)1 << 63));
+			mmr = 1; /* should be 1 to broadcast to both sockets */
+			uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
+						mmr);
+		}
 	}
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From a2531293dbb7608fa672ff28efe3ab4027917a2f Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sun, 18 Jul 2010 14:27:13 +0200
Subject: update email address

pavel@suse.cz no longer works; replace it with a working address.

Signed-off-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/feature-removal-schedule.txt | 2 +-
 Documentation/hwmon/hpfall.c               | 2 +-
 Documentation/power/tricks.txt             | 2 +-
 Documentation/sparse.txt                   | 2 +-
 Documentation/zh_CN/sparse.txt             | 2 +-
 arch/arm/mach-sa1100/collie.c              | 2 +-
 arch/powerpc/kernel/suspend.c              | 2 +-
 arch/x86/kernel/acpi/sleep.c               | 2 +-
 arch/x86/kernel/apm_32.c                   | 2 +-
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c  | 2 +-
 arch/x86/mm/init_64.c                      | 2 +-
 arch/x86/power/cpu.c                       | 2 +-
 arch/x86/power/hibernate_64.c              | 2 +-
 drivers/block/nbd.c                        | 2 +-
 drivers/media/video/usbvideo/vicam.c       | 2 +-
 drivers/media/video/v4l2-compat-ioctl32.c  | 2 +-
 drivers/staging/winbond/wbusb.c            | 2 +-
 drivers/usb/class/cdc-acm.c                | 2 +-
 drivers/usb/class/usblp.c                  | 2 +-
 drivers/video/backlight/locomolcd.c        | 4 ++--
 fs/compat.c                                | 2 +-
 fs/compat_ioctl.c                          | 2 +-
 kernel/debug/debug_core.c                  | 2 +-
 kernel/debug/gdbstub.c                     | 2 +-
 kernel/power/hibernate.c                   | 2 +-
 kernel/power/snapshot.c                    | 2 +-
 kernel/power/swap.c                        | 2 +-
 27 files changed, 28 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index c268783bc4e..1a0fc32bc20 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -93,7 +93,7 @@ Why:	Broken design for runtime control over driver power states, confusing
 	inputs.  This framework was never widely used, and most attempts to
 	use it were broken.  Drivers should instead be exposing domain-specific
 	interfaces either to kernel or to userspace.
-Who:	Pavel Machek <pavel@suse.cz>
+Who:	Pavel Machek <pavel@ucw.cz>
 
 ---------------------------
 
diff --git a/Documentation/hwmon/hpfall.c b/Documentation/hwmon/hpfall.c
index 681ec22b9d0..a4a8fc5d05d 100644
--- a/Documentation/hwmon/hpfall.c
+++ b/Documentation/hwmon/hpfall.c
@@ -1,7 +1,7 @@
 /* Disk protection for HP machines.
  *
  * Copyright 2008 Eric Piel
- * Copyright 2009 Pavel Machek <pavel@suse.cz>
+ * Copyright 2009 Pavel Machek <pavel@ucw.cz>
  *
  * GPLv2.
  */
diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt
index 3b26bb502a4..a1b8f7249f4 100644
--- a/Documentation/power/tricks.txt
+++ b/Documentation/power/tricks.txt
@@ -1,6 +1,6 @@
 	swsusp/S3 tricks
 	~~~~~~~~~~~~~~~~
-Pavel Machek <pavel@suse.cz>
+Pavel Machek <pavel@ucw.cz>
 
 If you want to trick swsusp/S3 into working, you might want to try:
 
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt
index 9b659c79a54..4909d411635 100644
--- a/Documentation/sparse.txt
+++ b/Documentation/sparse.txt
@@ -1,5 +1,5 @@
 Copyright 2004 Linus Torvalds
-Copyright 2004 Pavel Machek <pavel@suse.cz>
+Copyright 2004 Pavel Machek <pavel@ucw.cz>
 Copyright 2006 Bob Copeland <me@bobcopeland.com>
 
 Using sparse for typechecking
diff --git a/Documentation/zh_CN/sparse.txt b/Documentation/zh_CN/sparse.txt
index 75992a603ae..cc144e58151 100644
--- a/Documentation/zh_CN/sparse.txt
+++ b/Documentation/zh_CN/sparse.txt
@@ -22,7 +22,7 @@ Documentation/sparse.txt 的中文翻译
 ---------------------------------------------------------------------
 
 Copyright 2004 Linus Torvalds
-Copyright 2004 Pavel Machek <pavel@suse.cz>
+Copyright 2004 Pavel Machek <pavel@ucw.cz>
 Copyright 2006 Bob Copeland <me@bobcopeland.com>
 
 使用 sparse 工具做类型检查
diff --git a/arch/arm/mach-sa1100/collie.c b/arch/arm/mach-sa1100/collie.c
index 5d5f330c5d9..16e682d5dbb 100644
--- a/arch/arm/mach-sa1100/collie.c
+++ b/arch/arm/mach-sa1100/collie.c
@@ -11,7 +11,7 @@
  * published by the Free Software Foundation.
  *
  * ChangeLog:
- *  2006 Pavel Machek <pavel@suse.cz>
+ *  2006 Pavel Machek <pavel@ucw.cz>
  *  03-06-2004 John Lenz <lenz@cs.wisc.edu>
  *  06-04-2002 Chris Larson <kergoth@digitalnemesis.net>
  *  04-16-2001 Lineo Japan,Inc. ...
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
index 6fc6328dc62..0167d53da30 100644
--- a/arch/powerpc/kernel/suspend.c
+++ b/arch/powerpc/kernel/suspend.c
@@ -3,7 +3,7 @@
  *
  * Distribute under GPLv2
  *
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
  * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
  */
 
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 82e508677b9..f51cc55aced 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,7 +2,7 @@
  * sleep.c - x86-specific ACPI sleep support.
  *
  *  Copyright (C) 2001-2003 Patrick Mochel
- *  Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
  */
 
 #include <linux/acpi.h>
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c4f9182ca3a..4c9c67bf09b 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -140,7 +140,7 @@
  *         is now the way life works).
  *         Fix thinko in suspend() (wrong return).
  *         Notify drivers on critical suspend.
- *         Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
+ *         Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
  *         modified by sfr).
  *         Disable interrupts while we are suspended (Andy Henroid
  *         <andy_henroid@yahoo.com> fixed by sfr).
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 7ec2123838e..0af9aa20fce 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -9,7 +9,7 @@
  *  Based on the powernow-k7.c module written by Dave Jones.
  *  (C) 2003 Dave Jones on behalf of SuSE Labs
  *  (C) 2004 Dominik Brodowski <linux@brodo.de>
- *  (C) 2004 Pavel Machek <pavel@suse.cz>
+ *  (C) 2004 Pavel Machek <pavel@ucw.cz>
  *  Licensed under the terms of the GNU GPL License version 2.
  *  Based upon datasheets & sample CPUs kindly provided by AMD.
  *
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee41bba315d..9a6674689a2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -2,7 +2,7 @@
  *  linux/arch/x86_64/mm/init.c
  *
  *  Copyright (C) 1995  Linus Torvalds
- *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
  *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
  */
 
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 1290ba54b35..e7e8c5f5495 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -4,7 +4,7 @@
  * Distribute under GPLv2
  *
  * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
  * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
  */
 
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index d24f983ba1e..460f314d13e 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -4,7 +4,7 @@
  * Distribute under GPLv2
  *
  * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
  * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
  */
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 218d091f3c5..16c3c8613cd 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -4,7 +4,7 @@
  * Note that you can not swap over this thing, yet. Seems to work but
  * deadlocks sometimes - you can not swap over TCP in general.
  * 
- * Copyright 1997-2000, 2008 Pavel Machek <pavel@suse.cz>
+ * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
  * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
  *
  * This file is released under GPLv2 or later.
diff --git a/drivers/media/video/usbvideo/vicam.c b/drivers/media/video/usbvideo/vicam.c
index 6030410c667..5d6fd01f918 100644
--- a/drivers/media/video/usbvideo/vicam.c
+++ b/drivers/media/video/usbvideo/vicam.c
@@ -2,7 +2,7 @@
  * USB ViCam WebCam driver
  * Copyright (c) 2002 Joe Burks (jburks@wavicle.org),
  *                    Christopher L Cheney (ccheney@cheney.cx),
- *                    Pavel Machek (pavel@suse.cz),
+ *                    Pavel Machek (pavel@ucw.cz),
  *                    John Tyner (jtyner@cs.ucr.edu),
  *                    Monroe Williams (monroe@pobox.com)
  *
diff --git a/drivers/media/video/v4l2-compat-ioctl32.c b/drivers/media/video/v4l2-compat-ioctl32.c
index 9004a5fe764..d2f20c2acae 100644
--- a/drivers/media/video/v4l2-compat-ioctl32.c
+++ b/drivers/media/video/v4l2-compat-ioctl32.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
  * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
  * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs
- * Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ * Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
  * Copyright (C) 2005       Philippe De Muyter (phdm@macqel.be)
  * Copyright (C) 2008       Hans Verkuil <hverkuil@xs4all.nl>
  *
diff --git a/drivers/staging/winbond/wbusb.c b/drivers/staging/winbond/wbusb.c
index 681419d6856..251caa052ee 100644
--- a/drivers/staging/winbond/wbusb.c
+++ b/drivers/staging/winbond/wbusb.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2008 Pavel Machek <pavel@suse.cz>
+ * Copyright 2008 Pavel Machek <pavel@ucw.cz>
  *
  * Distribute under GPLv2.
  *
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 61d75507d5d..8413a567c12 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -2,7 +2,7 @@
  * cdc-acm.c
  *
  * Copyright (c) 1999 Armin Fuerst	<fuerst@in.tum.de>
- * Copyright (c) 1999 Pavel Machek	<pavel@suse.cz>
+ * Copyright (c) 1999 Pavel Machek	<pavel@ucw.cz>
  * Copyright (c) 1999 Johannes Erdfelt	<johannes@erdfelt.com>
  * Copyright (c) 2000 Vojtech Pavlik	<vojtech@suse.cz>
  * Copyright (c) 2004 Oliver Neukum	<oliver@neukum.name>
diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c
index 2250095db0a..84f9e52327f 100644
--- a/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@ -2,7 +2,7 @@
  * usblp.c
  *
  * Copyright (c) 1999 Michael Gee	<michael@linuxspecific.com>
- * Copyright (c) 1999 Pavel Machek	<pavel@suse.cz>
+ * Copyright (c) 1999 Pavel Machek	<pavel@ucw.cz>
  * Copyright (c) 2000 Randy Dunlap	<rdunlap@xenotime.net>
  * Copyright (c) 2000 Vojtech Pavlik	<vojtech@suse.cz>
  # Copyright (c) 2001 Pete Zaitcev	<zaitcev@redhat.com>
diff --git a/drivers/video/backlight/locomolcd.c b/drivers/video/backlight/locomolcd.c
index 7571bc26071..d2f59015d51 100644
--- a/drivers/video/backlight/locomolcd.c
+++ b/drivers/video/backlight/locomolcd.c
@@ -2,7 +2,7 @@
  * Backlight control code for Sharp Zaurus SL-5500
  *
  * Copyright 2005 John Lenz <lenz@cs.wisc.edu>
- * Maintainer: Pavel Machek <pavel@suse.cz> (unless John wants to :-)
+ * Maintainer: Pavel Machek <pavel@ucw.cz> (unless John wants to :-)
  * GPL v2
  *
  * This driver assumes single CPU. That's okay, because collie is
@@ -246,6 +246,6 @@ static void __exit locomolcd_exit(void)
 module_init(locomolcd_init);
 module_exit(locomolcd_exit);
 
-MODULE_AUTHOR("John Lenz <lenz@cs.wisc.edu>, Pavel Machek <pavel@suse.cz>");
+MODULE_AUTHOR("John Lenz <lenz@cs.wisc.edu>, Pavel Machek <pavel@ucw.cz>");
 MODULE_DESCRIPTION("Collie LCD driver");
 MODULE_LICENSE("GPL");
diff --git a/fs/compat.c b/fs/compat.c
index 6490d2134ff..c6fda9aeb86 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -8,7 +8,7 @@
  *  Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
  *  Copyright (C) 1998       Eddie C. Dost  (ecd@skynet.be)
  *  Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
- *  Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ *  Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2 as
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 641640dc7ae..5ead3763bba 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -4,7 +4,7 @@
  * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
  * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
  * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
- * Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ * Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
  *
  * These routines maintain argument size conversion between 32bit and 64bit
  * ioctls.
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cb7cd1de10..568efbce80f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2000-2001 VERITAS Software Corporation.
  * Copyright (C) 2002-2004 Timesys Corporation
  * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
- * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
  * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
  * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
  * Copyright (C) 2005-2009 Wind River Systems, Inc.
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 4b17b326952..4e584721bcb 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2000-2001 VERITAS Software Corporation.
  * Copyright (C) 2002-2004 Timesys Corporation
  * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
- * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
  * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
  * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
  * Copyright (C) 2005-2009 Wind River Systems, Inc.
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d..6b202e7f8b5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2003 Patrick Mochel
  * Copyright (c) 2003 Open Source Development Lab
- * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
  * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
  *
  * This file is released under the GPLv2.
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8..f6cd6faf84f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
  *
  * This file provides system snapshot/restore functionality for swsusp.
  *
- * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
  * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
  *
  * This file is released under the GPLv2.
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb2177839..48a0aa9da16 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
  * This file provides functions for reading the suspend image from
  * and writing it to a swap partition.
  *
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
  * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
  *
  * This file is released under the GPLv2.
-- 
cgit v1.2.3-70-g09d2


From 6c54aabd5e687092557f4881ce2d4013b971f293 Mon Sep 17 00:00:00 2001
From: Kulikov Vasiliy <segooon@gmail.com>
Date: Sat, 3 Jul 2010 12:03:51 -0400
Subject: x86/amd-iommu: Use for_each_pci_dev()

Use for_each_pci_dev() to simplify the code.

Signed-off-by: Kulikov Vasiliy <segooon@gmail.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c..29dd3b9f2f0 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2609,8 +2609,7 @@ int __init amd_iommu_init_passthrough(void)
 
 	pt_domain->mode |= PAGE_MODE_NONE;
 
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+	for_each_pci_dev(dev) {
 		if (!check_device(&dev->dev))
 			continue;
 
-- 
cgit v1.2.3-70-g09d2


From edb18f8ab02843453306601c4aa697f9691129cd Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 19 Jul 2010 16:05:50 -0700
Subject: x86, cpu: Make init_scattered_cpuid_features() consider cpuid
 subleaves

Some cpuid features (like xsaveopt) are enumerated using cpuid
subleaves.

Extend init_scattered_cpuid_features() to take subleaf into account.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20100719230205.439900717@sbs-t61.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 7369b4c2c55..03cf24a3d93 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -14,6 +14,7 @@ struct cpuid_bit {
 	u8 reg;
 	u8 bit;
 	u32 level;
+	u32 sub_leaf;
 };
 
 enum cpuid_regs {
@@ -30,16 +31,16 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA,   		CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT,  		CR_EAX, 2, 0x00000006 },
-		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006 },
-		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006 },
-		{ X86_FEATURE_CPB,   		CR_EDX, 9, 0x80000007 },
-		{ X86_FEATURE_NPT,   		CR_EDX, 0, 0x8000000a },
-		{ X86_FEATURE_LBRV,  		CR_EDX, 1, 0x8000000a },
-		{ X86_FEATURE_SVML,  		CR_EDX, 2, 0x8000000a },
-		{ X86_FEATURE_NRIPS, 		CR_EDX, 3, 0x8000000a },
-		{ 0, 0, 0, 0 }
+		{ X86_FEATURE_IDA,		CR_EAX, 1, 0x00000006, 0 },
+		{ X86_FEATURE_ARAT,		CR_EAX, 2, 0x00000006, 0 },
+		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
+		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
+		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 },
+		{ X86_FEATURE_NPT,		CR_EDX, 0, 0x8000000a, 0 },
+		{ X86_FEATURE_LBRV,		CR_EDX, 1, 0x8000000a, 0 },
+		{ X86_FEATURE_SVML,		CR_EDX, 2, 0x8000000a, 0 },
+		{ X86_FEATURE_NRIPS,		CR_EDX, 3, 0x8000000a, 0 },
+		{ 0, 0, 0, 0, 0 }
 	};
 
 	for (cb = cpuid_bits; cb->feature; cb++) {
@@ -50,8 +51,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		    max_level > (cb->level | 0xffff))
 			continue;
 
-		cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
-			&regs[CR_ECX], &regs[CR_EDX]);
+		cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
+			    &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
 
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
-- 
cgit v1.2.3-70-g09d2


From 5734f62b6601d88fd8ec720cb56b93fd3a030557 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 19 Jul 2010 16:05:52 -0700
Subject: x86, cpu: Enumerate xsaveopt

Enumerate the xsaveopt feature.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 03cf24a3d93..41eebcd90fc 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -35,6 +35,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		{ X86_FEATURE_ARAT,		CR_EAX, 2, 0x00000006, 0 },
 		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
 		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
+		{ X86_FEATURE_XSAVEOPT,		CR_EAX,	0, 0x0000000d, 1 },
 		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 },
 		{ X86_FEATURE_NPT,		CR_EDX, 0, 0x8000000a, 0 },
 		{ X86_FEATURE_LBRV,		CR_EDX, 1, 0x8000000a, 0 },
-- 
cgit v1.2.3-70-g09d2


From a1488f8bf4d72ad724700f6e982469a1240e4264 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 19 Jul 2010 16:05:48 -0700
Subject: x86, xsave: Track the offset, size of state in the xsave layout

Subleaves of the cpuid vector 0xd provide the offset and size of the different
feature states that are managed by xsave/xrstor. Track these for the upcoming
usage during signal handling.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20100719230205.262987929@sbs-t61.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/xsave.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 980149867a1..4993caa4181 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -21,6 +21,8 @@ struct _fpx_sw_bytes fx_sw_reserved;
 struct _fpx_sw_bytes fx_sw_reserved_ia32;
 #endif
 
+static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+
 /*
  * Check for the presence of extended state information in the
  * user fpstate pointer in the sigcontext.
@@ -301,6 +303,31 @@ void __cpuinit xsave_init(void)
 	xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
 }
 
+/*
+ * Record the offsets and sizes of different state managed by the xsave
+ * memory layout.
+ */
+static void setup_xstate_features(void)
+{
+	int eax, ebx, ecx, edx, leaf = 0x2;
+
+	xstate_features = fls64(pcntxt_mask);
+	xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
+	xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
+
+	do {
+		cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx);
+
+		if (eax == 0)
+			break;
+
+		xstate_offsets[leaf] = ebx;
+		xstate_sizes[leaf] = eax;
+
+		leaf++;
+	} while (1);
+}
+
 /*
  * setup the xstate image representing the init state
  */
@@ -308,6 +335,8 @@ static void __init setup_xstate_init(void)
 {
 	init_xstate_buf = alloc_bootmem(xstate_size);
 	init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+
+	setup_xstate_features();
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 29104e101d710dd152f807978884643a52eca8b7 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 19 Jul 2010 16:05:49 -0700
Subject: x86, xsave: Sync xsave memory layout with its header for user
 handling

With xsaveopt, if a processor implementation discerns that a processor state
component is in its initialized state, it may set the corresponding bit in
xsave_hdr.xstate_bv to '0' without modifying the corresponding memory
layout. Hence, while presenting the xstate information to the user, we always
ensure that the memory layout of a feature will be in the init state if the
corresponding header bit is zero. This ensures consistency and avoids the
condition of the user seeing some stale state in the memory layout during
signal handling, debugging, etc.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20100719230205.351459480@sbs-t61.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/i387.h  | 14 +++++++
 arch/x86/include/asm/xsave.h | 10 +++++
 arch/x86/kernel/i387.c       | 11 ++++++
 arch/x86/kernel/xsave.c      | 89 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 123 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b90..bb370fd0a1c 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -58,11 +58,25 @@ extern int restore_i387_xstate_ia32(void __user *buf);
 
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
 
+static __always_inline __pure bool use_xsaveopt(void)
+{
+	return 0;
+}
+
 static __always_inline __pure bool use_xsave(void)
 {
 	return static_cpu_has(X86_FEATURE_XSAVE);
 }
 
+extern void __sanitize_i387_state(struct task_struct *);
+
+static inline void sanitize_i387_state(struct task_struct *tsk)
+{
+	if (!use_xsaveopt())
+		return;
+	__sanitize_i387_state(tsk);
+}
+
 #ifdef CONFIG_X86_64
 
 /* Ignore delayed exceptions from user space */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae22..0c72adc0cb1 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -111,6 +111,16 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
 		     :   "memory");
 }
 
+static inline void xsave_state(struct xsave_struct *fx, u64 mask)
+{
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+
+	asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
+		     : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+		     :   "memory");
+}
+
 static inline void fpu_xsave(struct fpu *fpu)
 {
 	/* This, however, we can work around by forcing the compiler to select
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b3225..6106af9fd12 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -190,6 +190,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 	if (ret)
 		return ret;
 
+	sanitize_i387_state(target);
+
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
 				   &target->thread.fpu.state->fxsave, 0, -1);
 }
@@ -207,6 +209,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (ret)
 		return ret;
 
+	sanitize_i387_state(target);
+
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
 				 &target->thread.fpu.state->fxsave, 0, -1);
 
@@ -446,6 +450,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
 					   -1);
 	}
 
+	sanitize_i387_state(target);
+
 	if (kbuf && pos == 0 && count == sizeof(env)) {
 		convert_from_fxsr(kbuf, target);
 		return 0;
@@ -467,6 +473,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (ret)
 		return ret;
 
+	sanitize_i387_state(target);
+
 	if (!HAVE_HWFP)
 		return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
 
@@ -533,6 +541,9 @@ static int save_i387_xsave(void __user *buf)
 	struct _fpstate_ia32 __user *fx = buf;
 	int err = 0;
 
+
+	sanitize_i387_state(tsk);
+
 	/*
 	 * For legacy compatible, we always set FP/SSE bits in the bit
 	 * vector while saving the state to the user context.
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 4993caa4181..368047c8d50 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -23,6 +23,76 @@ struct _fpx_sw_bytes fx_sw_reserved_ia32;
 
 static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
 
+/*
+ * If a processor implementation discern that a processor state component is
+ * in its initialized state it may modify the corresponding bit in the
+ * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory
+ * layout in the case of xsaveopt. While presenting the xstate information to
+ * the user, we always ensure that the memory layout of a feature will be in
+ * the init state if the corresponding header bit is zero. This is to ensure
+ * that the user doesn't see some stale state in the memory layout during
+ * signal handling, debugging etc.
+ */
+void __sanitize_i387_state(struct task_struct *tsk)
+{
+	u64 xstate_bv;
+	int feature_bit = 0x2;
+	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
+
+	if (!fx)
+		return;
+
+	BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
+
+	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
+
+	/*
+	 * None of the feature bits are in init state. So nothing else
+	 * to do for us, as the memory layout is upto date.
+	 */
+	if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
+		return;
+
+	/*
+	 * FP is in init state
+	 */
+	if (!(xstate_bv & XSTATE_FP)) {
+		fx->cwd = 0x37f;
+		fx->swd = 0;
+		fx->twd = 0;
+		fx->fop = 0;
+		fx->rip = 0;
+		fx->rdp = 0;
+		memset(&fx->st_space[0], 0, 128);
+	}
+
+	/*
+	 * SSE is in init state
+	 */
+	if (!(xstate_bv & XSTATE_SSE))
+		memset(&fx->xmm_space[0], 0, 256);
+
+	xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
+
+	/*
+	 * Update all the other memory layouts for which the corresponding
+	 * header bit is in the init state.
+	 */
+	while (xstate_bv) {
+		if (xstate_bv & 0x1) {
+			int offset = xstate_offsets[feature_bit];
+			int size = xstate_sizes[feature_bit];
+
+			memcpy(((void *) fx) + offset,
+			       ((void *) init_xstate_buf) + offset,
+			       size);
+		}
+
+		xstate_bv >>= 1;
+		feature_bit++;
+	}
+}
+
 /*
  * Check for the presence of extended state information in the
  * user fpstate pointer in the sigcontext.
@@ -112,6 +182,7 @@ int save_i387_xstate(void __user *buf)
 		task_thread_info(tsk)->status &= ~TS_USEDFPU;
 		stts();
 	} else {
+		sanitize_i387_state(tsk);
 		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
 				   xstate_size))
 			return -1;
@@ -333,10 +404,26 @@ static void setup_xstate_features(void)
  */
 static void __init setup_xstate_init(void)
 {
+	setup_xstate_features();
+
+	/*
+	 * Setup init_xstate_buf to represent the init state of
+	 * all the features managed by the xsave
+	 */
 	init_xstate_buf = alloc_bootmem(xstate_size);
 	init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
 
-	setup_xstate_features();
+	clts();
+	/*
+	 * Init all the features state with header_bv being 0x0
+	 */
+	xrstor_state(init_xstate_buf, -1);
+	/*
+	 * Dump the init state again. This is to identify the init state
+	 * of any feature which is not represented by all zero's.
+	 */
+	xsave_state(init_xstate_buf, -1);
+	stts();
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 6bad06b768920e278c7cedfdda56a0b4c6a35ee9 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 19 Jul 2010 16:05:52 -0700
Subject: x86, xsave: Use xsaveopt in context-switch path when supported

xsaveopt is a more optimized form of xsave specifically designed
for the context switch usage. xsaveopt doesn't save the state that's not
modified from the prior xrstor. And if a specific feature state gets
modified to the init state, then xsaveopt just updates the header bit
in the xsave memory layout without updating the corresponding memory
layout.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/i387.h  | 2 +-
 arch/x86/include/asm/xsave.h | 9 ++++++---
 arch/x86/kernel/cpu/common.c | 8 ++++++++
 3 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index bb370fd0a1c..59bd93ac7fe 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -60,7 +60,7 @@ extern int restore_i387_xstate_ia32(void __user *buf);
 
 static __always_inline __pure bool use_xsaveopt(void)
 {
-	return 0;
+	return static_cpu_has(X86_FEATURE_XSAVEOPT);
 }
 
 static __always_inline __pure bool use_xsave(void)
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 0c72adc0cb1..ec86c5fd6a6 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -125,8 +125,11 @@ static inline void fpu_xsave(struct fpu *fpu)
 {
 	/* This, however, we can work around by forcing the compiler to select
 	   an addressing mode that doesn't require extended registers. */
-	__asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
-			     : : "D" (&(fpu->state->xsave)),
-				 "a" (-1), "d"(-1) : "memory");
+	alternative_input(
+		".byte " REX_PREFIX "0x0f,0xae,0x27",
+		".byte " REX_PREFIX "0x0f,0xae,0x37",
+		X86_FEATURE_XSAVEOPT,
+		[fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
+		"memory");
 }
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c7358303d8c..3f715efc594 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 static int __init x86_xsave_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
 	return 1;
 }
 __setup("noxsave", x86_xsave_setup);
 
+static int __init x86_xsaveopt_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	return 1;
+}
+__setup("noxsaveopt", x86_xsaveopt_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_serial_nr __cpuinitdata = 1;
-- 
cgit v1.2.3-70-g09d2


From 2decb194e65ab66eaf787512dc572cdc99893b24 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 19 Jul 2010 18:32:04 -0700
Subject: x86, cpu: Split addon_cpuid_features.c

addon_cpuid_features.c contains exactly two almost completely
unrelated functions, plus has a long and very generic name.  Split it
into two files, scattered.c for the scattered feature flags, and
topology.c for the topology information.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
LKML-Reference: <tip-*@git.kernel.org>
---
 arch/x86/kernel/cpu/Makefile               |   2 +-
 arch/x86/kernel/cpu/addon_cpuid_features.c | 150 -----------------------------
 arch/x86/kernel/cpu/scattered.c            |  61 ++++++++++++
 arch/x86/kernel/cpu/topology.c             |  99 +++++++++++++++++++
 4 files changed, 161 insertions(+), 151 deletions(-)
 delete mode 100644 arch/x86/kernel/cpu/addon_cpuid_features.c
 create mode 100644 arch/x86/kernel/cpu/scattered.c
 create mode 100644 arch/x86/kernel/cpu/topology.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6..5e3a3512ba0 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,7 +12,7 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o		:= $(nostackp)
 
-obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
+obj-y			:= intel_cacheinfo.o scattered.o topology.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
 obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
deleted file mode 100644
index 41eebcd90fc..00000000000
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- *	Routines to indentify additional cpu features that are scattered in
- *	cpuid space.
- */
-#include <linux/cpu.h>
-
-#include <asm/pat.h>
-#include <asm/processor.h>
-
-#include <asm/apic.h>
-
-struct cpuid_bit {
-	u16 feature;
-	u8 reg;
-	u8 bit;
-	u32 level;
-	u32 sub_leaf;
-};
-
-enum cpuid_regs {
-	CR_EAX = 0,
-	CR_ECX,
-	CR_EDX,
-	CR_EBX
-};
-
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
-{
-	u32 max_level;
-	u32 regs[4];
-	const struct cpuid_bit *cb;
-
-	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA,		CR_EAX, 1, 0x00000006, 0 },
-		{ X86_FEATURE_ARAT,		CR_EAX, 2, 0x00000006, 0 },
-		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
-		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
-		{ X86_FEATURE_XSAVEOPT,		CR_EAX,	0, 0x0000000d, 1 },
-		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 },
-		{ X86_FEATURE_NPT,		CR_EDX, 0, 0x8000000a, 0 },
-		{ X86_FEATURE_LBRV,		CR_EDX, 1, 0x8000000a, 0 },
-		{ X86_FEATURE_SVML,		CR_EDX, 2, 0x8000000a, 0 },
-		{ X86_FEATURE_NRIPS,		CR_EDX, 3, 0x8000000a, 0 },
-		{ 0, 0, 0, 0, 0 }
-	};
-
-	for (cb = cpuid_bits; cb->feature; cb++) {
-
-		/* Verify that the level is valid */
-		max_level = cpuid_eax(cb->level & 0xffff0000);
-		if (max_level < cb->level ||
-		    max_level > (cb->level | 0xffff))
-			continue;
-
-		cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
-			    &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
-
-		if (regs[cb->reg] & (1 << cb->bit))
-			set_cpu_cap(c, cb->feature);
-	}
-}
-
-/* leaf 0xb SMT level */
-#define SMT_LEVEL	0
-
-/* leaf 0xb sub-leaf types */
-#define INVALID_TYPE	0
-#define SMT_TYPE	1
-#define CORE_TYPE	2
-
-#define LEAFB_SUBTYPE(ecx)		(((ecx) >> 8) & 0xff)
-#define BITS_SHIFT_NEXT_LEVEL(eax)	((eax) & 0x1f)
-#define LEVEL_MAX_SIBLINGS(ebx)		((ebx) & 0xffff)
-
-/*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
- */
-void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned int eax, ebx, ecx, edx, sub_index;
-	unsigned int ht_mask_width, core_plus_mask_width;
-	unsigned int core_select_mask, core_level_siblings;
-	static bool printed;
-
-	if (c->cpuid_level < 0xb)
-		return;
-
-	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
-
-	/*
-	 * check if the cpuid leaf 0xb is actually implemented.
-	 */
-	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
-		return;
-
-	set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
-
-	/*
-	 * initial apic id, which also represents 32-bit extended x2apic id.
-	 */
-	c->initial_apicid = edx;
-
-	/*
-	 * Populate HT related information from sub-leaf level 0.
-	 */
-	core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
-	core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
-
-	sub_index = 1;
-	do {
-		cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
-
-		/*
-		 * Check for the Core type in the implemented sub leaves.
-		 */
-		if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
-			core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
-			core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
-			break;
-		}
-
-		sub_index++;
-	} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
-
-	core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
-
-	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
-						 & core_select_mask;
-	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
-	/*
-	 * Reinit the apicid, now that we have extended initial_apicid.
-	 */
-	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-
-	c->x86_max_cores = (core_level_siblings / smp_num_siblings);
-
-	if (!printed) {
-		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-		       c->phys_proc_id);
-		if (c->x86_max_cores > 1)
-			printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-			       c->cpu_core_id);
-		printed = 1;
-	}
-	return;
-#endif
-}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 00000000000..9815364b477
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,61 @@
+/*
+ *	Routines to indentify additional cpu features that are scattered in
+ *	cpuid space.
+ */
+#include <linux/cpu.h>
+
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+#include <asm/apic.h>
+
+struct cpuid_bit {
+	u16 feature;
+	u8 reg;
+	u8 bit;
+	u32 level;
+	u32 sub_leaf;
+};
+
+enum cpuid_regs {
+	CR_EAX = 0,
+	CR_ECX,
+	CR_EDX,
+	CR_EBX
+};
+
+void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+	u32 max_level;
+	u32 regs[4];
+	const struct cpuid_bit *cb;
+
+	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
+		{ X86_FEATURE_IDA,		CR_EAX, 1, 0x00000006, 0 },
+		{ X86_FEATURE_ARAT,		CR_EAX, 2, 0x00000006, 0 },
+		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
+		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
+		{ X86_FEATURE_XSAVEOPT,		CR_EAX,	0, 0x0000000d, 1 },
+		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 },
+		{ X86_FEATURE_NPT,		CR_EDX, 0, 0x8000000a, 0 },
+		{ X86_FEATURE_LBRV,		CR_EDX, 1, 0x8000000a, 0 },
+		{ X86_FEATURE_SVML,		CR_EDX, 2, 0x8000000a, 0 },
+		{ X86_FEATURE_NRIPS,		CR_EDX, 3, 0x8000000a, 0 },
+		{ 0, 0, 0, 0, 0 }
+	};
+
+	for (cb = cpuid_bits; cb->feature; cb++) {
+
+		/* Verify that the level is valid */
+		max_level = cpuid_eax(cb->level & 0xffff0000);
+		if (max_level < cb->level ||
+		    max_level > (cb->level | 0xffff))
+			continue;
+
+		cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
+			    &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+
+		if (regs[cb->reg] & (1 << cb->bit))
+			set_cpu_cap(c, cb->feature);
+	}
+}
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
new file mode 100644
index 00000000000..4397e987a1c
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.c
@@ -0,0 +1,99 @@
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+
+#include <linux/cpu.h>
+#include <asm/apic.h>
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+/* leaf 0xb SMT level */
+#define SMT_LEVEL	0
+
+/* leaf 0xb sub-leaf types */
+#define INVALID_TYPE	0
+#define SMT_TYPE	1
+#define CORE_TYPE	2
+
+#define LEAFB_SUBTYPE(ecx)		(((ecx) >> 8) & 0xff)
+#define BITS_SHIFT_NEXT_LEVEL(eax)	((eax) & 0x1f)
+#define LEVEL_MAX_SIBLINGS(ebx)		((ebx) & 0xffff)
+
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned int eax, ebx, ecx, edx, sub_index;
+	unsigned int ht_mask_width, core_plus_mask_width;
+	unsigned int core_select_mask, core_level_siblings;
+	static bool printed;
+
+	if (c->cpuid_level < 0xb)
+		return;
+
+	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+
+	/*
+	 * check if the cpuid leaf 0xb is actually implemented.
+	 */
+	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
+		return;
+
+	set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+
+	/*
+	 * initial apic id, which also represents 32-bit extended x2apic id.
+	 */
+	c->initial_apicid = edx;
+
+	/*
+	 * Populate HT related information from sub-leaf level 0.
+	 */
+	core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+	core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+
+	sub_index = 1;
+	do {
+		cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+
+		/*
+		 * Check for the Core type in the implemented sub leaves.
+		 */
+		if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+			core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+			core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+			break;
+		}
+
+		sub_index++;
+	} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+
+	core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
+
+	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
+						 & core_select_mask;
+	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+	/*
+	 * Reinit the apicid, now that we have extended initial_apicid.
+	 */
+	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+
+	c->x86_max_cores = (core_level_siblings / smp_num_siblings);
+
+	if (!printed) {
+		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+		       c->phys_proc_id);
+		if (c->x86_max_cores > 1)
+			printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+			       c->cpu_core_id);
+		printed = 1;
+	}
+	return;
+#endif
+}
-- 
cgit v1.2.3-70-g09d2


From fa10ba64ac94fec4611b79804023eb087862ffe0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 20 Jul 2010 15:19:49 -0700
Subject: x86, gcc-4.6: Fix set but not read variables

Just some dead code, no real bugs.

Found by gcc 4.6 -Wall

Signed-off-by: Andi Kleen <ak@linux.intel.com>
LKML-Reference: <201007202219.o6KMJnQ0021072@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/aperture_64.c      | 4 ++--
 arch/x86/kernel/cpu/mtrr/generic.c | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf23..a2e0caf26e1 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
 	 * or BIOS forget to put that in reserved.
 	 * try to update e820 to make that region as reserved.
 	 */
-	u32 agp_aper_base = 0, agp_aper_order = 0;
+	u32 agp_aper_order = 0;
 	int i, fix, slot, valid_agp = 0;
 	u32 ctl;
 	u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
 		return;
 
 	/* This is mostly duplicate of iommu_hole_init */
-	agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+	search_agp_bridge(&agp_aper_order, &valid_agp);
 
 	fix = 0;
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61..7d28d7d0388 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 {
 	unsigned int mask_lo, mask_hi, base_lo, base_hi;
 	unsigned int tmp, hi;
-	int cpu;
 
 	/*
 	 * get_mtrr doesn't need to update mtrr_state, also it could be called
 	 * from any cpu, so try to print it out directly.
 	 */
-	cpu = get_cpu();
+	get_cpu();
 
 	rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
 
-- 
cgit v1.2.3-70-g09d2


From db10db48b2c530def21bfd76d576702c7df7f620 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 20 Jul 2010 20:50:49 +0200
Subject: x86, xsave: 32/64 bit boot cpu check unification in initialization

Boot cpu id is always 0, thus simplifying and unifying boot cpu check.

boot_cpu_id is there for historical reasons and was renamed to
boot_cpu_physical_apicid in patch:

 c70dcb7 x86: change boot_cpu_id to boot_cpu_physical_apicid

However, there are some remaining occurrences of boot_cpu_id that are
never touched in the kernel and thus its value is always 0.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1279651857-24639-3-git-send-email-robert.richter@amd.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3f715efc594..26804b2986b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1273,7 +1273,7 @@ void __cpuinit cpu_init(void)
 	/*
 	 * Boot processor to setup the FP and extended state context info.
 	 */
-	if (smp_processor_id() == boot_cpu_id)
+	if (!smp_processor_id())
 		init_thread_xstate();
 
 	xsave_init();
-- 
cgit v1.2.3-70-g09d2


From 82d4150cec83b9775f84810b39a1c0b91585d429 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 20 Jul 2010 20:50:51 +0200
Subject: x86, xsave: Move boot cpu initialization to xsave_init()

This patch moves boot cpu initialization to xsave_init(). Now all cpus
are initialized in one single function.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1279651857-24639-5-git-send-email-robert.richter@amd.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/common.c |  6 ------
 arch/x86/kernel/i387.c       |  5 -----
 arch/x86/kernel/xsave.c      | 14 ++++++++++++--
 3 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 26804b2986b..40561085d4f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1270,12 +1270,6 @@ void __cpuinit cpu_init(void)
 	clear_used_math();
 	mxcsr_feature_mask_init();
 
-	/*
-	 * Boot processor to setup the FP and extended state context info.
-	 */
-	if (!smp_processor_id())
-		init_thread_xstate();
-
 	xsave_init();
 }
 #endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 6106af9fd12..2f32ef05f10 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -93,11 +93,6 @@ void __cpuinit fpu_init(void)
 
 	write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
 
-	/*
-	 * Boot processor to setup the FP and extended state context info.
-	 */
-	if (!smp_processor_id())
-		init_thread_xstate();
 	xsave_init();
 
 	mxcsr_feature_mask_init();
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 368047c8d50..ab9ad48b653 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -360,7 +360,7 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
 /*
  * Enable the extended processor state save/restore feature
  */
-void __cpuinit xsave_init(void)
+static void __cpuinit __xsave_init(void)
 {
 	if (!cpu_has_xsave)
 		return;
@@ -446,7 +446,7 @@ void __ref xsave_cntxt_init(void)
 	 * Support only the state known to OS.
 	 */
 	pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
-	xsave_init();
+	__xsave_init();
 
 	/*
 	 * Recompute the context size for enabled features
@@ -463,3 +463,13 @@ void __ref xsave_cntxt_init(void)
 	       "cntxt size 0x%x\n",
 	       pcntxt_mask, xstate_size);
 }
+
+void __cpuinit xsave_init(void)
+{
+	/*
+	 * Boot processor to setup the FP and extended state context info.
+	 */
+	if (!smp_processor_id())
+		init_thread_xstate();
+	__xsave_init();
+}
-- 
cgit v1.2.3-70-g09d2


From 5edd19af18a36a4e22c570b1b969179e0ca1fe4c Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Tue, 20 Jul 2010 18:09:05 -0500
Subject: x86, UV: Make kdump avoid stack dumps

UV NMI callbacks should not write stack dumps when a kdump is to be written.

When invoking the crash kernel to write a dump, kdump_nmi_shootdown_cpus()
uses NMIs to get all the cpus to save their register context and halt.

But the NMI interrupt handler runs a callback list.  This patch sets a flag
to prevent any of those callbacks from interfering with the halt of the cpu.

For UV, which currently has the only callback to which this is relevant, the
uv_handle_nmi() callback should not do dumping of stacks.

The 'in_crash_kexec' flag is defined as an extern in kdebug.h firstly
because x2apic_uv_x.c includes it.  Secondly because some future callback
might need the flag to know that it should not enter the debugger.
(Such a scenario was in fact present in the 2.6.32 kernel, SuSE distribution,
 where a call to kdb needed to be avoided.)

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1ObLvt-0005UZ-Va@eag09.americas.sgi.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/kdebug.h      | 1 +
 arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++++
 arch/x86/kernel/crash.c            | 3 +++
 3 files changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fa7c0b97476..7a2910b3512 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -33,5 +33,6 @@ extern void __show_regs(struct pt_regs *regs, int all);
 extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
+extern int in_crash_kexec;
 
 #endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e3..7b598b84c90 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 {
 	if (reason != DIE_NMI_IPI)
 		return NOTIFY_OK;
+
+	if (in_crash_kexec)
+		/* do nothing if entering the crash kernel */
+		return NOTIFY_OK;
 	/*
 	 * Use a lock so only one cpu prints at a time
 	 * to prevent intermixed output.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ebd4c51d096..764c7c2b181 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,6 +28,8 @@
 #include <asm/reboot.h>
 #include <asm/virtext.h>
 
+int in_crash_kexec;
+
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
 static void kdump_nmi_callback(int cpu, struct die_args *args)
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
 
 static void kdump_nmi_shootdown_cpus(void)
 {
+	in_crash_kexec = 1;
 	nmi_shootdown_cpus(kdump_nmi_callback);
 
 	disable_local_APIC();
-- 
cgit v1.2.3-70-g09d2


From 0e49bf66d2ca649b167428adddbbbe9d9bd4894c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 21 Jul 2010 19:03:52 +0200
Subject: x86, xsave: Separate fpu and xsave initialization

As xsave also supports features other than the fpu, it should be
initialized independently of the fpu. This patch moves this out of fpu
initialization.

There is also a lot of cross referencing between fpu and xsave
code. This patch reduces this by making xsave_cntxt_init() and
init_thread_xstate() static functions.

The patch moves the cpu_has_xsave check at the beginning of
xsave_init(). All other checks may then be removed.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1279731838-1522-2-git-send-email-robert.richter@amd.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/i387.h  |  1 -
 arch/x86/include/asm/xsave.h |  1 -
 arch/x86/kernel/cpu/common.c |  2 ++
 arch/x86/kernel/i387.c       | 27 +++++++++++++++++++--------
 arch/x86/kernel/xsave.c      | 10 +++++-----
 5 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 59bd93ac7fe..509ddabeae2 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
 extern asmlinkage void math_state_restore(void);
 extern void __math_state_restore(void);
-extern void init_thread_xstate(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
 extern user_regset_active_fn fpregs_active, xfpregs_active;
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 94d5f84d89f..4d3b5d1fc02 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -28,7 +28,6 @@ extern u64 pcntxt_mask;
 extern struct xsave_struct *init_xstate_buf;
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
-extern void xsave_cntxt_init(void);
 extern void xsave_init(void);
 extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
 extern int init_fpu(struct task_struct *child);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 40561085d4f..94c36c7ac18 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1210,6 +1210,7 @@ void __cpuinit cpu_init(void)
 	dbg_restore_debug_regs();
 
 	fpu_init();
+	xsave_init();
 
 	raw_local_save_flags(kernel_eflags);
 
@@ -1270,6 +1271,7 @@ void __cpuinit cpu_init(void)
 	clear_used_math();
 	mxcsr_feature_mask_init();
 
+	fpu_init();
 	xsave_init();
 }
 #endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 2f32ef05f10..e73c54ebafc 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void)
 	stts();
 }
 
-void __cpuinit init_thread_xstate(void)
+static void __cpuinit init_thread_xstate(void)
 {
+	/*
+	 * Note that xstate_size might be overwritten later during
+	 * xsave_init().
+	 */
+
 	if (!HAVE_HWFP) {
 		xstate_size = sizeof(struct i387_soft_struct);
 		return;
 	}
 
-	if (cpu_has_xsave) {
-		xsave_cntxt_init();
-		return;
-	}
-
 	if (cpu_has_fxsr)
 		xstate_size = sizeof(struct i387_fxsave_struct);
 #ifdef CONFIG_X86_32
@@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void)
  * Called at bootup to set up the initial FPU state that is later cloned
  * into all processes.
  */
+
 void __cpuinit fpu_init(void)
 {
 	unsigned long oldcr0 = read_cr0();
@@ -93,14 +94,24 @@ void __cpuinit fpu_init(void)
 
 	write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
 
-	xsave_init();
+	if (!smp_processor_id())
+		init_thread_xstate();
 
 	mxcsr_feature_mask_init();
 	/* clean state in init */
 	current_thread_info()->status = 0;
 	clear_used_math();
 }
-#endif	/* CONFIG_X86_64 */
+
+#else	/* CONFIG_X86_64 */
+
+void __cpuinit fpu_init(void)
+{
+	if (!smp_processor_id())
+		init_thread_xstate();
+}
+
+#endif	/* CONFIG_X86_32 */
 
 static void fpu_finit(struct fpu *fpu)
 {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index ab9ad48b653..550bf45236f 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -362,9 +362,6 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
  */
 static void __cpuinit __xsave_init(void)
 {
-	if (!cpu_has_xsave)
-		return;
-
 	set_in_cr4(X86_CR4_OSXSAVE);
 
 	/*
@@ -429,7 +426,7 @@ static void __init setup_xstate_init(void)
 /*
  * Enable and initialize the xsave feature.
  */
-void __ref xsave_cntxt_init(void)
+static void __cpuinit xsave_cntxt_init(void)
 {
 	unsigned int eax, ebx, ecx, edx;
 
@@ -466,10 +463,13 @@ void __ref xsave_cntxt_init(void)
 
 void __cpuinit xsave_init(void)
 {
+	if (!cpu_has_xsave)
+		return;
+
 	/*
 	 * Boot processor to setup the FP and extended state context info.
 	 */
 	if (!smp_processor_id())
-		init_thread_xstate();
+		xsave_cntxt_init();
 	__xsave_init();
 }
-- 
cgit v1.2.3-70-g09d2


From 97e80a70db689fb1e876df9f12305cc72f85ca53 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 21 Jul 2010 19:03:53 +0200
Subject: x86, xsave: Introduce xstate enable functions

The patch renames xsave_cntxt_init() and __xsave_init() into
xstate_enable_boot_cpu() and xstate_enable() as these names are more
meaningful.

It also removes the duplicate xcr setup for the boot cpu.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1279731838-1522-3-git-send-email-robert.richter@amd.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/xsave.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 550bf45236f..2322f586c05 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -360,15 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
 /*
  * Enable the extended processor state save/restore feature
  */
-static void __cpuinit __xsave_init(void)
+static inline void xstate_enable(u64 mask)
 {
 	set_in_cr4(X86_CR4_OSXSAVE);
-
-	/*
-	 * Enable all the features that the HW is capable of
-	 * and the Linux kernel is aware of.
-	 */
-	xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+	xsetbv(XCR_XFEATURE_ENABLED_MASK, mask);
 }
 
 /*
@@ -426,7 +421,7 @@ static void __init setup_xstate_init(void)
 /*
  * Enable and initialize the xsave feature.
  */
-static void __cpuinit xsave_cntxt_init(void)
+static void __cpuinit xstate_enable_boot_cpu(void)
 {
 	unsigned int eax, ebx, ecx, edx;
 
@@ -443,7 +438,8 @@ static void __cpuinit xsave_cntxt_init(void)
 	 * Support only the state known to OS.
 	 */
 	pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
-	__xsave_init();
+
+	xstate_enable(pcntxt_mask);
 
 	/*
 	 * Recompute the context size for enabled features
@@ -470,6 +466,7 @@ void __cpuinit xsave_init(void)
 	 * Boot processor to setup the FP and extended state context info.
 	 */
 	if (!smp_processor_id())
-		xsave_cntxt_init();
-	__xsave_init();
+		xstate_enable_boot_cpu();
+	else
+		xstate_enable(pcntxt_mask);
 }
-- 
cgit v1.2.3-70-g09d2


From ee813d53a8e980a3a28318efb8935d45723f5211 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 21 Jul 2010 19:03:54 +0200
Subject: x86, xsave: Check cpuid level for XSTATE_CPUID (0x0d)

The patch introduces the XSTATE_CPUID macro and adds a check that
tests if XSTATE_CPUID exists.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1279731838-1522-4-git-send-email-robert.richter@amd.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/xsave.h |  2 ++
 arch/x86/kernel/xsave.c      | 11 ++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 4d3b5d1fc02..d1b5f3a2fa2 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -4,6 +4,8 @@
 #include <linux/types.h>
 #include <asm/processor.h>
 
+#define XSTATE_CPUID		0x0000000d
+
 #define XSTATE_FP	0x1
 #define XSTATE_SSE	0x2
 #define XSTATE_YMM	0x4
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 2322f586c05..5adb7fb408f 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -379,7 +379,7 @@ static void setup_xstate_features(void)
 	xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
 
 	do {
-		cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx);
+		cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
 
 		if (eax == 0)
 			break;
@@ -425,7 +425,12 @@ static void __cpuinit xstate_enable_boot_cpu(void)
 {
 	unsigned int eax, ebx, ecx, edx;
 
-	cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+		WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
+		return;
+	}
+
+	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
 	pcntxt_mask = eax + ((u64)edx << 32);
 
 	if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -444,7 +449,7 @@ static void __cpuinit xstate_enable_boot_cpu(void)
 	/*
 	 * Recompute the context size for enabled features
 	 */
-	cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
 	xstate_size = ebx;
 
 	update_regset_xstate_info(xstate_size, pcntxt_mask);
-- 
cgit v1.2.3-70-g09d2


From 45c2d7f46211a0b1f6b425c59575c53145afc4b4 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 21 Jul 2010 19:03:55 +0200
Subject: x86, xsave: Make init_xstate_buf static

The pointer is only used in xsave.c, so make it static.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1279731838-1522-5-git-send-email-robert.richter@amd.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/xsave.h |  1 -
 arch/x86/kernel/xsave.c      | 10 +++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index d1b5f3a2fa2..0ae6b996198 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -27,7 +27,6 @@
 
 extern unsigned int xstate_size;
 extern u64 pcntxt_mask;
-extern struct xsave_struct *init_xstate_buf;
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
 extern void xsave_init(void);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 5adb7fb408f..3b44a9b1eca 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,6 +16,11 @@
  */
 u64 pcntxt_mask;
 
+/*
+ * Represents init state for the supported extended state.
+ */
+static struct xsave_struct *init_xstate_buf;
+
 struct _fpx_sw_bytes fx_sw_reserved;
 #ifdef CONFIG_IA32_EMULATION
 struct _fpx_sw_bytes fx_sw_reserved_ia32;
@@ -348,11 +353,6 @@ static void prepare_fx_sw_frame(void)
 #endif
 }
 
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
 #ifdef CONFIG_X86_64
 unsigned int sig_xstate_size = sizeof(struct _fpstate);
 #endif
-- 
cgit v1.2.3-70-g09d2


From 4995b9dba908436c1611454f9bd2cb3ddf6babee Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 21 Jul 2010 19:03:56 +0200
Subject: x86, xsave: Add __init attribute to setup_xstate_features()

This is called only from initialization code.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1279731838-1522-6-git-send-email-robert.richter@amd.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/xsave.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 3b44a9b1eca..cfc7901ee94 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -370,7 +370,7 @@ static inline void xstate_enable(u64 mask)
  * Record the offsets and sizes of different state managed by the xsave
  * memory layout.
  */
-static void setup_xstate_features(void)
+static void __init setup_xstate_features(void)
 {
 	int eax, ebx, ecx, edx, leaf = 0x2;
 
-- 
cgit v1.2.3-70-g09d2


From 1cff92d8fdb27684308864d9cdb324bee43b40ab Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 21 Jul 2010 14:23:10 -0700
Subject: x86, xsave: Make xstate_enable_boot_cpu() __init, protect on CPU 0

xstate_enable_boot_cpu() is, as the name implies, only used on the
boot CPU; furthermore, it invokes alloc_bootmem(), which is __init;
hence it needs to be tagged __init rather than __cpuinit.

Furthermore, it is *not* safe in the long run to rely on CPU 0 only
coming online during the early boot -- at some point we're going to
support offlining (and re-onlining) the boot CPU, and at that point we
must not call xstate_enable_boot_cpu() again.

The code is a fair bit more obscure than one would like, because the
__ref overrides aren't quite powerful enough.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <4C476236.1020302@zytor.com>
---
 arch/x86/kernel/xsave.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index cfc7901ee94..b2549c3eb2c 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -360,10 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
 /*
  * Enable the extended processor state save/restore feature
  */
-static inline void xstate_enable(u64 mask)
+static inline void xstate_enable(void)
 {
 	set_in_cr4(X86_CR4_OSXSAVE);
-	xsetbv(XCR_XFEATURE_ENABLED_MASK, mask);
+	xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
 }
 
 /*
@@ -421,7 +421,7 @@ static void __init setup_xstate_init(void)
 /*
  * Enable and initialize the xsave feature.
  */
-static void __cpuinit xstate_enable_boot_cpu(void)
+static void __init xstate_enable_boot_cpu(void)
 {
 	unsigned int eax, ebx, ecx, edx;
 
@@ -444,7 +444,7 @@ static void __cpuinit xstate_enable_boot_cpu(void)
 	 */
 	pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
 
-	xstate_enable(pcntxt_mask);
+	xstate_enable();
 
 	/*
 	 * Recompute the context size for enabled features
@@ -462,16 +462,22 @@ static void __cpuinit xstate_enable_boot_cpu(void)
 	       pcntxt_mask, xstate_size);
 }
 
+/*
+ * For the very first instance, this calls xstate_enable_boot_cpu();
+ * for all subsequent instances, this calls xstate_enable().
+ *
+ * This is somewhat obfuscated due to the lack of powerful enough
+ * overrides for the section checks.
+ */
 void __cpuinit xsave_init(void)
 {
+	static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
+	void (*this_func)(void);
+
 	if (!cpu_has_xsave)
 		return;
 
-	/*
-	 * Boot processor to setup the FP and extended state context info.
-	 */
-	if (!smp_processor_id())
-		xstate_enable_boot_cpu();
-	else
-		xstate_enable(pcntxt_mask);
+	this_func = next_func;
+	next_func = xstate_enable;
+	this_func();
 }
-- 
cgit v1.2.3-70-g09d2


From cfaa71ee9794472598d3966c3315cd6bd8f953d3 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sat, 17 Jul 2010 09:03:27 -0400
Subject: x86: Use symbolic MSR names

Use symbolic MSR names instead of hardcoding the MSR index.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1279371808-24804-2-git-send-email-brgerst@gmail.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/realmode/wakeup.S | 2 +-
 arch/x86/kernel/verify_cpu_64.S        | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e29601..28595d6df47 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
 	movl	%eax, %ecx
 	orl	%edx, %ecx
 	jz	1f
-	movl	$0xc0000080, %ecx
+	movl	$MSR_EFER, %ecx
 	wrmsr
 1:
 
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a..56a8c2a867d 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
  */
 
 #include <asm/cpufeature.h>
+#include <asm/msr-index.h>
 
 verify_cpu:
 	pushfl				# Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
 	je	verify_cpu_sse_ok
 	test	%di,%di
 	jz	verify_cpu_no_longmode	# only try to force SSE on AMD
-	movl	$0xc0010015,%ecx	# HWCR
+	movl	$MSR_K7_HWCR,%ecx
 	rdmsr
 	btr	$15,%eax		# enable SSE
 	wrmsr
-- 
cgit v1.2.3-70-g09d2


From 650fb4393dff543bc980d361555c489fbdeed088 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sat, 17 Jul 2010 09:03:28 -0400
Subject: x86-64: Simplify loading initial_gs

Load initial_gs as two 32-bit values instead of splitting a 64-bit value.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1279371808-24804-3-git-send-email-brgerst@gmail.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head_64.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a..239046bd447 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
 	 * init data section till per cpu areas are set up.
 	 */
 	movl	$MSR_GS_BASE,%ecx
-	movq	initial_gs(%rip),%rax
-	movq    %rax,%rdx
-	shrq	$32,%rdx
+	movl	initial_gs(%rip),%eax
+	movl	initial_gs+4(%rip),%edx
 	wrmsr	
 
 	/* esi is pointer to real mode structure with interesting info.
-- 
cgit v1.2.3-70-g09d2


From 4c21adf26f8fcf86a755b9b9f55c2e9fd241e1fb Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Tue, 20 Jul 2010 16:59:34 -0700
Subject: x86 cpufreq, perf: Make trace_power_frequency cpufreq driver
 independent

and fix the broken case if a core's frequency depends on others.

trace_power_frequency was only implemented in a rather ungeneric
way in acpi-cpufreq driver's target() function only.

-> Move the call to trace_power_frequency to
   cpufreq.c:cpufreq_notify_transition() where CPUFREQ_POSTCHANGE
   notifier is triggered.
   This will support power frequency tracing by all cpufreq
   drivers.

trace_power_frequency did not trace frequency changes correctly
when the userspace governor was used or when CPU cores'
frequency depend on each other.

-> Moving this into the CPUFREQ_POSTCHANGE notifier and pass the cpu
   which gets switched automatically fixes this.

Robert Schoene provided some important fixes on top of my
initial quick shot version which are integrated in this patch:
- Forgot some changes in power_end trace (TP_printk/variable names)
- Variable dummy in power_end must now be cpu_id
- Use static 64 bit variable instead of unsigned int for cpu_id

[akpm@linux-foundation.org: build fix]
Signed-off-by: Thomas Renninger <trenn@suse.de>
Cc: davej@codemonkey.org.uk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Dave Jones <davej@codemonkey.org.uk>
Acked-by: Arjan van de Ven <arjan@infradead.org>
Cc: Robert Schoene <robert.schoene@tu-dresden.de>
Tested-by: Robert Schoene <robert.schoene@tu-dresden.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |  3 ---
 arch/x86/kernel/process.c                  |  8 ++++----
 drivers/cpufreq/cpufreq.c                  |  3 +++
 drivers/cpuidle/cpuidle.c                  |  2 +-
 drivers/idle/intel_idle.c                  |  2 +-
 include/trace/events/power.h               | 27 +++++++++++++++------------
 tools/perf/builtin-timechart.c             | 11 ++++++-----
 7 files changed, 30 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1d3cddaa40e..cee5263927c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
 #include <linux/compiler.h>
 #include <linux/dmi.h>
 #include <linux/slab.h>
-#include <trace/events/power.h>
 
 #include <linux/acpi.h>
 #include <linux/io.h>
@@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
-	trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
-
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32..787572d43d9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -371,7 +371,7 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
-		trace_power_start(POWER_CSTATE, 1);
+		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
-	trace_power_start(POWER_CSTATE, (ax>>4)+1);
+	trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
 	if (!need_resched()) {
 		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
@@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 static void mwait_idle(void)
 {
 	if (!need_resched()) {
-		trace_power_start(POWER_CSTATE, 1);
+		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
 		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
 
@@ -478,7 +478,7 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
-	trace_power_start(POWER_CSTATE, 0);
+	trace_power_start(POWER_CSTATE, 0, smp_processor_id());
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 063b2184caf..4ed665725cc 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -29,6 +29,8 @@
 #include <linux/completion.h>
 #include <linux/mutex.h>
 
+#include <trace/events/power.h>
+
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \
 						"cpufreq-core", msg)
 
@@ -354,6 +356,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
 
 	case CPUFREQ_POSTCHANGE:
 		adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);
+                trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
 		srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
 				CPUFREQ_POSTCHANGE, freqs);
 		if (likely(policy) && likely(policy->cpu == freqs->cpu))
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 199488576a0..dbefe15bd58 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -95,7 +95,7 @@ static void cpuidle_idle_call(void)
 	/* give the governor an opportunity to reflect on the outcome */
 	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev);
-	trace_power_end(0);
+	trace_power_end(smp_processor_id());
 }
 
 /**
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 54f0fb4cd5d..03d202b1ff2 100755
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -231,7 +231,7 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
 
 	stop_critical_timings();
 #ifndef MODULE
-	trace_power_start(POWER_CSTATE, (eax >> 4) + 1);
+	trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
 #endif
 	if (!need_resched()) {
 
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index c4efe9b8280..35a2a6e7bf1 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -18,52 +18,55 @@ enum {
 
 DECLARE_EVENT_CLASS(power,
 
-	TP_PROTO(unsigned int type, unsigned int state),
+	TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
 
-	TP_ARGS(type, state),
+	TP_ARGS(type, state, cpu_id),
 
 	TP_STRUCT__entry(
 		__field(	u64,		type		)
 		__field(	u64,		state		)
+		__field(	u64,		cpu_id		)
 	),
 
 	TP_fast_assign(
 		__entry->type = type;
 		__entry->state = state;
+		__entry->cpu_id = cpu_id;
 	),
 
-	TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long)__entry->state)
+	TP_printk("type=%lu state=%lu cpu_id=%lu", (unsigned long)__entry->type,
+		(unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
 );
 
 DEFINE_EVENT(power, power_start,
 
-	TP_PROTO(unsigned int type, unsigned int state),
+	TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
 
-	TP_ARGS(type, state)
+	TP_ARGS(type, state, cpu_id)
 );
 
 DEFINE_EVENT(power, power_frequency,
 
-	TP_PROTO(unsigned int type, unsigned int state),
+	TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
 
-	TP_ARGS(type, state)
+	TP_ARGS(type, state, cpu_id)
 );
 
 TRACE_EVENT(power_end,
 
-	TP_PROTO(int dummy),
+	TP_PROTO(unsigned int cpu_id),
 
-	TP_ARGS(dummy),
+	TP_ARGS(cpu_id),
 
 	TP_STRUCT__entry(
-		__field(	u64,		dummy		)
+		__field(	u64,		cpu_id		)
 	),
 
 	TP_fast_assign(
-		__entry->dummy = 0xffff;
+		__entry->cpu_id = cpu_id;
 	),
 
-	TP_printk("dummy=%lu", (unsigned long)__entry->dummy)
+	TP_printk("cpu_id=%lu", (unsigned long)__entry->cpu_id)
 
 );
 
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 5a52ed9fc10..5161619d471 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -300,8 +300,9 @@ struct trace_entry {
 
 struct power_entry {
 	struct trace_entry te;
-	s64	type;
-	s64	value;
+	u64	type;
+	u64	value;
+	u64	cpu_id;
 };
 
 #define TASK_COMM_LEN 16
@@ -498,13 +499,13 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 			return 0;
 
 		if (strcmp(event_str, "power:power_start") == 0)
-			c_state_start(data.cpu, data.time, pe->value);
+			c_state_start(pe->cpu_id, data.time, pe->value);
 
 		if (strcmp(event_str, "power:power_end") == 0)
-			c_state_end(data.cpu, data.time);
+			c_state_end(pe->cpu_id, data.time);
 
 		if (strcmp(event_str, "power:power_frequency") == 0)
-			p_state_change(data.cpu, data.time, pe->value);
+			p_state_change(pe->cpu_id, data.time, pe->value);
 
 		if (strcmp(event_str, "sched:sched_wakeup") == 0)
 			sched_wakeup(data.cpu, data.time, data.pid, te);
-- 
cgit v1.2.3-70-g09d2


From bee6ab53e652a414af20392899879b58cd80d033 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Fri, 14 May 2010 12:39:33 +0100
Subject: x86: early PV on HVM features initialization.

Initialize basic pv on hvm features adding a new Xen HVM specific
hypervisor_x86 structure.

Don't try to initialize xen-kbdfront and xen-fbfront when running on HVM
because the backends are not available.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/hypervisor.h |   1 +
 arch/x86/kernel/cpu/hypervisor.c  |   1 +
 arch/x86/xen/enlighten.c          | 100 ++++++++++++++++++++++++++++++++++++++
 drivers/input/xen-kbdfront.c      |   2 +-
 drivers/video/xen-fbfront.c       |   2 +-
 drivers/xen/xenbus/xenbus_probe.c |  21 ++++++--
 6 files changed, 122 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 70abda7058c..ff2546ce717 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper;
 /* Recognized hypervisors */
 extern const struct hypervisor_x86 x86_hyper_vmware;
 extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
 
 #endif
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8..bffd47c10fe 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
 	&x86_hyper_vmware,
 	&x86_hyper_ms_hyperv,
+	&x86_hyper_xen_hvm,
 };
 
 const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 65d8d79b46a..09b36e9d507 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -35,6 +35,7 @@
 #include <xen/interface/version.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/memory.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvc-console.h>
@@ -55,7 +56,9 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
+#include <asm/setup.h>
 #include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -76,6 +79,8 @@ struct shared_info xen_dummy_shared_info;
 
 void *xen_initial_gdt;
 
+RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
+
 /*
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
@@ -1206,3 +1211,98 @@ asmlinkage void __init xen_start_kernel(void)
 	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
 #endif
 }
+
+static uint32_t xen_cpuid_base(void)
+{
+	uint32_t base, eax, ebx, ecx, edx;
+	char signature[13];
+
+	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+		cpuid(base, &eax, &ebx, &ecx, &edx);
+		*(uint32_t *)(signature + 0) = ebx;
+		*(uint32_t *)(signature + 4) = ecx;
+		*(uint32_t *)(signature + 8) = edx;
+		signature[12] = 0;
+
+		if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
+			return base;
+	}
+
+	return 0;
+}
+
+static int init_hvm_pv_info(int *major, int *minor)
+{
+	uint32_t eax, ebx, ecx, edx, pages, msr, base;
+	u64 pfn;
+
+	base = xen_cpuid_base();
+	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+	*major = eax >> 16;
+	*minor = eax & 0xffff;
+	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
+
+	cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	xen_setup_features();
+
+	pv_info = xen_info;
+	pv_info.kernel_rpl = 0;
+
+	xen_domain_type = XEN_HVM_DOMAIN;
+
+	return 0;
+}
+
+static void __init init_shared_info(void)
+{
+	struct xen_add_to_physmap xatp;
+	struct shared_info *shared_info_page;
+
+	shared_info_page = (struct shared_info *)
+		extend_brk(PAGE_SIZE, PAGE_SIZE);
+	xatp.domid = DOMID_SELF;
+	xatp.idx = 0;
+	xatp.space = XENMAPSPACE_shared_info;
+	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+		BUG();
+
+	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+}
+
+static void __init xen_hvm_guest_init(void)
+{
+	int r;
+	int major, minor;
+
+	r = init_hvm_pv_info(&major, &minor);
+	if (r < 0)
+		return;
+
+	init_shared_info();
+}
+
+static bool __init xen_hvm_platform(void)
+{
+	if (xen_pv_domain())
+		return false;
+
+	if (!xen_cpuid_base())
+		return false;
+
+	return true;
+}
+
+const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+	.name			= "Xen HVM",
+	.detect			= xen_hvm_platform,
+	.init_platform		= xen_hvm_guest_init,
+};
+EXPORT_SYMBOL(x86_hyper_xen_hvm);
diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
index e14081675bb..ebb11907d40 100644
--- a/drivers/input/xen-kbdfront.c
+++ b/drivers/input/xen-kbdfront.c
@@ -339,7 +339,7 @@ static struct xenbus_driver xenkbd_driver = {
 
 static int __init xenkbd_init(void)
 {
-	if (!xen_domain())
+	if (!xen_pv_domain())
 		return -ENODEV;
 
 	/* Nothing to do if running in dom0. */
diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
index fa97d3e7c21..7c7f42a1279 100644
--- a/drivers/video/xen-fbfront.c
+++ b/drivers/video/xen-fbfront.c
@@ -684,7 +684,7 @@ static struct xenbus_driver xenfb_driver = {
 
 static int __init xenfb_init(void)
 {
-	if (!xen_domain())
+	if (!xen_pv_domain())
 		return -ENODEV;
 
 	/* Nothing to do if running in dom0. */
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 3479332113e..d96fa75b45e 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -56,6 +56,8 @@
 #include <xen/events.h>
 #include <xen/page.h>
 
+#include <xen/hvm.h>
+
 #include "xenbus_comms.h"
 #include "xenbus_probe.h"
 
@@ -805,11 +807,24 @@ static int __init xenbus_probe_init(void)
 	if (xen_initial_domain()) {
 		/* dom0 not yet supported */
 	} else {
+		if (xen_hvm_domain()) {
+			uint64_t v = 0;
+			err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+			if (err)
+				goto out_error;
+			xen_store_evtchn = (int)v;
+			err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+			if (err)
+				goto out_error;
+			xen_store_mfn = (unsigned long)v;
+			xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
+		} else {
+			xen_store_evtchn = xen_start_info->store_evtchn;
+			xen_store_mfn = xen_start_info->store_mfn;
+			xen_store_interface = mfn_to_virt(xen_store_mfn);
+		}
 		xenstored_ready = 1;
-		xen_store_evtchn = xen_start_info->store_evtchn;
-		xen_store_mfn = xen_start_info->store_mfn;
 	}
-	xen_store_interface = mfn_to_virt(xen_store_mfn);
 
 	/* Initialize the interface to xenstore. */
 	err = xs_init();
-- 
cgit v1.2.3-70-g09d2


From 38e20b07efd541a959de367dc90a17f92ce2e8a6 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Fri, 14 May 2010 12:40:51 +0100
Subject: x86/xen: event channels delivery on HVM.

Set the callback to receive evtchns from Xen, using the
callback vector delivery mechanism.

The traditional way for receiving event channel notifications from Xen
is via the interrupts from the platform PCI device.
The callback vector is a newer alternative that allows us to receive
notifications on any vcpu and doesn't need any PCI support: we allocate
a vector exclusively to receive events, in the vector handler we don't
need to interact with the vlapic, therefore we avoid a VMEXIT.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/irq_vectors.h |  3 ++
 arch/x86/kernel/entry_32.S         |  3 ++
 arch/x86/kernel/entry_64.S         |  3 ++
 arch/x86/xen/enlighten.c           | 28 +++++++++++++++
 arch/x86/xen/xen-ops.h             |  2 ++
 drivers/xen/events.c               | 70 ++++++++++++++++++++++++++++++++++----
 include/xen/events.h               |  7 ++++
 include/xen/hvm.h                  |  6 ++++
 include/xen/interface/features.h   |  3 ++
 9 files changed, 118 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8767d99c4f6..e2ca3009255 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -125,6 +125,9 @@
  */
 #define MCE_SELF_VECTOR			0xeb
 
+/* Xen vector callback to receive events in a HVM domain */
+#define XEN_HVM_EVTCHN_CALLBACK		0xe9
+
 #define NR_VECTORS			 256
 
 #define FPU_IRQ				  13
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf15..6b196834a0d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1166,6 +1166,9 @@ ENTRY(xen_failsafe_callback)
 .previous
 ENDPROC(xen_failsafe_callback)
 
+BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
+		xen_evtchn_do_upcall)
+
 #endif	/* CONFIG_XEN */
 
 #ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0697ff13983..490ae2bb18a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
 	CFI_ENDPROC
 END(xen_failsafe_callback)
 
+apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+	xen_hvm_callback_vector xen_evtchn_do_upcall
+
 #endif /* CONFIG_XEN */
 
 /*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 09b36e9d507..b211a04c4b2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
 
+#include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/smp.h>
@@ -38,6 +39,7 @@
 #include <xen/interface/memory.h>
 #include <xen/features.h>
 #include <xen/page.h>
+#include <xen/hvm.h>
 #include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
@@ -80,6 +82,8 @@ struct shared_info xen_dummy_shared_info;
 void *xen_initial_gdt;
 
 RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
 
 /*
  * Point at some empty memory to start with. We map the real shared_info
@@ -1277,6 +1281,24 @@ static void __init init_shared_info(void)
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
 }
 
+static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
+				    unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+	switch (action) {
+	case CPU_UP_PREPARE:
+		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+	.notifier_call	= xen_hvm_cpu_notify,
+};
+
 static void __init xen_hvm_guest_init(void)
 {
 	int r;
@@ -1287,6 +1309,12 @@ static void __init xen_hvm_guest_init(void)
 		return;
 
 	init_shared_info();
+
+	if (xen_feature(XENFEAT_hvm_callback_vector))
+		xen_have_vector_callback = 1;
+	register_cpu_notifier(&xen_hvm_cpu_notifier);
+	have_vcpu_info_placement = 0;
+	x86_init.irqs.intr_init = xen_init_IRQ;
 }
 
 static bool __init xen_hvm_platform(void)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f9153a300bc..0d0e0e6a747 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -38,6 +38,8 @@ void xen_enable_sysenter(void);
 void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
+void xen_callback_vector(void);
+
 void __init xen_build_dynamic_phys_to_machine(void);
 
 void xen_init_irq_ops(void);
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index db8f506817f..d659480125f 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -29,6 +29,7 @@
 #include <linux/bootmem.h>
 #include <linux/slab.h>
 
+#include <asm/desc.h>
 #include <asm/ptrace.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
@@ -36,10 +37,14 @@
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
 
+#include <xen/xen.h>
+#include <xen/hvm.h>
 #include <xen/xen-ops.h>
 #include <xen/events.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/event_channel.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/hvm/params.h>
 
 /*
  * This lock protects updates to the following mapping and reference-count
@@ -617,17 +622,13 @@ static DEFINE_PER_CPU(unsigned, xed_nesting_count);
  * a bitset of words which contain pending event bits.  The second
  * level is a bitset of pending events themselves.
  */
-void xen_evtchn_do_upcall(struct pt_regs *regs)
+static void __xen_evtchn_do_upcall(void)
 {
 	int cpu = get_cpu();
-	struct pt_regs *old_regs = set_irq_regs(regs);
 	struct shared_info *s = HYPERVISOR_shared_info;
 	struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
  	unsigned count;
 
-	exit_idle();
-	irq_enter();
-
 	do {
 		unsigned long pending_words;
 
@@ -667,10 +668,26 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
 	} while(count != 1);
 
 out:
+
+	put_cpu();
+}
+
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	exit_idle();
+	irq_enter();
+
+	__xen_evtchn_do_upcall();
+
 	irq_exit();
 	set_irq_regs(old_regs);
+}
 
-	put_cpu();
+void xen_hvm_evtchn_do_upcall(void)
+{
+	__xen_evtchn_do_upcall();
 }
 
 /* Rebind a new event channel to an existing irq. */
@@ -933,6 +950,40 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
 	.retrigger	= retrigger_dynirq,
 };
 
+int xen_set_callback_via(uint64_t via)
+{
+	struct xen_hvm_param a;
+	a.domid = DOMID_SELF;
+	a.index = HVM_PARAM_CALLBACK_IRQ;
+	a.value = via;
+	return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
+}
+EXPORT_SYMBOL_GPL(xen_set_callback_via);
+
+/* Vector callbacks are better than PCI interrupts to receive event
+ * channel notifications because we can receive vector callbacks on any
+ * vcpu and we don't need PCI support or APIC interactions. */
+void xen_callback_vector(void)
+{
+	int rc;
+	uint64_t callback_via;
+	if (xen_have_vector_callback) {
+		callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK);
+		rc = xen_set_callback_via(callback_via);
+		if (rc) {
+			printk(KERN_ERR "Request for Xen HVM callback vector"
+					" failed.\n");
+			xen_have_vector_callback = 0;
+			return;
+		}
+		printk(KERN_INFO "Xen HVM callback vector for event delivery is "
+				"enabled\n");
+		/* in the restore case the vector has already been allocated */
+		if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors))
+			alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
+	}
+}
+
 void __init xen_init_IRQ(void)
 {
 	int i;
@@ -947,5 +998,10 @@ void __init xen_init_IRQ(void)
 	for (i = 0; i < NR_EVENT_CHANNELS; i++)
 		mask_evtchn(i);
 
-	irq_ctx_init(smp_processor_id());
+	if (xen_hvm_domain()) {
+		xen_callback_vector();
+		native_init_IRQ();
+	} else {
+		irq_ctx_init(smp_processor_id());
+	}
 }
diff --git a/include/xen/events.h b/include/xen/events.h
index e68d59a90ca..a15d93262e3 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -56,4 +56,11 @@ void xen_poll_irq(int irq);
 /* Determine the IRQ which is bound to an event channel */
 unsigned irq_from_evtchn(unsigned int evtchn);
 
+/* Xen HVM evtchn vector callback */
+extern void xen_hvm_callback_vector(void);
+extern int xen_have_vector_callback;
+int xen_set_callback_via(uint64_t via);
+void xen_evtchn_do_upcall(struct pt_regs *regs);
+void xen_hvm_evtchn_do_upcall(void);
+
 #endif	/* _XEN_EVENTS_H */
diff --git a/include/xen/hvm.h b/include/xen/hvm.h
index 5dfe8fb86e6..b193fa2f9fd 100644
--- a/include/xen/hvm.h
+++ b/include/xen/hvm.h
@@ -3,6 +3,7 @@
 #define XEN_HVM_H__
 
 #include <xen/interface/hvm/params.h>
+#include <asm/xen/hypercall.h>
 
 static inline int hvm_get_parameter(int idx, uint64_t *value)
 {
@@ -21,4 +22,9 @@ static inline int hvm_get_parameter(int idx, uint64_t *value)
 	return r;
 }
 
+#define HVM_CALLBACK_VIA_TYPE_VECTOR 0x2
+#define HVM_CALLBACK_VIA_TYPE_SHIFT 56
+#define HVM_CALLBACK_VECTOR(x) (((uint64_t)HVM_CALLBACK_VIA_TYPE_VECTOR)<<\
+		HVM_CALLBACK_VIA_TYPE_SHIFT | (x))
+
 #endif /* XEN_HVM_H__ */
diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h
index f51b6413b05..8ab08b91bf6 100644
--- a/include/xen/interface/features.h
+++ b/include/xen/interface/features.h
@@ -41,6 +41,9 @@
 /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
 #define XENFEAT_mmu_pt_update_preserve_ad  5
 
+/* x86: Does this Xen host support the HVM callback vector type? */
+#define XENFEAT_hvm_callback_vector        8
+
 #define XENFEAT_NR_SUBMAPS 1
 
 #endif /* __XEN_PUBLIC_FEATURES_H__ */
-- 
cgit v1.2.3-70-g09d2


From b43275d661baa5f1f72dacd9033d6eda09d9fe87 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 26 Jul 2010 10:38:45 -0700
Subject: xen/pvhvm: fix build problem when !CONFIG_XEN

x86_hyper_xen_hvm is only defined when Xen is enabled in the kernel
config.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index bffd47c10fe..5bccedcb912 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,7 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
 	&x86_hyper_vmware,
 	&x86_hyper_ms_hyperv,
+#ifdef CONFIG_XEN
 	&x86_hyper_xen_hvm,
+#endif
 };
 
 const struct hypervisor_x86 *x86_hyper;
-- 
cgit v1.2.3-70-g09d2


From 8c73626ab28527b7eb7f3061c027fbfe530c488c Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:18 -0700
Subject: x86: Fix vtime/file timestamp inconsistencies

Due to vtime calling vgettimeofday(), it's possible that an application
could call  time();create("stuff",O_RDWR);  only to see the file's
creation timestamp be earlier than the value returned by time.

A similar way to reproduce the issue is to compare the vsyscall time()
with the syscall time(), and observe ordering issues.

The modified test case from Oleg Nesterov below can illustrate this:

int main(void)
{
	time_t sec1,sec2;
	do {
		sec1 = time(&sec2);
		sec2 = syscall(__NR_time, NULL);
	} while (sec1 <= sec2);

	printf("vtime: %d.000000\n", sec1);
	printf("time: %d.000000\n", sec2);
	return 0;
}

The proper fix is to make vtime use the same time value as
current_kernel_time() (which is exported via update_vsyscall) instead of
vgettime().

Thanks to Jiri Olsa for bringing up the issue and catching bugs in
earlier versions of this fix.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <1279068988-21864-2-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/vsyscall_64.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60..dce0c3c5a78 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
  * unlikely */
 time_t __vsyscall(1) vtime(time_t *t)
 {
-	struct timeval tv;
+	unsigned seq;
 	time_t result;
 	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
 		return time_syscall(t);
 
-	vgettimeofday(&tv, NULL);
-	result = tv.tv_sec;
+	do {
+		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+		result = __vsyscall_gtod_data.wall_time_sec;
+
+	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
 	if (t)
 		*t = result;
 	return result;
-- 
cgit v1.2.3-70-g09d2


From 7615856ebfee52b080c22d263ca4debbd0df0ac1 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:23 -0700
Subject: timekeeping: Fix update_vsyscall to provide wall_to_monotonic offset

update_vsyscall() did not provide the wall_to_monotonic offset,
so arch specific implementations tend to reference wall_to_monotonic
directly. This limits future cleanups in the timekeeping core, so
this patch fixes the update_vsyscall interface to provide
wall_to_monotonic, allowing wall_to_monotonic to be made static
as planned in Documentation/feature-removal-schedule.txt

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tony Luck <tony.luck@intel.com>
LKML-Reference: <1279068988-21864-7-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/ia64/kernel/time.c       | 7 ++++---
 arch/powerpc/kernel/time.c    | 8 ++++----
 arch/s390/kernel/time.c       | 8 ++++----
 arch/x86/kernel/vsyscall_64.c | 6 +++---
 include/linux/clocksource.h   | 6 ++++--
 kernel/time/timekeeping.c     | 9 ++++++---
 6 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 653b3c46ea8..ed6f22eb5b1 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -471,7 +471,8 @@ void update_vsyscall_tz(void)
 {
 }
 
-void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult)
+void update_vsyscall(struct timespec *wall, struct timespec *wtm,
+			struct clocksource *c, u32 mult)
 {
         unsigned long flags;
 
@@ -487,9 +488,9 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult)
 	/* copy kernel time structures */
         fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec;
         fsyscall_gtod_data.wall_time.tv_nsec = wall->tv_nsec;
-        fsyscall_gtod_data.monotonic_time.tv_sec = wall_to_monotonic.tv_sec
+	fsyscall_gtod_data.monotonic_time.tv_sec = wtm->tv_sec
 							+ wall->tv_sec;
-        fsyscall_gtod_data.monotonic_time.tv_nsec = wall_to_monotonic.tv_nsec
+	fsyscall_gtod_data.monotonic_time.tv_nsec = wtm->tv_nsec
 							+ wall->tv_nsec;
 
 	/* normalize */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 0711d60f40b..e215f76bba1 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -849,8 +849,8 @@ static cycle_t timebase_read(struct clocksource *cs)
 	return (cycle_t)get_tb();
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
-		     u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+			struct clocksource *clock, u32 mult)
 {
 	u64 new_tb_to_xs, new_stamp_xsec;
 
@@ -882,8 +882,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
 	vdso_data->tb_orig_stamp = clock->cycle_last;
 	vdso_data->stamp_xsec = new_stamp_xsec;
 	vdso_data->tb_to_xs = new_tb_to_xs;
-	vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
-	vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+	vdso_data->wtom_clock_sec = wtm->tv_sec;
+	vdso_data->wtom_clock_nsec = wtm->tv_nsec;
 	vdso_data->stamp_xtime = *wall_time;
 	smp_wmb();
 	++(vdso_data->tb_update_count);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index a2163c95eb9..aeb30c6f279 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -207,8 +207,8 @@ struct clocksource * __init clocksource_default_clock(void)
 	return &clocksource_tod;
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
-		     u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+			struct clocksource *clock, u32 mult)
 {
 	if (clock != &clocksource_tod)
 		return;
@@ -219,8 +219,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
 	vdso_data->xtime_tod_stamp = clock->cycle_last;
 	vdso_data->xtime_clock_sec = wall_time->tv_sec;
 	vdso_data->xtime_clock_nsec = wall_time->tv_nsec;
-	vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
-	vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+	vdso_data->wtom_clock_sec = wtm->tv_sec;
+	vdso_data->wtom_clock_nsec = wtm->tv_nsec;
 	vdso_data->ntp_mult = mult;
 	smp_wmb();
 	++vdso_data->tb_update_count;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dce0c3c5a78..dcbb28c4b69 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
-		     u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+			struct clocksource *clock, u32 mult)
 {
 	unsigned long flags;
 
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
 	vsyscall_gtod_data.clock.shift = clock->shift;
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+	vsyscall_gtod_data.wall_to_monotonic = *wtm;
 	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 5ea3c60c160..21677d99a16 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -313,11 +313,13 @@ clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec)
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
 extern void
-update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult);
+update_vsyscall(struct timespec *ts, struct timespec *wtm,
+			struct clocksource *c, u32 mult);
 extern void update_vsyscall_tz(void);
 #else
 static inline void
-update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult)
+update_vsyscall(struct timespec *ts, struct timespec *wtm,
+			struct clocksource *c, u32 mult)
 {
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 73edd4074b5..b15c3acafd5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -170,7 +170,8 @@ void timekeeping_leap_insert(int leapsecond)
 {
 	xtime.tv_sec += leapsecond;
 	wall_to_monotonic.tv_sec -= leapsecond;
-	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+			timekeeper.mult);
 }
 
 /**
@@ -326,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
 	timekeeper.ntp_error = 0;
 	ntp_clear();
 
-	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
 
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
@@ -809,7 +811,8 @@ void update_wall_time(void)
 	}
 
 	/* check to see if there is a new clocksource to use */
-	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From f12a15be63d1de9a35971f35f06b73088fa25c3a Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:27 -0700
Subject: x86: Convert common clocksources to use clocksource_register_hz/khz

This converts the most common of the x86 clocksources over to use
clocksource_register_hz/khz.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1279068988-21864-11-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/hpet.c        | 13 +++++++++----
 arch/x86/kernel/tsc.c         |  5 +----
 drivers/clocksource/acpi_pm.c |  9 ++-------
 3 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba390d73117..33dbcc4ec5f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
 #include <asm/hpet.h>
 
 #define HPET_MASK			CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT			22
 
 /* FSEC = 10^-15
    NSEC = 10^-9 */
@@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = {
 	.rating		= 250,
 	.read		= read_hpet,
 	.mask		= HPET_MASK,
-	.shift		= HPET_SHIFT,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.resume		= hpet_resume_counter,
 #ifdef CONFIG_X86_64
@@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = {
 static int hpet_clocksource_register(void)
 {
 	u64 start, now;
+	u64 hpet_freq;
 	cycle_t t1;
 
 	/* Start the counter */
@@ -832,9 +831,15 @@ static int hpet_clocksource_register(void)
 	 *  mult = (hpet_period * 2^shift)/10^6
 	 *  mult = (hpet_period << shift)/FSEC_PER_NSEC
 	 */
-	clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
 
-	clocksource_register(&clocksource_hpet);
+	/* Need to convert hpet_period (fsec/cyc) to cyc/sec:
+	 *
+	 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
+	 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
+	 */
+	hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC;
+	do_div(hpet_freq, hpet_period);
+	clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
 
 	return 0;
 }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae184..ce8e5023933 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = {
 	.read                   = read_tsc,
 	.resume			= resume_tsc,
 	.mask                   = CLOCKSOURCE_MASK(64),
-	.shift                  = 22,
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
 #ifdef CONFIG_X86_64
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void)
 
 static void __init init_tsc_clocksource(void)
 {
-	clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
-			clocksource_tsc.shift);
 	if (tsc_clocksource_reliable)
 		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
 	/* lower the rating if we already know its unstable: */
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void)
 		clocksource_tsc.rating = 0;
 		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
 	}
-	clocksource_register(&clocksource_tsc);
+	clocksource_register_khz(&clocksource_tsc, tsc_khz);
 }
 
 #ifdef CONFIG_X86_64
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index 72a633a6ec9..cfb0f527841 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -68,10 +68,7 @@ static struct clocksource clocksource_acpi_pm = {
 	.rating		= 200,
 	.read		= acpi_pm_read,
 	.mask		= (cycle_t)ACPI_PM_MASK,
-	.mult		= 0, /*to be calculated*/
-	.shift		= 22,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-
 };
 
 
@@ -190,9 +187,6 @@ static int __init init_acpi_pm_clocksource(void)
 	if (!pmtmr_ioport)
 		return -ENODEV;
 
-	clocksource_acpi_pm.mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC,
-						clocksource_acpi_pm.shift);
-
 	/* "verify" this timing source: */
 	for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) {
 		udelay(100 * j);
@@ -220,7 +214,8 @@ static int __init init_acpi_pm_clocksource(void)
 	if (verify_pmtmr_rate() != 0)
 		return -ENODEV;
 
-	return clocksource_register(&clocksource_acpi_pm);
+	return clocksource_register_hz(&clocksource_acpi_pm,
+						PMTMR_TICKS_PER_SEC);
 }
 
 /* We use fs_initcall because we want the PCI fixups to have run
-- 
cgit v1.2.3-70-g09d2


From 80a506b8fdcfa868bb53eb740f928217d0966fc1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 27 Jul 2010 17:14:24 +0200
Subject: x86/amd-iommu: Export cache-coherency capability

This patch exports the capability of the AMD IOMMU to force
cache coherency of DMA transactions through the IOMMU-API.
This is required to disable some nasty hacks in KVM when
this capability is not available.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 29dd3b9f2f0..fa044e1e30a 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
 				    unsigned long cap)
 {
+	switch (cap) {
+	case IOMMU_CAP_CACHE_COHERENCY:
+		return 1;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 11637e4b7dc098e9a863f0a619d55ebc60f5949e Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:25 -0500
Subject: fanotify: fanotify_init syscall declaration

This patch defines a new syscall fanotify_init() of the form:

int sys_fanotify_init(unsigned int flags, unsigned int event_f_flags,
		      unsigned int priority)

This syscall is used to create an fanotify group.  This is very similar to
the inotify_init() syscall.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 arch/x86/ia32/ia32entry.S          |  1 +
 arch/x86/include/asm/unistd_32.h   |  3 ++-
 arch/x86/include/asm/unistd_64.h   |  2 ++
 arch/x86/kernel/syscall_table_32.S |  1 +
 fs/notify/fanotify/Makefile        |  2 +-
 fs/notify/fanotify/fanotify_user.c | 13 +++++++++++++
 include/linux/syscalls.h           |  2 ++
 kernel/sys_ni.c                    |  3 +++
 8 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 fs/notify/fanotify/fanotify_user.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e790bc1fbfa..586cb3be2e3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,5 @@ ia32_sys_call_table:
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_event_open
 	.quad compat_sys_recvmmsg
+	.quad sys_fanotify_init
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f8f8a..981c7e7ad80 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,11 @@
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
+#define __NR_fanotify_init	338
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 338
+#define NR_syscalls 339
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b0e81..4f23e04bdb3 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_recvmmsg				299
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_fanotify_init			300
+__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b372934121..e38793b50e1 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,4 @@ ENTRY(sys_call_table)
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
 	.long sys_recvmmsg
+	.long sys_fanotify_init
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
index e7d39c05b0f..0999213e7e6 100644
--- a/fs/notify/fanotify/Makefile
+++ b/fs/notify/fanotify/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_FANOTIFY)		+= fanotify.o
+obj-$(CONFIG_FANOTIFY)		+= fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 00000000000..cf176fc7086
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,13 @@
+#include <linux/fcntl.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include "fanotify.h"
+
+SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
+		unsigned int, priority)
+{
+	return -ENOSYS;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 13ebb5413a7..198dcc9bd02 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -813,6 +813,8 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
 asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  struct timespec __user *, const sigset_t __user *,
 			  size_t);
+asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags,
+				  unsigned int priority);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ff..2c4adc2decc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,6 @@ cond_syscall(sys_eventfd2);
 
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
+
+/* fanotify! */
+cond_syscall(sys_fanotify_init);
-- 
cgit v1.2.3-70-g09d2


From bbaa4168b2d2d8cc674e6d35806e8426aef464b8 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:26 -0500
Subject: fanotify: sys_fanotify_mark declaration

This patch simply declares the new sys_fanotify_mark syscall

int fanotify_mark(int fanotify_fd, unsigned int flags, u64 mask,
		  int dfd, const char *pathname)

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 arch/x86/ia32/ia32entry.S          | 1 +
 arch/x86/ia32/sys_ia32.c           | 9 +++++++++
 arch/x86/include/asm/sys_ia32.h    | 3 +++
 arch/x86/include/asm/unistd_32.h   | 3 ++-
 arch/x86/include/asm/unistd_64.h   | 2 ++
 arch/x86/kernel/syscall_table_32.S | 1 +
 fs/notify/fanotify/fanotify_user.c | 6 ++++++
 include/linux/syscalls.h           | 3 +++
 kernel/sys_ni.c                    | 1 +
 9 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 586cb3be2e3..17cf65c9480 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -843,4 +843,5 @@ ia32_sys_call_table:
 	.quad sys_perf_event_open
 	.quad compat_sys_recvmmsg
 	.quad sys_fanotify_init
+	.quad sys32_fanotify_mark
 ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 626be156d88..3d093311d5e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
 	return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
 			     ((u64)len_hi << 32) | len_lo);
 }
+
+asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags,
+				    u32 mask_lo, u32 mask_hi,
+				    int fd, const char  __user *pathname)
+{
+	return sys_fanotify_mark(fanotify_fd, flags,
+				 ((u64)mask_hi << 32) | mask_lo,
+				 fd, pathname);
+}
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 3ad421784ae..cf4e2e381cb 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
 
 /* ia32/ipc32.c */
 asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+
+asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int,
+				    const char __user *);
 #endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 981c7e7ad80..80b799cd74f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -344,10 +344,11 @@
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
 #define __NR_fanotify_init	338
+#define __NR_fanotify_mark	339
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 339
+#define NR_syscalls 340
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4f23e04bdb3..5b7b1d58561 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -665,6 +665,8 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 #define __NR_fanotify_init			300
 __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
+#define __NR_fanotify_mark			301
+__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index e38793b50e1..07ad5eb7cc5 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -338,3 +338,4 @@ ENTRY(sys_call_table)
 	.long sys_perf_event_open
 	.long sys_recvmmsg
 	.long sys_fanotify_init
+	.long sys_fanotify_mark
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 67c0b5e4a48..55d6e379f2b 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -70,3 +70,9 @@ out_put_group:
 	fsnotify_put_group(group);
 	return fd;
 }
+
+SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
+		__u64, mask, int, dfd, const char  __user *, pathname)
+{
+	return -ENOSYS;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 198dcc9bd02..5b05c37059e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -815,6 +815,9 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  size_t);
 asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags,
 				  unsigned int priority);
+asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
+				  u64 mask, int fd,
+				  const char  __user *pathname);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c4adc2decc..bad369ec540 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -184,3 +184,4 @@ cond_syscall(sys_perf_event_open);
 
 /* fanotify! */
 cond_syscall(sys_fanotify_init);
+cond_syscall(sys_fanotify_mark);
-- 
cgit v1.2.3-70-g09d2


From d78d671db478eb8b14c78501c0cee1cc7baf6967 Mon Sep 17 00:00:00 2001
From: Hans Rosenfeld <hans.rosenfeld@amd.com>
Date: Wed, 28 Jul 2010 19:09:30 +0200
Subject: x86, cpu: AMD errata checking framework

Errata are defined using the AMD_LEGACY_ERRATUM() or AMD_OSVW_ERRATUM()
macros. The latter is intended for newer errata that have an OSVW id
assigned, which it takes as first argument. Both take a variable number
of family-specific model-stepping ranges created by AMD_MODEL_RANGE().

Iff an erratum has an OSVW id, OSVW is available on the CPU, and the
OSVW id is known to the hardware, it is used to determine whether an
erratum is present. Otherwise, the model-stepping ranges are matched
against the current CPU to find out whether the erratum applies.

For certain special errata, the code using this framework might have to
conduct further checks to make sure an erratum is really (not) present.

Signed-off-by: Hans Rosenfeld <hans.rosenfeld@amd.com>
LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/processor.h | 18 ++++++++++++
 arch/x86/kernel/cpu/amd.c        | 60 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7e5c6a60b8e..5084c2f5ac2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -1025,4 +1025,22 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
 	return ratio;
 }
 
+/*
+ * AMD errata checking
+ */
+#ifdef CONFIG_CPU_SUP_AMD
+extern bool cpu_has_amd_erratum(const int *);
+
+#define AMD_LEGACY_ERRATUM(...)		{ -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...)	{ osvw_id, __VA_ARGS__, 0 }
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+	((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range)	(((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range)	(((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range)	((range) & 0xfff)
+
+#else
+#define cpu_has_amd_erratum(x)	(false)
+#endif /* CONFIG_CPU_SUP_AMD */
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 12b9cff047c..80665410b06 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -609,3 +609,63 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
 };
 
 cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
+ * int[] in arch/x86/include/asm/processor.h.
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ *	AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ *			   AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ *			   AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+bool cpu_has_amd_erratum(const int *erratum)
+{
+	struct cpuinfo_x86 *cpu = &current_cpu_data;
+	int osvw_id = *erratum++;
+	u32 range;
+	u32 ms;
+
+	/*
+	 * If called early enough that current_cpu_data hasn't been initialized
+	 * yet, fall back to boot_cpu_data.
+	 */
+	if (cpu->x86 == 0)
+		cpu = &boot_cpu_data;
+
+	if (cpu->x86_vendor != X86_VENDOR_AMD)
+		return false;
+
+	if (osvw_id >= 0 && osvw_id < 65536 &&
+	    cpu_has(cpu, X86_FEATURE_OSVW)) {
+		u64 osvw_len;
+
+		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+		if (osvw_id < osvw_len) {
+			u64 osvw_bits;
+
+			rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+			    osvw_bits);
+			return osvw_bits & (1ULL << (osvw_id & 0x3f));
+		}
+	}
+
+	/* OSVW unavailable or ID unknown, match family-model-stepping range */
+	ms = (cpu->x86_model << 8) | cpu->x86_mask;
+	while ((range = *erratum++))
+		if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+		    (ms >= AMD_MODEL_RANGE_START(range)) &&
+		    (ms <= AMD_MODEL_RANGE_END(range)))
+			return true;
+
+	return false;
+}
-- 
cgit v1.2.3-70-g09d2


From 9d8888c2a214aece2494a49e699a097c2ba9498b Mon Sep 17 00:00:00 2001
From: Hans Rosenfeld <hans.rosenfeld@amd.com>
Date: Wed, 28 Jul 2010 19:09:31 +0200
Subject: x86, cpu: Clean up AMD erratum 400 workaround

Remove check_c1e_idle() and use the new AMD errata checking framework
instead.

Signed-off-by: Hans Rosenfeld <hans.rosenfeld@amd.com>
LKML-Reference: <1280336972-865982-2-git-send-email-hans.rosenfeld@amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/processor.h |  1 +
 arch/x86/kernel/cpu/amd.c        |  5 +++++
 arch/x86/kernel/process.c        | 39 ++-------------------------------------
 3 files changed, 8 insertions(+), 37 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5084c2f5ac2..eebdc1fde3d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -1029,6 +1029,7 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
  * AMD errata checking
  */
 #ifdef CONFIG_CPU_SUP_AMD
+extern const int amd_erratum_400[];
 extern bool cpu_has_amd_erratum(const int *);
 
 #define AMD_LEGACY_ERRATUM(...)		{ -1, __VA_ARGS__, 0 }
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 80665410b06..a62a4ae7a11 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -628,6 +628,11 @@ cpu_dev_register(amd_cpu_dev);
  *			   AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
  */
 
+const int amd_erratum_400[] =
+	AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+			    AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+
+
 bool cpu_has_amd_erratum(const int *erratum)
 {
 	struct cpuinfo_x86 *cpu = &current_cpu_data;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32..553b02f1309 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -525,42 +525,6 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
 	return (edx & MWAIT_EDX_C1);
 }
 
-/*
- * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
- * For more information see
- * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
- * - Erratum #365 for family 0x11 (not affected because C1e not in use)
- */
-static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
-{
-	u64 val;
-	if (c->x86_vendor != X86_VENDOR_AMD)
-		goto no_c1e_idle;
-
-	/* Family 0x0f models < rev F do not have C1E */
-	if (c->x86 == 0x0F && c->x86_model >= 0x40)
-		return 1;
-
-	if (c->x86 == 0x10) {
-		/*
-		 * check OSVW bit for CPUs that are not affected
-		 * by erratum #400
-		 */
-		if (cpu_has(c, X86_FEATURE_OSVW)) {
-			rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
-			if (val >= 2) {
-				rdmsrl(MSR_AMD64_OSVW_STATUS, val);
-				if (!(val & BIT(1)))
-					goto no_c1e_idle;
-			}
-		}
-		return 1;
-	}
-
-no_c1e_idle:
-	return 0;
-}
-
 static cpumask_var_t c1e_mask;
 static int c1e_detected;
 
@@ -638,7 +602,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 		 */
 		printk(KERN_INFO "using mwait in idle threads.\n");
 		pm_idle = mwait_idle;
-	} else if (check_c1e_idle(c)) {
+	} else if (cpu_has_amd_erratum(amd_erratum_400)) {
+		/* E400: APIC timer interrupt does not wake up CPU from C1e */
 		printk(KERN_INFO "using C1E aware idle routine\n");
 		pm_idle = c1e_idle;
 	} else
-- 
cgit v1.2.3-70-g09d2


From 1be85a6d93f4207d8c2c6238c4a96895e28cefba Mon Sep 17 00:00:00 2001
From: Hans Rosenfeld <hans.rosenfeld@amd.com>
Date: Wed, 28 Jul 2010 19:09:32 +0200
Subject: x86, cpu: Use AMD errata checking framework for erratum 383

Use the AMD errata checking framework instead of open-coding the test.

Signed-off-by: Hans Rosenfeld <hans.rosenfeld@amd.com>
LKML-Reference: <1280336972-865982-3-git-send-email-hans.rosenfeld@amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/processor.h | 1 +
 arch/x86/kernel/cpu/amd.c        | 2 ++
 arch/x86/kvm/svm.c               | 3 +--
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index eebdc1fde3d..d85637bb950 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -1029,6 +1029,7 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
  * AMD errata checking
  */
 #ifdef CONFIG_CPU_SUP_AMD
+extern const int amd_erratum_383[];
 extern const int amd_erratum_400[];
 extern bool cpu_has_amd_erratum(const int *);
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a62a4ae7a11..30f30dcbdb8 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -632,6 +632,8 @@ const int amd_erratum_400[] =
 	AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
 			    AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
 
+const int amd_erratum_383[] =
+	AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
 
 bool cpu_has_amd_erratum(const int *erratum)
 {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd2..03b534b34ee 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -383,8 +383,7 @@ static void svm_init_erratum_383(void)
 	int err;
 	u64 val;
 
-	/* Only Fam10h is affected */
-	if (boot_cpu_data.x86 != 0x10)
+	if (!cpu_has_amd_erratum(amd_erratum_383))
 		return;
 
 	/* Use _safe variants to not break nested virtualization */
-- 
cgit v1.2.3-70-g09d2


From a5b91606bdc9d0a0d036d2d829a22921c705573e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 28 Jul 2010 16:23:20 -0700
Subject: x86, cpu: Export AMD errata definitions

Export the AMD errata definitions, since they are needed by kvm_amd.ko
if that is built as a module.  Doing "make allmodconfig" during
testing would have caught this.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Hans Rosenfeld <hans.rosenfeld@amd.com>
LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com>
---
 arch/x86/kernel/cpu/amd.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 30f30dcbdb8..60a57b13082 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -631,9 +631,11 @@ cpu_dev_register(amd_cpu_dev);
 const int amd_erratum_400[] =
 	AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
 			    AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_400);
 
 const int amd_erratum_383[] =
 	AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_383);
 
 bool cpu_has_amd_erratum(const int *erratum)
 {
@@ -676,3 +678,5 @@ bool cpu_has_amd_erratum(const int *erratum)
 
 	return false;
 }
+
+EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
-- 
cgit v1.2.3-70-g09d2


From 90c8f92f5c807807ca74d5f2f313794925174e6b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 28 Jul 2010 16:53:49 -0700
Subject: x86, asm: Move cmpxchg emulation code to arch/x86/lib

Move cmpxchg emulation code from arch/x86/kernel/cpu (which is
otherwise CPU identification) to arch/x86/lib, where other emulation
code lives already.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
LKML-Reference: <AANLkTikAmaDPji-TVDarmG1yD=fwbffcsmEU=YEuP+8r@mail.gmail.com>
---
 arch/x86/kernel/cpu/Makefile  |  2 +-
 arch/x86/kernel/cpu/cmpxchg.c | 72 -------------------------------------------
 arch/x86/lib/Makefile         |  1 +
 arch/x86/lib/cmpxchg.c        | 72 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 74 insertions(+), 73 deletions(-)
 delete mode 100644 arch/x86/kernel/cpu/cmpxchg.c
 create mode 100644 arch/x86/lib/cmpxchg.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6..c47c43914ba 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -16,7 +16,7 @@ obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
 obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 
-obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
+obj-$(CONFIG_X86_32)	+= bugs.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
 
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572c..00000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * cmpxchg*() fallbacks for CPU not supporting these instructions
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
-	u8 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u8 *)ptr;
-	if (prev == old)
-		*(u8 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
-	u16 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u16 *)ptr;
-	if (prev == old)
-		*(u16 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
-	u32 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u32 *)ptr;
-	if (prev == old)
-		*(u32 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
-	u64 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u64 *)ptr;
-	if (prev == old)
-		*(u64 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f871e04b696..e10cf070ede 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y)
         lib-y += checksum_32.o
         lib-y += strstr_32.o
         lib-y += semaphore_32.o string_32.o
+        lib-y += cmpxchg.o
 ifneq ($(CONFIG_X86_CMPXCHG64),y)
         lib-y += cmpxchg8b_emu.o atomic64_386_32.o
 endif
diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c
new file mode 100644
index 00000000000..2056ccf572c
--- /dev/null
+++ b/arch/x86/lib/cmpxchg.c
@@ -0,0 +1,72 @@
+/*
+ * cmpxchg*() fallbacks for CPU not supporting these instructions
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#ifndef CONFIG_X86_CMPXCHG
+unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
+{
+	u8 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u8 *)ptr;
+	if (prev == old)
+		*(u8 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u8);
+
+unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
+{
+	u16 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u16 *)ptr;
+	if (prev == old)
+		*(u16 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u16);
+
+unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
+{
+	u32 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u32 *)ptr;
+	if (prev == old)
+		*(u32 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u32);
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG64
+unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
+{
+	u64 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u64 *)ptr;
+	if (prev == old)
+		*(u64 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_486_u64);
+#endif
+
-- 
cgit v1.2.3-70-g09d2


From ca65f9fc0c447da5b270b05c41c21b19c88617c3 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Thu, 29 Jul 2010 14:37:48 +0100
Subject: Introduce CONFIG_XEN_PVHVM compile option

This patch introduces a CONFIG_XEN_PVHVM compile-time option to
enable/disable Xen PV on HVM support.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 arch/x86/kernel/cpu/hypervisor.c   | 2 +-
 arch/x86/xen/Kconfig               | 5 +++++
 arch/x86/xen/enlighten.c           | 2 ++
 arch/x86/xen/mmu.c                 | 2 ++
 arch/x86/xen/platform-pci-unplug.c | 2 ++
 arch/x86/xen/time.c                | 3 ++-
 drivers/xen/Kconfig                | 2 +-
 drivers/xen/events.c               | 4 ++++
 8 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 5bccedcb912..8095f8611f8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,7 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
 	&x86_hyper_vmware,
 	&x86_hyper_ms_hyperv,
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PVHVM
 	&x86_hyper_xen_hvm,
 #endif
 };
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index b83e119fbeb..68128a1b401 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,6 +13,11 @@ config XEN
 	  kernel to boot in a paravirtualized environment under the
 	  Xen hypervisor.
 
+config XEN_PVHVM
+	def_bool y
+	depends on XEN
+	depends on X86_LOCAL_APIC
+
 config XEN_MAX_DOMAIN_MEMORY
        int "Maximum allowed size of a domain in gigabytes"
        default 8 if X86_32
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 75b479a684f..6f5345378ab 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1282,6 +1282,7 @@ void xen_hvm_init_shared_info(void)
 	}
 }
 
+#ifdef CONFIG_XEN_PVHVM
 static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 				    unsigned long action, void *hcpu)
 {
@@ -1338,3 +1339,4 @@ const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
 	.init_platform		= xen_hvm_guest_init,
 };
 EXPORT_SYMBOL(x86_hyper_xen_hvm);
+#endif
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 84648c1bf13..413b19b3d0f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1942,6 +1942,7 @@ void __init xen_init_mmu_ops(void)
 	pv_mmu_ops = xen_mmu_ops;
 }
 
+#ifdef CONFIG_XEN_PVHVM
 static void xen_hvm_exit_mmap(struct mm_struct *mm)
 {
 	struct xen_hvm_pagetable_dying a;
@@ -1973,6 +1974,7 @@ void __init xen_hvm_init_mmu_ops(void)
 	if (is_pagetable_dying_supported())
 		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
 }
+#endif
 
 #ifdef CONFIG_XEN_DEBUG_FS
 
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 2f7f3fb3477..554c002a1e1 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -32,6 +32,7 @@
 /* store the value of xen_emul_unplug after the unplug is done */
 int xen_platform_pci_unplug;
 EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
+#ifdef CONFIG_XEN_PVHVM
 static int xen_emul_unplug;
 
 static int __init check_platform_magic(void)
@@ -133,3 +134,4 @@ static int __init parse_xen_emul_unplug(char *arg)
 	return 0;
 }
 early_param("xen_emul_unplug", parse_xen_emul_unplug);
+#endif
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 4780e55886a..2aab4a2b910 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -516,6 +516,7 @@ __init void xen_init_time_ops(void)
 	x86_platform.set_wallclock = xen_set_wallclock;
 }
 
+#ifdef CONFIG_XEN_PVHVM
 static void xen_hvm_setup_cpu_clockevents(void)
 {
 	int cpu = smp_processor_id();
@@ -544,4 +545,4 @@ __init void xen_hvm_init_time_ops(void)
 	x86_platform.get_wallclock = xen_get_wallclock;
 	x86_platform.set_wallclock = xen_set_wallclock;
 }
-
+#endif
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 8f84b108b49..0a882693663 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -64,7 +64,7 @@ config XEN_SYS_HYPERVISOR
 
 config XEN_PLATFORM_PCI
 	tristate "xen platform pci device driver"
-	depends on XEN
+	depends on XEN_PVHVM
 	default m
 	help
 	  Driver for the Xen PCI Platform device: it is responsible for
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index b5a254e9aeb..5e1f34892dc 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -973,6 +973,7 @@ int xen_set_callback_via(uint64_t via)
 }
 EXPORT_SYMBOL_GPL(xen_set_callback_via);
 
+#ifdef CONFIG_XEN_PVHVM
 /* Vector callbacks are better than PCI interrupts to receive event
  * channel notifications because we can receive vector callbacks on any
  * vcpu and we don't need PCI support or APIC interactions. */
@@ -996,6 +997,9 @@ void xen_callback_vector(void)
 			alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
 	}
 }
+#else
+void xen_callback_vector(void) {}
+#endif
 
 void __init xen_init_IRQ(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 30da55242818a8ca08583188ebcbaccd283ad4d9 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 23 Jul 2010 14:56:28 +0100
Subject: PCI: MSI: Restore read_msi_msg_desc(); add get_cached_msi_msg_desc()

commit 2ca1af9aa3285c6a5f103ed31ad09f7399fc65d7 "PCI: MSI: Remove
unsafe and unnecessary hardware access" changed read_msi_msg_desc() to
return the last MSI message written instead of reading it from the
device, since it may be called while the device is in a reduced
power state.

However, the pSeries platform code really does need to read messages
from the device, since they are initially written by firmware.
Therefore:
- Restore the previous behaviour of read_msi_msg_desc()
- Add new functions get_cached_msi_msg{,_desc}() which return the
  last MSI message written
- Use the new functions where appropriate

Acked-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/ia64/kernel/msi_ia64.c    |  2 +-
 arch/ia64/sn/kernel/msi_sn.c   |  2 +-
 arch/x86/kernel/apic/io_apic.c |  2 +-
 drivers/pci/msi.c              | 47 +++++++++++++++++++++++++++++++++++++-----
 include/linux/msi.h            |  2 ++
 5 files changed, 47 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 6c892285604..4a746ea838f 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -25,7 +25,7 @@ static int ia64_set_msi_irq_affinity(unsigned int irq,
 	if (irq_prepare_move(irq, cpu))
 		return -1;
 
-	read_msi_msg(irq, &msg);
+	get_cached_msi_msg(irq, &msg);
 
 	addr = msg.address_lo;
 	addr &= MSI_ADDR_DEST_ID_MASK;
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index ebfdd6a9ae1..0c72dd46383 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -175,7 +175,7 @@ static int sn_set_msi_irq_affinity(unsigned int irq,
 	 * Release XIO resources for the old MSI PCI address
 	 */
 
-	read_msi_msg(irq, &msg);
+	get_cached_msi_msg(irq, &msg);
         sn_pdev = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
 	pdev = sn_pdev->pdi_linux_pcidev;
 	provider = SN_PCIDEV_BUSPROVIDER(pdev);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26..4dc0084ec1b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
 	cfg = desc->chip_data;
 
-	read_msi_msg_desc(desc, &msg);
+	get_cached_msi_msg_desc(desc, &msg);
 
 	msg.data &= ~MSI_DATA_VECTOR_MASK;
 	msg.data |= MSI_DATA_VECTOR(cfg->vector);
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4c14f31f2b4..69b7be33b3a 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -197,9 +197,46 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
 	struct msi_desc *entry = get_irq_desc_msi(desc);
 
-	/* We do not touch the hardware (which may not even be
-	 * accessible at the moment) but return the last message
-	 * written.  Assert that this is valid, assuming that
+	BUG_ON(entry->dev->current_state != PCI_D0);
+
+	if (entry->msi_attrib.is_msix) {
+		void __iomem *base = entry->mask_base +
+			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
+		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
+		msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
+	} else {
+		struct pci_dev *dev = entry->dev;
+		int pos = entry->msi_attrib.pos;
+		u16 data;
+
+		pci_read_config_dword(dev, msi_lower_address_reg(pos),
+					&msg->address_lo);
+		if (entry->msi_attrib.is_64) {
+			pci_read_config_dword(dev, msi_upper_address_reg(pos),
+						&msg->address_hi);
+			pci_read_config_word(dev, msi_data_reg(pos, 1), &data);
+		} else {
+			msg->address_hi = 0;
+			pci_read_config_word(dev, msi_data_reg(pos, 0), &data);
+		}
+		msg->data = data;
+	}
+}
+
+void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	read_msi_msg_desc(desc, msg);
+}
+
+void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
+{
+	struct msi_desc *entry = get_irq_desc_msi(desc);
+
+	/* Assert that the cache is valid, assuming that
 	 * valid messages are not all-zeroes. */
 	BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo |
 		 entry->msg.data));
@@ -207,11 +244,11 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 	*msg = entry->msg;
 }
 
-void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	read_msi_msg_desc(desc, msg);
+	get_cached_msi_msg_desc(desc, msg);
 }
 
 void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 6991ab5b24d..91b05c17185 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -14,8 +14,10 @@ struct irq_desc;
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
 extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
+extern void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
 extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
+extern void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
 struct msi_desc {
-- 
cgit v1.2.3-70-g09d2


From 68f202e4e87cfab4439568bf397fcc5c7cf8d729 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 30 Jul 2010 11:46:42 -0700
Subject: x86, mtrr: Use stop machine context to rendezvous all the cpu's

Use the stop machine context rather than IPI's to rendezvous all the cpus for
MTRR initialization that happens during cpu bringup or for MTRR modifications
during runtime.

This avoids deadlock scenario (reported by Prarit) like:

cpu A holds a read_lock (tasklist_lock for example) with irqs enabled
cpu B waits for the same lock with irqs disabled using write_lock_irq
cpu C doing set_mtrr() (during AP bringup for example), which will try to
rendezvous all the cpus using IPI's

This will result in C and A coming to the rendezvous point and waiting
for B. B is stuck forever waiting for the lock and thus never
reaches the rendezvous point.

Using stop cpu (run in the process context of per cpu based keventd) to do
this rendezvous, avoids this deadlock scenario.

Also make sure all the cpus are in the rendezvous handler before we proceed
with the local_irq_save() on each cpu. This lock-step disabling of irqs on all
the cpus will avoid other deadlock scenarios (for example, those involving
the blocking smp_call_function() calls etc).

   [ This problem is very old. Marking -stable only for 2.6.35 as the
     stop_one_cpu_nowait() API is present only in 2.6.35. Any older
     kernel interested in this fix needs to do some more work in backporting
     this patch. ]

Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1280515602.2682.10.camel@sbsiddha-MOBL3.sc.intel.com>
Acked-by: Prarit Bhargava <prarit@redhat.com>
Cc: stable@kernel.org	[2.6.35]
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mtrr/main.c | 56 +++++++++++++++++++++++++++++++----------
 arch/x86/kernel/smpboot.c       |  7 ++++++
 2 files changed, 50 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b60..01c0f3ee6cc 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
 
 #include <linux/types.h> /* FIXME: kvm_para.h needs this */
 
+#include <linux/stop_machine.h>
 #include <linux/kvm_para.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
 	mtrr_type	smp_type;
 };
 
+static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
+
 /**
- * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
  * @info: pointer to mtrr configuration data
  *
  * Returns nothing.
  */
-static void ipi_handler(void *info)
+static int mtrr_work_handler(void *info)
 {
 #ifdef CONFIG_SMP
 	struct set_mtrr_data *data = info;
 	unsigned long flags;
 
+	atomic_dec(&data->count);
+	while (!atomic_read(&data->gate))
+		cpu_relax();
+
 	local_irq_save(flags);
 
 	atomic_dec(&data->count);
-	while (!atomic_read(&data->gate))
+	while (atomic_read(&data->gate))
 		cpu_relax();
 
 	/*  The master has cleared me to execute  */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
 	}
 
 	atomic_dec(&data->count);
-	while (atomic_read(&data->gate))
+	while (!atomic_read(&data->gate))
 		cpu_relax();
 
 	atomic_dec(&data->count);
 	local_irq_restore(flags);
 #endif
+	return 0;
 }
 
 static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
  *
  * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
  *
- * 1. Send IPI to do the following:
+ * 1. Queue work to do the following on all processors:
  * 2. Disable Interrupts
  * 3. Wait for all procs to do so
  * 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
  * 15. Enable interrupts.
  *
  * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
- * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
- * Meanwhile, they are waiting for that flag to be set. Once it's set, each
+ * of CPUs. As each CPU announces that it started the rendezvous handler by
+ * decrementing the count, We reset data.count and set the data.gate flag
+ * allowing all the cpu's to proceed with the work. As each cpu disables
+ * interrupts, it'll decrement data.count once. We wait until it hits 0 and
+ * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
+ * are waiting for that flag to be cleared. Once it's cleared, each
  * CPU goes through the transition of updating MTRRs.
  * The CPU vendors may each do it differently,
  * so we call mtrr_if->set() callback and let them take care of it.
  * When they're done, they again decrement data->count and wait for data.gate
- * to be reset.
+ * to be set.
  * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
  * Everyone then enables interrupts and we all continue on.
  *
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 {
 	struct set_mtrr_data data;
 	unsigned long flags;
+	int cpu;
+
+	preempt_disable();
 
 	data.smp_reg = reg;
 	data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 	atomic_set(&data.gate, 0);
 
 	/* Start the ball rolling on other CPUs */
-	if (smp_call_function(ipi_handler, &data, 0) != 0)
-		panic("mtrr: timed out waiting for other CPUs\n");
+	for_each_online_cpu(cpu) {
+		struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
+
+		if (cpu == smp_processor_id())
+			continue;
+
+		stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
+	}
 
-	local_irq_save(flags);
 
 	while (atomic_read(&data.count))
 		cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 	smp_wmb();
 	atomic_set(&data.gate, 1);
 
+	local_irq_save(flags);
+
+	while (atomic_read(&data.count))
+		cpu_relax();
+
+	/* Ok, reset count and toggle gate */
+	atomic_set(&data.count, num_booting_cpus() - 1);
+	smp_wmb();
+	atomic_set(&data.gate, 0);
+
 	/* Do our MTRR business */
 
 	/*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 
 	atomic_set(&data.count, num_booting_cpus() - 1);
 	smp_wmb();
-	atomic_set(&data.gate, 0);
+	atomic_set(&data.gate, 1);
 
 	/*
 	 * Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 		cpu_relax();
 
 	local_irq_restore(flags);
+	preempt_enable();
 }
 
 /**
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d..11015fd1abb 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -816,6 +816,13 @@ do_rest:
 			if (cpumask_test_cpu(cpu, cpu_callin_mask))
 				break;	/* It has booted */
 			udelay(100);
+			/*
+			 * Allow other tasks to run while we wait for the
+			 * AP to come online. This also gives a chance
+			 * for the MTRR work(triggered by the AP coming online)
+			 * to be completed in the stop machine context.
+			 */
+			schedule();
 		}
 
 		if (cpumask_test_cpu(cpu, cpu_callin_mask))
-- 
cgit v1.2.3-70-g09d2


From 9792db6174d9927700ed288e6d74b9391bf785d1 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 29 Jul 2010 17:13:42 -0700
Subject: x86, cpu: Package Level Thermal Control, Power Limit Notification
 definitions

Add package level thermal and power limit feature support.

The two MSRs and features are new starting with Intel's Sandy Bridge processor.

Please check Intel 64 and IA-32 Architectures SDM Vol 3A, 14.5.6 Power Limit
Notification and 14.6 Package Level Thermal Management.

This patch also fixes a bug in which the THERM_INT_LOW_ENABLE and
THERM_INT_HIGH_ENABLE bits were defined in reverse.

[ hpa: fixed up against current tip:x86/cpu ]

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
LKML-Reference: <1280448826-12004-2-git-send-email-fenghua.yu@intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/cpufeature.h |  2 ++
 arch/x86/include/asm/msr-index.h  | 17 +++++++++++++++--
 arch/x86/kernel/cpu/scattered.c   |  2 ++
 3 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4be50ddd4d7..817aa316b18 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -166,6 +166,8 @@
 #define X86_FEATURE_CPB		(7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		(7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
 #define X86_FEATURE_XSAVEOPT	(7*32+ 4) /* Optimized Xsave */
+#define X86_FEATURE_PLN		(7*32+ 5) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS		(7*32+ 6) /* Intel Package Thermal Status */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 7cc4a026331..4ea2a7ca7a4 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -224,12 +224,14 @@
 #define MSR_IA32_THERM_CONTROL		0x0000019a
 #define MSR_IA32_THERM_INTERRUPT	0x0000019b
 
-#define THERM_INT_LOW_ENABLE		(1 << 0)
-#define THERM_INT_HIGH_ENABLE		(1 << 1)
+#define THERM_INT_HIGH_ENABLE		(1 << 0)
+#define THERM_INT_LOW_ENABLE		(1 << 1)
+#define THERM_INT_PLN_ENABLE		(1 << 24)
 
 #define MSR_IA32_THERM_STATUS		0x0000019c
 
 #define THERM_STATUS_PROCHOT		(1 << 0)
+#define THERM_STATUS_POWER_LIMIT	(1 << 10)
 
 #define MSR_THERM2_CTL			0x0000019d
 
@@ -241,6 +243,17 @@
 
 #define MSR_IA32_ENERGY_PERF_BIAS	0x000001b0
 
+#define MSR_IA32_PACKAGE_THERM_STATUS		0x000001b1
+
+#define PACKAGE_THERM_STATUS_PROCHOT		(1 << 0)
+#define PACKAGE_THERM_STATUS_POWER_LIMIT	(1 << 10)
+
+#define MSR_IA32_PACKAGE_THERM_INTERRUPT	0x000001b2
+
+#define PACKAGE_THERM_INT_HIGH_ENABLE		(1 << 0)
+#define PACKAGE_THERM_INT_LOW_ENABLE		(1 << 1)
+#define PACKAGE_THERM_INT_PLN_ENABLE		(1 << 24)
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING	(1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC		(1ULL << 1)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 9815364b477..34b4dad6f0b 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -33,6 +33,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
 		{ X86_FEATURE_IDA,		CR_EAX, 1, 0x00000006, 0 },
 		{ X86_FEATURE_ARAT,		CR_EAX, 2, 0x00000006, 0 },
+		{ X86_FEATURE_PLN,		CR_EAX, 4, 0x00000006, 0 },
+		{ X86_FEATURE_PTS,		CR_EAX, 6, 0x00000006, 0 },
 		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
 		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
 		{ X86_FEATURE_XSAVEOPT,		CR_EAX,	0, 0x0000000d, 1 },
-- 
cgit v1.2.3-70-g09d2


From 25971865d48a8d0ece5307a59dbd3f06d05a7567 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Wed, 16 Jun 2010 23:19:28 -0400
Subject: x86, olpc: Use pr_debug() for EC commands

Unconditionally printing EC debug messages was helpful when we were actually
debugging the EC, but during normal operation it can get pretty annoying.
Using pr_debug allows us finer-grained control.

Signed-off-by: Andres Salomon <dilinger@queued.net>
LKML-Reference: <20100616231928.16b539f0@dev.queued.net>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/olpc.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 156605281f5..f5ff3903b38 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -142,7 +142,7 @@ restart:
 	 * The OBF flag will sometimes misbehave due to what we believe
 	 * is a hardware quirk..
 	 */
-	printk(KERN_DEBUG "olpc-ec:  running cmd 0x%x\n", cmd);
+	pr_devel("olpc-ec:  running cmd 0x%x\n", cmd);
 	outb(cmd, 0x6c);
 
 	if (wait_on_ibf(0x6c, 0)) {
@@ -159,8 +159,7 @@ restart:
 						" EC accept data!\n");
 				goto err;
 			}
-			printk(KERN_DEBUG "olpc-ec:  sending cmd arg 0x%x\n",
-					inbuf[i]);
+			pr_devel("olpc-ec:  sending cmd arg 0x%x\n", inbuf[i]);
 			outb(inbuf[i], 0x68);
 		}
 	}
@@ -173,8 +172,7 @@ restart:
 				goto restart;
 			}
 			outbuf[i] = inb(0x68);
-			printk(KERN_DEBUG "olpc-ec:  received 0x%x\n",
-					outbuf[i]);
+			pr_devel("olpc-ec:  received 0x%x\n", outbuf[i]);
 		}
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 54e5bc020ce1c959eaa7be18cedb734b6b13745e Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Mon, 28 Jun 2010 22:00:29 -0400
Subject: x86, olpc: Constify an olpc_ofw() arg

The arguments passed to OFW shouldn't be modified; update the 'args'
argument of olpc_ofw to reflect this.  This saves us some later
casting away of consts.

Signed-off-by: Andres Salomon <dilinger@queued.net>
LKML-Reference: <20100628220029.1555ac24@debian>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/olpc_ofw.h | 2 +-
 arch/x86/kernel/olpc.c          | 2 +-
 arch/x86/kernel/olpc_ofw.c      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 3e63d857c48..08fde475cb3 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -12,7 +12,7 @@
 #define olpc_ofw(name, args, res) \
 	__olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
 
-extern int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res,
+extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
 		void **res);
 
 /* determine whether OFW is available and lives in the proper memory */
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index f5ff3903b38..0e0cdde519b 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -188,7 +188,7 @@ static void __init platform_detect(void)
 {
 	size_t propsize;
 	__be32 rev;
-	void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
+	const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
 	void *res[] = { &propsize };
 
 	if (olpc_ofw("getprop", args, res) || propsize != 4) {
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
index f5d499fbe74..3218aa71ab5 100644
--- a/arch/x86/kernel/olpc_ofw.c
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -40,7 +40,7 @@ void __init setup_olpc_ofw_pgd(void)
 	early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
 }
 
-int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res,
+int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
 		void **res)
 {
 	int ofw_args[MAXARGS + 3];
-- 
cgit v1.2.3-70-g09d2


From c4026cfd8febcd63dd278894108839f30e525a0e Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Fri, 30 Jul 2010 14:10:55 -0500
Subject: x86, UV: Initialize BAU hub map

Fix uninitialized uvhub_mask:

- An uninitialized bit map variable was causing initialization of
  non-existent hubs (this one causes boot panics).

- And the bit map was too small for large machines.  This patch
  makes it dynamic in size.

- Fix the case where socket 0 has no enabled cpu's. Don't assume
  every hub has a socket 0.

- uv_init_per_cpu() should be __init.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org> # for .35.x
LKML-Reference: <E1Oeuyt-0004XS-0y@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 59efb5390b3..312ef029281 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1484,15 +1484,16 @@ calculate_destination_timeout(void)
 /*
  * initialize the bau_control structure for each cpu
  */
-static void uv_init_per_cpu(int nuvhubs)
+static void __init uv_init_per_cpu(int nuvhubs)
 {
 	int i;
 	int cpu;
 	int pnode;
 	int uvhub;
+	int have_hmaster;
 	short socket = 0;
 	unsigned short socket_mask;
-	unsigned int uvhub_mask;
+	unsigned char *uvhub_mask;
 	struct bau_control *bcp;
 	struct uvhub_desc *bdp;
 	struct socket_desc *sdp;
@@ -1516,28 +1517,29 @@ static void uv_init_per_cpu(int nuvhubs)
 	uvhub_descs = (struct uvhub_desc *)
 		kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
 	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
 		memset(bcp, 0, sizeof(struct bau_control));
 		pnode = uv_cpu_hub_info(cpu)->pnode;
 		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
-		uvhub_mask |= (1 << uvhub);
+		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
 		bdp = &uvhub_descs[uvhub];
 		bdp->num_cpus++;
 		bdp->uvhub = uvhub;
 		bdp->pnode = pnode;
 		/* kludge: 'assuming' one node per socket, and assuming that
 		   disabling a socket just leaves a gap in node numbers */
-		socket = (cpu_to_node(cpu) & 1);;
+		socket = (cpu_to_node(cpu) & 1);
 		bdp->socket_mask |= (1 << socket);
 		sdp = &bdp->socket[socket];
 		sdp->cpu_number[sdp->num_cpus] = cpu;
 		sdp->num_cpus++;
 	}
-	uvhub = 0;
-	while (uvhub_mask) {
-		if (!(uvhub_mask & 1))
-			goto nexthub;
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+		if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
+			continue;
+		have_hmaster = 0;
 		bdp = &uvhub_descs[uvhub];
 		socket_mask = bdp->socket_mask;
 		socket = 0;
@@ -1551,8 +1553,10 @@ static void uv_init_per_cpu(int nuvhubs)
 				bcp->cpu = cpu;
 				if (i == 0) {
 					smaster = bcp;
-					if (socket == 0)
+					if (!have_hmaster) {
+						have_hmaster++;
 						hmaster = bcp;
+					}
 				}
 				bcp->cpus_in_uvhub = bdp->num_cpus;
 				bcp->cpus_in_socket = sdp->num_cpus;
@@ -1566,11 +1570,9 @@ nextsocket:
 			socket++;
 			socket_mask = (socket_mask >> 1);
 		}
-nexthub:
-		uvhub++;
-		uvhub_mask = (uvhub_mask >> 1);
 	}
 	kfree(uvhub_descs);
+	kfree(uvhub_mask);
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
 		bcp->baudisabled = 0;
-- 
cgit v1.2.3-70-g09d2


From 5ee481da7b62a992b91f958bf26aaaa92354c170 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 17 May 2010 17:22:23 +0800
Subject: x86: Export FPU API for KVM use

Also add some constants.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/i387.h  | 2 ++
 arch/x86/include/asm/xsave.h | 3 +++
 arch/x86/kernel/i387.c       | 3 ++-
 arch/x86/kernel/process.c    | 1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b90..815c5b2b9f5 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
 	memcpy(dst->state, src->state, xstate_size);
 }
 
+extern void fpu_finit(struct fpu *fpu);
+
 #endif /* __ASSEMBLY__ */
 
 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae22..29ee4e4c64c 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,6 +13,9 @@
 
 #define FXSAVE_SIZE	512
 
+#define XSTATE_YMM_SIZE 256
+#define XSTATE_YMM_OFFSET (512 + 64)
+
 /*
  * These are the features that the OS can handle currently.
  */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b3225..c4444bce846 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,7 +107,7 @@ void __cpuinit fpu_init(void)
 }
 #endif	/* CONFIG_X86_64 */
 
-static void fpu_finit(struct fpu *fpu)
+void fpu_finit(struct fpu *fpu)
 {
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
@@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu)
 		fp->fos = 0xffff0000u;
 	}
 }
+EXPORT_SYMBOL_GPL(fpu_finit);
 
 /*
  * The _current_ task is using the FPU for the first time
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32..ebcfcceccc7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
 EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
+EXPORT_SYMBOL_GPL(task_xstate_cachep);
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-- 
cgit v1.2.3-70-g09d2


From c15a5958a0b6dbf06b3c05972694f04a0c50a4cf Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sat, 31 Jul 2010 12:48:22 -0400
Subject: x86-64, asm: Directly access per-cpu IST

Use a direct per-cpu reference for the IST instead of using a scratch
register.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1280594903-6341-1-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/entry_64.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4db7c4d12ff..59af275b37a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
 END(\sym)
 .endm
 
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
 .macro paranoidzeroentry_ist sym do_sym ist
 ENTRY(\sym)
 	INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
 	TRACE_IRQS_OFF
 	movq %rsp,%rdi		/* pt_regs pointer */
 	xorl %esi,%esi		/* no error code */
-	PER_CPU(init_tss, %r12)
-	subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
+	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
 	call \do_sym
-	addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
+	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
 	jmp paranoid_exit	/* %ebx: no swapgs flag */
 	CFI_ENDPROC
 END(\sym)
-- 
cgit v1.2.3-70-g09d2


From 72c511dd596cff88d6523f231a0fbb8f73006d51 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sat, 31 Jul 2010 12:48:23 -0400
Subject: x86-32, asm: Directly access per-cpu GDT

Use a direct per-cpu reference for the GDT instead of using a scratch
register.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1280594903-6341-2-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/entry_32.S | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf15..233c5829e7a 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -611,14 +611,14 @@ ldt_ss:
  * compensating for the offset by changing to the ESPFIX segment with
  * a base address that matches for the difference.
  */
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
 	mov %esp, %edx			/* load kernel esp */
 	mov PT_OLDESP(%esp), %eax	/* load userspace esp */
 	mov %dx, %ax			/* eax: new kernel esp */
 	sub %eax, %edx			/* offset (low word is 0) */
-	PER_CPU(gdt_page, %ebx)
 	shr $16, %edx
-	mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
-	mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
+	mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+	mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
 	pushl $__ESPFIX_SS
 	CFI_ADJUST_CFA_OFFSET 4
 	push %eax			/* new kernel esp */
@@ -791,9 +791,8 @@ ptregs_clone:
  * normal stack and adjusts ESP with the matching offset.
  */
 	/* fixup the stack */
-	PER_CPU(gdt_page, %ebx)
-	mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
-	mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
+	mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
 	shl $16, %eax
 	addl %esp, %eax			/* the adjusted stack pointer */
 	pushl $__KERNEL_DS
-- 
cgit v1.2.3-70-g09d2


From e8c534ec068af1a0845aceda373a9bfd2de62030 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Tue, 27 Jul 2010 18:53:35 +0200
Subject: x86: Fix keeping track of AMD C1E

Accommodate the original C1E-aware idle routine to the different times
during boot when the BIOS enables C1E. While at it, remove the synthetic
CPUID flag in favor of a single global setting which denotes C1E status
on the system.

[ hpa: changed c1e_enabled to be a bool; clarified cpu bit 3:21 comment ]

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
LKML-Reference: <20100727165335.GA11630@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/acpi.h       | 2 +-
 arch/x86/include/asm/cpufeature.h | 2 +-
 arch/x86/include/asm/processor.h  | 1 +
 arch/x86/kernel/process.c         | 8 +++++---
 drivers/acpi/processor_idle.c     | 2 +-
 5 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index aa2c39d968f..92091de1111 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
 	    boot_cpu_data.x86_model <= 0x05 &&
 	    boot_cpu_data.x86_mask < 0x0A)
 		return 1;
-	else if (boot_cpu_has(X86_FEATURE_AMDC1E))
+	else if (c1e_detected)
 		return 1;
 	else
 		return max_cstate;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 817aa316b18..0b205b8a430 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -89,7 +89,7 @@
 #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
 #define X86_FEATURE_11AP	(3*32+19) /* "" Bad local APIC aka 11AP */
 #define X86_FEATURE_NOPL	(3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_AMDC1E	(3*32+21) /* AMD C1E detected */
+					  /* 21 available, was AMD_C1E */
 #define X86_FEATURE_XTOPOLOGY	(3*32+22) /* cpu topology enum extensions */
 #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
 #define X86_FEATURE_NONSTOP_TSC	(3*32+24) /* TSC does not stop in C states */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d85637bb950..325b7bdbeba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -762,6 +762,7 @@ extern void init_c1e_mask(void);
 extern unsigned long		boot_option_idle_override;
 extern unsigned long		idle_halt;
 extern unsigned long		idle_nomwait;
+extern bool			c1e_detected;
 
 /*
  * on systems with caches, caches must be flashed as the absolute
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 553b02f1309..b944f89c4e6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -525,8 +525,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
 	return (edx & MWAIT_EDX_C1);
 }
 
+bool c1e_detected;
+EXPORT_SYMBOL(c1e_detected);
+
 static cpumask_var_t c1e_mask;
-static int c1e_detected;
 
 void c1e_remove_cpu(int cpu)
 {
@@ -548,12 +550,12 @@ static void c1e_idle(void)
 		u32 lo, hi;
 
 		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+
 		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
-			c1e_detected = 1;
+			c1e_detected = true;
 			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 				mark_tsc_unstable("TSC halt in AMD C1E");
 			printk(KERN_INFO "System has AMD C1E enabled\n");
-			set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
 		}
 	}
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index e9a8026d39f..eead3f581fb 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -164,7 +164,7 @@ static void lapic_timer_check_state(int state, struct acpi_processor *pr,
 	if (cpu_has(&cpu_data(pr->id), X86_FEATURE_ARAT))
 		return;
 
-	if (boot_cpu_has(X86_FEATURE_AMDC1E))
+	if (c1e_detected)
 		type = ACPI_STATE_C1;
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From fe96eb404e33b59bb39f7050205f7c56c1c7d686 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 18 Mar 2010 13:53:24 -0400
Subject: x86: Detect whether we should use Xen SWIOTLB.

It is paramount that we call pci_xen_swiotlb_detect before
pci_swiotlb_detect as both implementations use the 'swiotlb'
and 'swiotlb_force' flags. The pci-xen_swiotlb_detect inhibits
the swiotlb_force and swiotlb flag so that the native SWIOTLB
implementation is not enabled when running under Xen.

[since v1 changed two Cc's to Acked-by]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
    [http://lkml.org/lkml/2010/7/27/374]
Cc: Albert Herranz <albert_herranz@yahoo.es>
Cc: Ian Campbell <Ian.Campbell@citrix.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: "H. Peter Anvin" <hpa@zytor.com>
    [conditional http://lkml.org/lkml/2010/8/2/324]
Cc: x86@kernel.org
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/kernel/pci-dma.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 4b7e3d8b01d..9f07cfcbd3a 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -13,6 +13,7 @@
 #include <asm/calgary.h>
 #include <asm/amd_iommu.h>
 #include <asm/x86_init.h>
+#include <asm/xen/swiotlb-xen.h>
 
 static int forbid_dac __read_mostly;
 
@@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void)
 	/* free the range so iommu could get some range less than 4G */
 	dma32_free_bootmem();
 
-	if (pci_swiotlb_detect())
+	if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
 		goto out;
 
 	gart_iommu_hole_init();
@@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void)
 	/* needs to be called after gart_iommu_hole_init */
 	amd_iommu_detect();
 out:
+	pci_xen_swiotlb_init();
+
 	pci_swiotlb_init();
 }
 
@@ -296,7 +299,7 @@ static int __init pci_iommu_init(void)
 #endif
 	x86_init.iommu.iommu_init();
 
-	if (swiotlb) {
+	if (swiotlb || xen_swiotlb) {
 		printk(KERN_INFO "PCI-DMA: "
 		       "Using software bounce buffering for IO (SWIOTLB)\n");
 		swiotlb_print_info();
-- 
cgit v1.2.3-70-g09d2


From 9f242dc10e0c3c1eb32d8c83c18650a35fd7f80d Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Mon, 2 Aug 2010 16:10:37 -0700
Subject: x86, vmware: Preset lpj values when on VMware.

When running on VMware's platform, we have seen situations where
the APs try to calibrate the lpj values and fail to get good calibration
runs because of timing issues. As a result delays don't work correctly
on all cpus.

The solution is to set the preset_lpj value based on the current tsc frequency
value. This is similar to what KVM does as well.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
LKML-Reference: <1280790637.14933.29.camel@ank32.eng.vmware.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/vmware.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff58844..227b0448960 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
 
 static unsigned long vmware_get_tsc_khz(void)
 {
-	uint64_t tsc_hz;
+	uint64_t tsc_hz, lpj;
 	uint32_t eax, ebx, ecx, edx;
 
 	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
 	printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
 			 (unsigned long) tsc_hz / 1000,
 			 (unsigned long) tsc_hz % 1000);
+
+	if (!preset_lpj) {
+		lpj = ((u64)tsc_hz * 1000);
+		do_div(lpj, HZ);
+		preset_lpj = lpj;
+	}
+
 	return tsc_hz;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 98a5ae2d99b78d29d2d31283cd8b481a44f41fd3 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Tue, 18 May 2010 13:59:05 +0200
Subject: x86, mce: Notify about corrected events too

Notify all parties registered on the mce decoder chain about logged
correctable MCEs.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Doug Thompson <dougthompson@xmission.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc4256225..1970ef911c9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -600,6 +600,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 */
 		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
 			mce_log(&m);
+			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
 			add_taint(TAINT_MACHINE_CHECK);
 		}
 
-- 
cgit v1.2.3-70-g09d2


From 5d77b85458f656923b85291a4ff56ed44859ed52 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Tue, 20 Jul 2010 13:52:00 -0400
Subject: [CPUFREQ] pcc driver should check for pcch method before calling _OSC

The pcc specification documents an _OSC method that's incompatible with the
one defined as part of the ACPI spec. This shouldn't be a problem as both
are supposed to be guarded with a UUID. Unfortunately approximately nobody
(including HP, who wrote this spec) properly checks the UUID on entry to the
_OSC call. Right now this could result in surprising behaviour if the pcc
driver performs an _OSC call on a machine that doesn't implement the pcc
specification. Check whether the PCCH method exists first in order to reduce
this probability.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Cc: Naga Chumbalkar <nagananda.chumbalkar@hp.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index ce7cde713e7..01bd25c3c7c 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -397,13 +397,17 @@ static int __init pcc_cpufreq_probe(void)
 	struct pcc_memory_resource *mem_resource;
 	struct pcc_register_resource *reg_resource;
 	union acpi_object *out_obj, *member;
-	acpi_handle handle, osc_handle;
+	acpi_handle handle, osc_handle, pcch_handle;
 	int ret = 0;
 
 	status = acpi_get_handle(NULL, "\\_SB", &handle);
 	if (ACPI_FAILURE(status))
 		return -ENODEV;
 
+	status = acpi_get_handle(handle, "PCCH", &pcch_handle);
+	if (ACPI_FAILURE(status))
+		return -ENODEV;
+
 	status = acpi_get_handle(handle, "_OSC", &osc_handle);
 	if (ACPI_SUCCESS(status)) {
 		ret = pcc_cpufreq_do_osc(&osc_handle);
-- 
cgit v1.2.3-70-g09d2


From 0d9715d64fe118dd0957a29e344972b8d3f960e7 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel.blueman@gmail.com>
Date: Fri, 23 Jul 2010 23:06:52 +0100
Subject: [CPUFREQ] fix double freeing in error path of pcc-cpufreq

Prevent double freeing on error path.

Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index 01bd25c3c7c..900702888bf 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
 		return -ENODEV;
 
 	out_obj = output.pointer;
-	if (out_obj->type != ACPI_TYPE_BUFFER) {
-		ret = -ENODEV;
-		goto out_free;
-	}
+	if (out_obj->type != ACPI_TYPE_BUFFER)
+		return -ENODEV;
 
 	errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
-	if (errors) {
-		ret = -ENODEV;
-		goto out_free;
-	}
+	if (errors)
+		return -ENODEV;
 
 	supported = *((u32 *)(out_obj->buffer.pointer + 4));
-	if (!(supported & 0x1)) {
-		ret = -ENODEV;
-		goto out_free;
-	}
+	if (!(supported & 0x1))
+		return -ENODEV;
 
 out_free:
 	kfree(output.pointer);
-- 
cgit v1.2.3-70-g09d2


From 6ebdf777ba034d2b54c99f28a4b18dabf286d8e5 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Thu, 15 Jul 2010 11:44:00 -0400
Subject: [CPUFREQ] Fix PCC driver error path

The PCC cpufreq driver unmaps the mailbox address range if any CPUs fail to
initialise, but doesn't do anything to remove the registered CPUs from the
cpufreq core resulting in failures further down the line. We're better off
simply returning a failure - the cpufreq core will unregister us cleanly if
we end up with no successfully registered CPUs. Tidy up the failure path
and also add a sanity check to ensure that the firmware gives us a realistic
frequency - the core deals badly with that being set to 0.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Cc: Naga Chumbalkar <nagananda.chumbalkar@hp.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index 900702888bf..a36de5bbb62 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -541,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	if (!pcch_virt_addr) {
 		result = -1;
-		goto pcch_null;
+		goto out;
 	}
 
 	result = pcc_get_offset(cpu);
 	if (result) {
 		dprintk("init: PCCP evaluation failed\n");
-		goto free;
+		goto out;
 	}
 
 	policy->max = policy->cpuinfo.max_freq =
@@ -556,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		ioread32(&pcch_hdr->minimum_frequency) * 1000;
 	policy->cur = pcc_get_freq(cpu);
 
+	if (!policy->cur) {
+		dprintk("init: Unable to get current CPU frequency\n");
+		result = -EINVAL;
+		goto out;
+	}
+
 	dprintk("init: policy->max is %d, policy->min is %d\n",
 		policy->max, policy->min);
-
-	return 0;
-free:
-	pcc_clear_mapping();
-	free_percpu(pcc_cpu_info);
-pcch_null:
+out:
 	return result;
 }
 
-- 
cgit v1.2.3-70-g09d2


From c2f4a2c6e08c7635316dfd25ef706e9104384c56 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 8 Jul 2010 17:55:30 +0200
Subject: [CPUFREQ] powernow-k8: Limit Pstate transition latency check

The Pstate transition latency check was added for broken F10h BIOSen
which wrongly contain a value of 0 for transition and bus master
latency. Fam11h and later, however, (will) have similar transition
latency so extend that behavior for them too.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 7ec2123838e..3e90cce3dc8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1023,13 +1023,12 @@ static int get_transition_latency(struct powernow_k8_data *data)
 	}
 	if (max_latency == 0) {
 		/*
-		 * Fam 11h always returns 0 as transition latency.
-		 * This is intended and means "very fast". While cpufreq core
-		 * and governors currently can handle that gracefully, better
-		 * set it to 1 to avoid problems in the future.
-		 * For all others it's a BIOS bug.
+		 * Fam 11h and later may return 0 as transition latency. This
+		 * is intended and means "very fast". While cpufreq core and
+		 * governors currently can handle that gracefully, better set it
+		 * to 1 to avoid problems in the future.
 		 */
-		if (boot_cpu_data.x86 != 0x11)
+		if (boot_cpu_data.x86 < 0x11)
 			printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
 				"latency\n");
 		max_latency = 1;
-- 
cgit v1.2.3-70-g09d2


From 298decfbc44e9a4cb7862ae1b7dfc4e1ba3551b9 Mon Sep 17 00:00:00 2001
From: Marti Raudsepp <marti@juffo.org>
Date: Wed, 20 Jan 2010 19:19:33 +0200
Subject: [CPUFREQ] powernow-k8: On load failure, remind the user to enable
 support in BIOS setup

On Wed, 2010-01-20 at 16:56 +0100, Thomas Renninger wrote:
> But most often this happens if people upgrade their CPU and do not
> update their BIOS.
> Or the vendor does not recognise the new CPU even if the BIOS got
> updated.

Maybe some of those people just didn't realize it was disabled in BIOS?
If you tell users that it's a firmware bug then they'll probably just
give up.

> The message itself might be an enhancement, IMO it's not worth a patch.

Why do you think so? I spent an hour on hunting down the BIOS upgrade,
only to find that it didn't improve anything. It was a day later that I
realized that it might be a BIOS option; and the option was literally
the _last_ option in the whole BIOS setup. :)

This message would have saved the day.

> But do not revert the FW_BUG part!

Sure, you have a point here.

How about this patch?
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 3e90cce3dc8..c48b44b3b43 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
 	 * www.amd.com
 	 */
 	printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
+	printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
+		" and Cool'N'Quiet support is enabled in BIOS setup\n");
 	return -ENODEV;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 6b72e3934b42930fd40fc42fe762d21be413301c Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Tue, 20 Apr 2010 13:17:35 +0200
Subject: [CPUFREQ] acpi-cpufreq: Fix CPU_ANY CPUFREQ_{PRE,POST}CHANGE
 notification

Signed-off-by: Thomas Renninger <trenn@suse.de>
CC: venki@google.com
CC: davej@redhat.com
CC: arjan@infradead.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1d3cddaa40e..cee7aa949c3 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -351,7 +351,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 
 	freqs.old = perf->states[perf->state].core_frequency * 1000;
 	freqs.new = data->freq_table[next_state].frequency;
-	for_each_cpu(i, cmd.mask) {
+	for_each_cpu(i, policy->cpus) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
@@ -367,7 +367,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
-	for_each_cpu(i, cmd.mask) {
+	for_each_cpu(i, policy->cpus) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
-- 
cgit v1.2.3-70-g09d2


From 6f4f2723d08534fd4e407e1ef8500b0f4d12c30c Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Tue, 20 Apr 2010 13:17:36 +0200
Subject: [CPUFREQ] x86 cpufreq: Make trace_power_frequency cpufreq driver
 independent

and fix the broken case if a core's frequency depends on others.

trace_power_frequency was only implemented in a rather ungeneric way
in acpi-cpufreq driver's target() function only.
-> Move the call to trace_power_frequency to
   cpufreq.c:cpufreq_notify_transition() where CPUFREQ_POSTCHANGE
   notifier is triggered.
   This will support power frequency tracing by all cpufreq drivers

trace_power_frequency did not trace frequency changes correctly when
the userspace governor was used or when CPU cores' frequency depend
on each other.
-> Moving this into the CPUFREQ_POSTCHANGE notifier and pass the cpu
   which gets switched automatically fixes this.

Robert Schoene provided some important fixes on top of my initial
quick shot version which are integrated in this patch:
- Forgot some changes in power_end trace (TP_printk/variable names)
- Variable dummy in power_end must now be cpu_id
- Use static 64 bit variable instead of unsigned int for cpu_id

Signed-off-by: Thomas Renninger <trenn@suse.de>
CC: davej@redhat.com
CC: arjan@infradead.org
CC: linux-kernel@vger.kernel.org
CC: robert.schoene@tu-dresden.de
Tested-by: robert.schoene@tu-dresden.de
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |  3 ---
 arch/x86/kernel/process.c                  |  8 ++++----
 drivers/cpufreq/cpufreq.c                  |  5 +++++
 drivers/cpuidle/cpuidle.c                  |  2 +-
 include/trace/events/power.h               | 27 +++++++++++++++------------
 tools/perf/builtin-timechart.c             | 11 ++++++-----
 6 files changed, 31 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index cee7aa949c3..246cd3afbb5 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
 #include <linux/compiler.h>
 #include <linux/dmi.h>
 #include <linux/slab.h>
-#include <trace/events/power.h>
 
 #include <linux/acpi.h>
 #include <linux/io.h>
@@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
-	trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
-
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32..787572d43d9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -371,7 +371,7 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
-		trace_power_start(POWER_CSTATE, 1);
+		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
-	trace_power_start(POWER_CSTATE, (ax>>4)+1);
+	trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
 	if (!need_resched()) {
 		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
@@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 static void mwait_idle(void)
 {
 	if (!need_resched()) {
-		trace_power_start(POWER_CSTATE, 1);
+		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
 		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
 
@@ -478,7 +478,7 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
-	trace_power_start(POWER_CSTATE, 0);
+	trace_power_start(POWER_CSTATE, 0, smp_processor_id());
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 40877d21908..6ce1bb73563 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -29,6 +29,8 @@
 #include <linux/completion.h>
 #include <linux/mutex.h>
 
+#include <trace/events/power.h>
+
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \
 						"cpufreq-core", msg)
 
@@ -350,6 +352,9 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
 
 	case CPUFREQ_POSTCHANGE:
 		adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);
+		dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
+			(unsigned long)freqs->cpu);
+		trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
 		srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
 				CPUFREQ_POSTCHANGE, freqs);
 		if (likely(policy) && likely(policy->cpu == freqs->cpu))
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 199488576a0..dbefe15bd58 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -95,7 +95,7 @@ static void cpuidle_idle_call(void)
 	/* give the governor an opportunity to reflect on the outcome */
 	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev);
-	trace_power_end(0);
+	trace_power_end(smp_processor_id());
 }
 
 /**
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index c4efe9b8280..35a2a6e7bf1 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -18,52 +18,55 @@ enum {
 
 DECLARE_EVENT_CLASS(power,
 
-	TP_PROTO(unsigned int type, unsigned int state),
+	TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
 
-	TP_ARGS(type, state),
+	TP_ARGS(type, state, cpu_id),
 
 	TP_STRUCT__entry(
 		__field(	u64,		type		)
 		__field(	u64,		state		)
+		__field(	u64,		cpu_id		)
 	),
 
 	TP_fast_assign(
 		__entry->type = type;
 		__entry->state = state;
+		__entry->cpu_id = cpu_id;
 	),
 
-	TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long)__entry->state)
+	TP_printk("type=%lu state=%lu cpu_id=%lu", (unsigned long)__entry->type,
+		(unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
 );
 
 DEFINE_EVENT(power, power_start,
 
-	TP_PROTO(unsigned int type, unsigned int state),
+	TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
 
-	TP_ARGS(type, state)
+	TP_ARGS(type, state, cpu_id)
 );
 
 DEFINE_EVENT(power, power_frequency,
 
-	TP_PROTO(unsigned int type, unsigned int state),
+	TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
 
-	TP_ARGS(type, state)
+	TP_ARGS(type, state, cpu_id)
 );
 
 TRACE_EVENT(power_end,
 
-	TP_PROTO(int dummy),
+	TP_PROTO(unsigned int cpu_id),
 
-	TP_ARGS(dummy),
+	TP_ARGS(cpu_id),
 
 	TP_STRUCT__entry(
-		__field(	u64,		dummy		)
+		__field(	u64,		cpu_id		)
 	),
 
 	TP_fast_assign(
-		__entry->dummy = 0xffff;
+		__entry->cpu_id = cpu_id;
 	),
 
-	TP_printk("dummy=%lu", (unsigned long)__entry->dummy)
+	TP_printk("cpu_id=%lu", (unsigned long)__entry->cpu_id)
 
 );
 
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 5a52ed9fc10..5161619d471 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -300,8 +300,9 @@ struct trace_entry {
 
 struct power_entry {
 	struct trace_entry te;
-	s64	type;
-	s64	value;
+	u64	type;
+	u64	value;
+	u64	cpu_id;
 };
 
 #define TASK_COMM_LEN 16
@@ -498,13 +499,13 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 			return 0;
 
 		if (strcmp(event_str, "power:power_start") == 0)
-			c_state_start(data.cpu, data.time, pe->value);
+			c_state_start(pe->cpu_id, data.time, pe->value);
 
 		if (strcmp(event_str, "power:power_end") == 0)
-			c_state_end(data.cpu, data.time);
+			c_state_end(pe->cpu_id, data.time);
 
 		if (strcmp(event_str, "power:power_frequency") == 0)
-			p_state_change(data.cpu, data.time, pe->value);
+			p_state_change(pe->cpu_id, data.time, pe->value);
 
 		if (strcmp(event_str, "sched:sched_wakeup") == 0)
 			sched_wakeup(data.cpu, data.time, data.pid, te);
-- 
cgit v1.2.3-70-g09d2


From ccc5638a20b0eb3a66666d9d4dd8fe8f5ad40386 Mon Sep 17 00:00:00 2001
From: Kulikov Vasiliy <segooon@gmail.com>
Date: Sat, 3 Jul 2010 20:03:55 +0400
Subject: [CPUFREQ] arch/x86/kernel/cpu/cpufreq: use for_each_pci_dev()

Use for_each_pci_dev() to simplify the code.

Signed-off-by: Kulikov Vasiliy <segooon@gmail.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e..8c3325fee77 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -199,7 +199,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
 	}
 
 	/* detect which companion chip is used */
-	while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
+	for_each_pci_dev(gx_pci) {
 		if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
 			return gx_pci;
 	}
-- 
cgit v1.2.3-70-g09d2


From 55c789bb2bcdcaa8f1f60687b4a9dbd02ffddd88 Mon Sep 17 00:00:00 2001
From: Peter Huewe <peterhuewe@gmx.de>
Date: Thu, 15 Jul 2010 20:36:41 +0200
Subject: [CPUFREQ] Convert pci_table entries to PCI_VDEVICE (if PCI_ANY_ID is
 used)

This patch converts pci_table entries, where .subvendor=PCI_ANY_ID and
.subdevice=PCI_ANY_ID, .class=0 and .class_mask=0, to use the
PCI_VDEVICE macro, and thus improves readability.

Signed-off-by: Peter Huewe <peterhuewe@gmx.de>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 8c3325fee77..32974cf8423 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
  *	Low Level chipset interface				*
  ****************************************************************/
 static struct pci_device_id gx_chipset_tbl[] __initdata = {
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
-		PCI_ANY_ID, PCI_ANY_ID },
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
-		PCI_ANY_ID, PCI_ANY_ID },
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
-		PCI_ANY_ID, PCI_ANY_ID },
+	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
+	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
+	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
 	{ 0, },
 };
 
-- 
cgit v1.2.3-70-g09d2


From b30d3304c9c068ccfe6940232834768af75f8c9a Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 8 Jul 2010 18:05:14 +0200
Subject: [CPUFREQ] powernow-k8: Fix misleading variable naming

rdmsr() takes the lower 32 bits as a second argument and the high 32 as
a third. Fix the names accordingly since they were swapped.

There should be no functionality change resulting from this patch.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index c48b44b3b43..90cab2d4ac0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -912,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 {
 	int i;
 	u32 hi = 0, lo = 0;
-	rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
-	data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
+	rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
+	data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
 
 	for (i = 0; i < data->acpi_data.state_count; i++) {
 		u32 index;
-- 
cgit v1.2.3-70-g09d2


From 7e2d81122052c83feeddbebf706b6d53fba7996d Mon Sep 17 00:00:00 2001
From: Holger Freyther <zecke@selfish.org>
Date: Mon, 19 Jul 2010 03:28:49 +0800
Subject: [CPUFREQ] Fix section mismatch for longrun_cpu_init.

Use __cpuinit instead of __init for the cpufreq_driver
init function like it is done in powernow-k8.c.

This removes the warning generated when compiling with
the CONFIG_DEBUG_SECTION_MISMATCH=y option.

Signed-off-by: Holger Hans Peter Freyther <holger@moiji-mobile.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/longrun.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c5..fc09f142d94 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
  * TMTA rules:
  * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
  */
-static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
-						   unsigned int *high_freq)
+static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
+						      unsigned int *high_freq)
 {
 	u32 msr_lo, msr_hi;
 	u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
 }
 
 
-static int __init longrun_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
 {
 	int result = 0;
 
-- 
cgit v1.2.3-70-g09d2


From 2530573e45c5846cd238db78651f0d236fc78aab Mon Sep 17 00:00:00 2001
From: Holger Freyther <zecke@selfish.org>
Date: Mon, 19 Jul 2010 03:29:03 +0800
Subject: [CPUFREQ] Fix section mismatch for longhaul_cpu_init.

Use __cpuinit instead of __init for the cpufreq_driver
init function like it is done in powernow-k8.c. Use
__cpuinitdata for data used by the routines marked as __cpuinit.

This removes the warning generated when compiling with
the CONFIG_DEBUG_SECTION_MISMATCH=y option.

Signed-off-by: Holger Hans Peter Freyther <holger@moiji-mobile.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/longhaul.c |  6 +++---
 arch/x86/kernel/cpu/cpufreq/longhaul.h | 26 +++++++++++++-------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f826..03162dac627 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
 }
 
 
-static int __init longhaul_get_ranges(void)
+static int __cpuinit longhaul_get_ranges(void)
 {
 	unsigned int i, j, k = 0;
 	unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
 }
 
 
-static void __init longhaul_setup_voltagescaling(void)
+static void __cpuinit longhaul_setup_voltagescaling(void)
 {
 	union msr_longhaul longhaul;
 	struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
 	return 0;
 }
 
-static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
 {
 	struct cpuinfo_x86 *c = &cpu_data(0);
 	char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f7..cbf48fbca88 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
 /*
  * VIA C3 Samuel 1  & Samuel 2 (stepping 0)
  */
-static const int __initdata samuel1_mults[16] = {
+static const int __cpuinitdata samuel1_mults[16] = {
 	-1, /* 0000 -> RESERVED */
 	30, /* 0001 ->  3.0x */
 	40, /* 0010 ->  4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
 	-1, /* 1111 -> RESERVED */
 };
 
-static const int __initdata samuel1_eblcr[16] = {
+static const int __cpuinitdata samuel1_eblcr[16] = {
 	50, /* 0000 -> RESERVED */
 	30, /* 0001 ->  3.0x */
 	40, /* 0010 ->  4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
 /*
  * VIA C3 Samuel2 Stepping 1->15
  */
-static const int __initdata samuel2_eblcr[16] = {
+static const int __cpuinitdata samuel2_eblcr[16] = {
 	50,  /* 0000 ->  5.0x */
 	30,  /* 0001 ->  3.0x */
 	40,  /* 0010 ->  4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
 /*
  * VIA C3 Ezra
  */
-static const int __initdata ezra_mults[16] = {
+static const int __cpuinitdata ezra_mults[16] = {
 	100, /* 0000 -> 10.0x */
 	30,  /* 0001 ->  3.0x */
 	40,  /* 0010 ->  4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
 	120, /* 1111 -> 12.0x */
 };
 
-static const int __initdata ezra_eblcr[16] = {
+static const int __cpuinitdata ezra_eblcr[16] = {
 	50,  /* 0000 ->  5.0x */
 	30,  /* 0001 ->  3.0x */
 	40,  /* 0010 ->  4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
 /*
  * VIA C3 (Ezra-T) [C5M].
  */
-static const int __initdata ezrat_mults[32] = {
+static const int __cpuinitdata ezrat_mults[32] = {
 	100, /* 0000 -> 10.0x */
 	30,  /* 0001 ->  3.0x */
 	40,  /* 0010 ->  4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
 	-1,  /* 1111 -> RESERVED (12.0x) */
 };
 
-static const int __initdata ezrat_eblcr[32] = {
+static const int __cpuinitdata ezrat_eblcr[32] = {
 	50,  /* 0000 ->  5.0x */
 	30,  /* 0001 ->  3.0x */
 	40,  /* 0010 ->  4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
 /*
  * VIA C3 Nehemiah */
 
-static const int __initdata  nehemiah_mults[32] = {
+static const int __cpuinitdata nehemiah_mults[32] = {
 	100, /* 0000 -> 10.0x */
 	-1, /* 0001 -> 16.0x */
 	40,  /* 0010 ->  4.0x */
@@ -270,7 +270,7 @@ static const int __initdata  nehemiah_mults[32] = {
 	-1, /* 1111 -> 12.0x */
 };
 
-static const int __initdata nehemiah_eblcr[32] = {
+static const int __cpuinitdata nehemiah_eblcr[32] = {
 	50,  /* 0000 ->  5.0x */
 	160, /* 0001 -> 16.0x */
 	40,  /* 0010 ->  4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
 	unsigned short pos;
 };
 
-static const struct mV_pos __initdata vrm85_mV[32] = {
+static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
 	{1250, 8},	{1200, 6},	{1150, 4},	{1100, 2},
 	{1050, 0},	{1800, 30},	{1750, 28},	{1700, 26},
 	{1650, 24},	{1600, 22},	{1550, 20},	{1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
 	{1475, 17},	{1425, 15},	{1375, 13},	{1325, 11}
 };
 
-static const unsigned char __initdata mV_vrm85[32] = {
+static const unsigned char __cpuinitdata mV_vrm85[32] = {
 	0x04,	0x14,	0x03,	0x13,	0x02,	0x12,	0x01,	0x11,
 	0x00,	0x10,	0x0f,	0x1f,	0x0e,	0x1e,	0x0d,	0x1d,
 	0x0c,	0x1c,	0x0b,	0x1b,	0x0a,	0x1a,	0x09,	0x19,
 	0x08,	0x18,	0x07,	0x17,	0x06,	0x16,	0x05,	0x15
 };
 
-static const struct mV_pos __initdata mobilevrm_mV[32] = {
+static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
 	{1750, 31},	{1700, 30},	{1650, 29},	{1600, 28},
 	{1550, 27},	{1500, 26},	{1450, 25},	{1400, 24},
 	{1350, 23},	{1300, 22},	{1250, 21},	{1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
 	{675, 3},	{650, 2},	{625, 1},	{600, 0}
 };
 
-static const unsigned char __initdata mV_mobilevrm[32] = {
+static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
 	0x1f,	0x1e,	0x1d,	0x1c,	0x1b,	0x1a,	0x19,	0x18,
 	0x17,	0x16,	0x15,	0x14,	0x13,	0x12,	0x11,	0x10,
 	0x0f,	0x0e,	0x0d,	0x0c,	0x0b,	0x0a,	0x09,	0x08,
-- 
cgit v1.2.3-70-g09d2


From 307069cf6c53632adc27de4f49bf5d1d67cb87bb Mon Sep 17 00:00:00 2001
From: Holger Freyther <zecke@selfish.org>
Date: Mon, 19 Jul 2010 03:29:16 +0800
Subject: [CPUFREQ] Fix section mismatch for powernow_cpu_init in powernow-k7.c

Use __cpuinit instead of __init for the cpufreq_driver
init function like it is done in powernow-k8.c.

This removes the warning generated when compiling with
the CONFIG_DEBUG_SECTION_MISMATCH=y option.

Signed-off-by: Holger Hans Peter Freyther <holger@moiji-mobile.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e..4a45fd6e41b 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
  * We will then get the same kind of behaviour already tested under
  * the "well-known" other OS.
  */
-static int __init fixup_sgtc(void)
+static int __cpuinit fixup_sgtc(void)
 {
 	unsigned int sgtc;
 	unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
 }
 
 
-static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
+static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
 {
 	printk(KERN_WARNING PFX
 		"%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
  * A BIOS update is all that can save them.
  * Mention this, and disable cpufreq.
  */
-static struct dmi_system_id __initdata powernow_dmi_table[] = {
+static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
 	{
 		.callback = acer_cpufreq_pst,
 		.ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
 	{ }
 };
 
-static int __init powernow_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
 {
 	union msr_fidvidstatus fidvidstatus;
 	int result;
-- 
cgit v1.2.3-70-g09d2


From 9d1f44ee206a23b975d7d7c6f759efb25e0e61ac Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Tue, 3 Aug 2010 13:47:30 -0400
Subject: [CPUFREQ] Remove pointless printk from p4-clockmod.

The only machines this is triggering on should be supported by
acpi-cpufreq or acpi's internal throttling.

Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b0..bd1cac747f6 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
 		}
 	}
 
-	if (c->x86 != 0xF) {
-		if (!cpu_has(c, X86_FEATURE_EST))
-			printk(KERN_WARNING PFX "Unknown CPU. "
-				"Please send an e-mail to "
-				"<cpufreq@vger.kernel.org>\n");
+	if (c->x86 != 0xF)
 		return 0;
-	}
 
 	/* on P-4s, the TSC runs with constant frequency independent whether
 	 * throttling is active or not. */
-- 
cgit v1.2.3-70-g09d2


From 55d435a227bd28c77afab326de44dfacc0b15059 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 29 Jul 2010 17:13:44 -0700
Subject: x86, hwmon: Package Level Thermal/Power: thermal throttling handler

Add package level thermal throttle interrupt support. The interrupt handler
increases package level thermal throttle count. It also logs the event in MCE
log.

The package level thermal throttle interrupt happens across threads in a
package. Each thread handles the interrupt individually. User level applications
are supposed to retrieve the correct event count and log based on package/thread
topology. This is the same situation as for the core level interrupt handler. In
the future, the interrupt may be reported only per package or per core.

core_throttle_count and package_throttle_count are used for the user interface.
Previously only throttle_count was used for the core throttle count. If you think
the new core_throttle_count name breaks the user interface, I can change this part.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
LKML-Reference: <1280448826-12004-4-git-send-email-fenghua.yu@intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 89 +++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf971..d307f9f64c2 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -37,7 +37,7 @@
 /*
  * Current thermal throttling state:
  */
-struct thermal_state {
+struct _thermal_state {
 	bool			is_throttled;
 
 	u64			next_check;
@@ -45,6 +45,11 @@ struct thermal_state {
 	unsigned long		last_throttle_count;
 };
 
+struct thermal_state {
+	struct _thermal_state core;
+	struct _thermal_state package;
+};
+
 static DEFINE_PER_CPU(struct thermal_state, thermal_state);
 
 static atomic_t therm_throt_en	= ATOMIC_INIT(0);
@@ -53,11 +58,13 @@ static u32 lvtthmr_init __read_mostly;
 
 #ifdef CONFIG_SYSFS
 #define define_therm_throt_sysdev_one_ro(_name)				\
-	static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+	static SYSDEV_ATTR(_name, 0444,					\
+			   therm_throt_sysdev_show_##_name,		\
+				   NULL)				\
 
-#define define_therm_throt_sysdev_show_func(name)			\
+#define define_therm_throt_sysdev_show_func(level, name)		\
 									\
-static ssize_t therm_throt_sysdev_show_##name(				\
+static ssize_t therm_throt_sysdev_show_##level##_##name(		\
 			struct sys_device *dev,				\
 			struct sysdev_attribute *attr,			\
 			char *buf)					\
@@ -66,21 +73,24 @@ static ssize_t therm_throt_sysdev_show_##name(				\
 	ssize_t ret;							\
 									\
 	preempt_disable();	/* CPU hotplug */			\
-	if (cpu_online(cpu))						\
+	if (cpu_online(cpu)) {						\
 		ret = sprintf(buf, "%lu\n",				\
-			      per_cpu(thermal_state, cpu).name);	\
-	else								\
+			      per_cpu(thermal_state, cpu).level.name);	\
+	} else								\
 		ret = 0;						\
 	preempt_enable();						\
 									\
 	return ret;							\
 }
 
-define_therm_throt_sysdev_show_func(throttle_count);
-define_therm_throt_sysdev_one_ro(throttle_count);
+define_therm_throt_sysdev_show_func(core, throttle_count);
+define_therm_throt_sysdev_one_ro(core_throttle_count);
+
+define_therm_throt_sysdev_show_func(package, throttle_count);
+define_therm_throt_sysdev_one_ro(package_throttle_count);
 
 static struct attribute *thermal_throttle_attrs[] = {
-	&attr_throttle_count.attr,
+	&attr_core_throttle_count.attr,
 	NULL
 };
 
@@ -106,16 +116,21 @@ static struct attribute_group thermal_throttle_attr_group = {
  *          1 : Event should be logged further, and a message has been
  *              printed to the syslog.
  */
-static int therm_throt_process(bool is_throttled)
+#define CORE_LEVEL	0
+#define PACKAGE_LEVEL	1
+static int therm_throt_process(bool is_throttled, int level)
 {
-	struct thermal_state *state;
+	struct _thermal_state *state;
 	unsigned int this_cpu;
 	bool was_throttled;
 	u64 now;
 
 	this_cpu = smp_processor_id();
 	now = get_jiffies_64();
-	state = &per_cpu(thermal_state, this_cpu);
+	if (level == CORE_LEVEL)
+		state = &per_cpu(thermal_state, this_cpu).core;
+	else
+		state = &per_cpu(thermal_state, this_cpu).package;
 
 	was_throttled = state->is_throttled;
 	state->is_throttled = is_throttled;
@@ -132,13 +147,18 @@ static int therm_throt_process(bool is_throttled)
 
 	/* if we just entered the thermal event */
 	if (is_throttled) {
-		printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
+		printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+		      this_cpu,
+		      level == CORE_LEVEL ? "Core" : "Package",
+		      state->throttle_count);
 
 		add_taint(TAINT_MACHINE_CHECK);
 		return 1;
 	}
 	if (was_throttled) {
-		printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
+		printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
+		       this_cpu,
+		       level == CORE_LEVEL ? "Core" : "Package");
 		return 1;
 	}
 
@@ -149,8 +169,19 @@ static int therm_throt_process(bool is_throttled)
 /* Add/Remove thermal_throttle interface for CPU device: */
 static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
 {
-	return sysfs_create_group(&sys_dev->kobj,
-				  &thermal_throttle_attr_group);
+	int err;
+	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+	err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	if (err)
+		return err;
+
+	if (cpu_has(c, X86_FEATURE_PTS))
+		err = sysfs_add_file_to_group(&sys_dev->kobj,
+					      &attr_package_throttle_count.attr,
+					      thermal_throttle_attr_group.name);
+
+	return err;
 }
 
 static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -230,10 +261,25 @@ device_initcall(thermal_throttle_init_device);
 static void intel_thermal_interrupt(void)
 {
 	__u64 msr_val;
+	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-	if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
+	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+				CORE_LEVEL) != 0)
 		mce_log_therm_throt_event(msr_val);
+
+	if (cpu_has(c, X86_FEATURE_PTS)) {
+		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+		if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+					PACKAGE_LEVEL) != 0)
+			/*
+			 * Set up the most significant bit to notify mce log
+			 * that this thermal event is a package level event.
+			 * This is a temp solution. May be changed in the future
+			 * with mce log infrasture.
+			 */
+			mce_log_therm_throt_event(((__u64)1 << 63) | msr_val);
+	}
 }
 
 static void unexpected_thermal_interrupt(void)
@@ -338,6 +384,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	wrmsr(MSR_IA32_THERM_INTERRUPT,
 		l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
 
+	if (cpu_has(c, X86_FEATURE_PTS)) {
+		rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+			l | (PACKAGE_THERM_INT_LOW_ENABLE
+		  | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+	}
+
 	smp_thermal_vector = intel_thermal_interrupt;
 
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-- 
cgit v1.2.3-70-g09d2


From 0199114c31798af5b83841b21759b64171060d9b Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 29 Jul 2010 17:13:45 -0700
Subject: x86, hwmon: Package Level Thermal/Power: power limit

The power limit notification feature is published in the Intel 64 and IA-32
Architectures SDM Vol 3A, section 14.5.6, Power Limit Notification.

It is implemented first on Intel Sandy Bridge platform.

The patch handles the notification interrupt. The interrupt handler dumps power
limit information in log_buf, logs the event in the mce log, and increases the
event counters (core_power_limit and package_power_limit). Upper level
applications could use the data to detect system health or diagnose
functionality/performance issues.

In the future, the event could be handled in a more sophisticated way.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
LKML-Reference: <1280448826-12004-5-git-send-email-fenghua.yu@intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 183 ++++++++++++++++++++++---------
 1 file changed, 129 insertions(+), 54 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d307f9f64c2..c2a8b26d4fe 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,20 +34,25 @@
 /* How long to wait between reporting thermal events */
 #define CHECK_INTERVAL		(300 * HZ)
 
+#define THERMAL_THROTTLING_EVENT	0
+#define POWER_LIMIT_EVENT		1
+
 /*
- * Current thermal throttling state:
+ * Current thermal event state:
  */
 struct _thermal_state {
-	bool			is_throttled;
-
+	bool			new_event;
+	int			event;
 	u64			next_check;
-	unsigned long		throttle_count;
-	unsigned long		last_throttle_count;
+	unsigned long		count;
+	unsigned long		last_count;
 };
 
 struct thermal_state {
-	struct _thermal_state core;
-	struct _thermal_state package;
+	struct _thermal_state core_throttle;
+	struct _thermal_state core_power_limit;
+	struct _thermal_state package_throttle;
+	struct _thermal_state package_power_limit;
 };
 
 static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -62,9 +67,9 @@ static u32 lvtthmr_init __read_mostly;
 			   therm_throt_sysdev_show_##_name,		\
 				   NULL)				\
 
-#define define_therm_throt_sysdev_show_func(level, name)		\
+#define define_therm_throt_sysdev_show_func(event, name)		\
 									\
-static ssize_t therm_throt_sysdev_show_##level##_##name(		\
+static ssize_t therm_throt_sysdev_show_##event##_##name(		\
 			struct sys_device *dev,				\
 			struct sysdev_attribute *attr,			\
 			char *buf)					\
@@ -75,7 +80,7 @@ static ssize_t therm_throt_sysdev_show_##level##_##name(		\
 	preempt_disable();	/* CPU hotplug */			\
 	if (cpu_online(cpu)) {						\
 		ret = sprintf(buf, "%lu\n",				\
-			      per_cpu(thermal_state, cpu).level.name);	\
+			      per_cpu(thermal_state, cpu).event.name);	\
 	} else								\
 		ret = 0;						\
 	preempt_enable();						\
@@ -83,23 +88,32 @@ static ssize_t therm_throt_sysdev_show_##level##_##name(		\
 	return ret;							\
 }
 
-define_therm_throt_sysdev_show_func(core, throttle_count);
+define_therm_throt_sysdev_show_func(core_throttle, count);
 define_therm_throt_sysdev_one_ro(core_throttle_count);
 
-define_therm_throt_sysdev_show_func(package, throttle_count);
+define_therm_throt_sysdev_show_func(core_power_limit, count);
+define_therm_throt_sysdev_one_ro(core_power_limit_count);
+
+define_therm_throt_sysdev_show_func(package_throttle, count);
 define_therm_throt_sysdev_one_ro(package_throttle_count);
 
+define_therm_throt_sysdev_show_func(package_power_limit, count);
+define_therm_throt_sysdev_one_ro(package_power_limit_count);
+
 static struct attribute *thermal_throttle_attrs[] = {
 	&attr_core_throttle_count.attr,
 	NULL
 };
 
-static struct attribute_group thermal_throttle_attr_group = {
+static struct attribute_group thermal_attr_group = {
 	.attrs	= thermal_throttle_attrs,
 	.name	= "thermal_throttle"
 };
 #endif /* CONFIG_SYSFS */
 
+#define CORE_LEVEL	0
+#define PACKAGE_LEVEL	1
+
 /***
  * therm_throt_process - Process thermal throttling event from interrupt
  * @curr: Whether the condition is current or not (boolean), since the
@@ -116,49 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
  *          1 : Event should be logged further, and a message has been
  *              printed to the syslog.
  */
-#define CORE_LEVEL	0
-#define PACKAGE_LEVEL	1
-static int therm_throt_process(bool is_throttled, int level)
+static int therm_throt_process(bool new_event, int event, int level)
 {
 	struct _thermal_state *state;
-	unsigned int this_cpu;
-	bool was_throttled;
+	unsigned int this_cpu = smp_processor_id();
+	bool old_event;
 	u64 now;
+	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
 
-	this_cpu = smp_processor_id();
 	now = get_jiffies_64();
-	if (level == CORE_LEVEL)
-		state = &per_cpu(thermal_state, this_cpu).core;
-	else
-		state = &per_cpu(thermal_state, this_cpu).package;
+	if (level == CORE_LEVEL) {
+		if (event == THERMAL_THROTTLING_EVENT)
+			state = &pstate->core_throttle;
+		else if (event == POWER_LIMIT_EVENT)
+			state = &pstate->core_power_limit;
+		else
+			 return 0;
+	} else if (level == PACKAGE_LEVEL) {
+		if (event == THERMAL_THROTTLING_EVENT)
+			state = &pstate->package_throttle;
+		else if (event == POWER_LIMIT_EVENT)
+			state = &pstate->package_power_limit;
+		else
+			return 0;
+	} else
+		return 0;
 
-	was_throttled = state->is_throttled;
-	state->is_throttled = is_throttled;
+	old_event = state->new_event;
+	state->new_event = new_event;
 
-	if (is_throttled)
-		state->throttle_count++;
+	if (new_event)
+		state->count++;
 
 	if (time_before64(now, state->next_check) &&
-			state->throttle_count != state->last_throttle_count)
+			state->count != state->last_count)
 		return 0;
 
 	state->next_check = now + CHECK_INTERVAL;
-	state->last_throttle_count = state->throttle_count;
+	state->last_count = state->count;
 
 	/* if we just entered the thermal event */
-	if (is_throttled) {
-		printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
-		      this_cpu,
-		      level == CORE_LEVEL ? "Core" : "Package",
-		      state->throttle_count);
+	if (new_event) {
+		if (event == THERMAL_THROTTLING_EVENT)
+			printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+				this_cpu,
+				level == CORE_LEVEL ? "Core" : "Package",
+				state->count);
+		else
+			printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
+				this_cpu,
+				level == CORE_LEVEL ? "Core" : "Package",
+				state->count);
 
 		add_taint(TAINT_MACHINE_CHECK);
 		return 1;
 	}
-	if (was_throttled) {
-		printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
-		       this_cpu,
-		       level == CORE_LEVEL ? "Core" : "Package");
+	if (old_event) {
+		if (event == THERMAL_THROTTLING_EVENT)
+			printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
+				this_cpu,
+				level == CORE_LEVEL ? "Core" : "Package");
+		else
+			printk(KERN_INFO "CPU%d: %s power limit normal\n",
+				this_cpu,
+				level == CORE_LEVEL ? "Core" : "Package");
 		return 1;
 	}
 
@@ -172,21 +207,29 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
 	int err;
 	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
 
-	err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
 	if (err)
 		return err;
 
+	if (cpu_has(c, X86_FEATURE_PLN))
+		err = sysfs_add_file_to_group(&sys_dev->kobj,
+					      &attr_core_power_limit_count.attr,
+					      thermal_attr_group.name);
 	if (cpu_has(c, X86_FEATURE_PTS))
 		err = sysfs_add_file_to_group(&sys_dev->kobj,
 					      &attr_package_throttle_count.attr,
-					      thermal_throttle_attr_group.name);
+					      thermal_attr_group.name);
+		if (cpu_has(c, X86_FEATURE_PLN))
+			err = sysfs_add_file_to_group(&sys_dev->kobj,
+					&attr_package_power_limit_count.attr,
+					thermal_attr_group.name);
 
 	return err;
 }
 
 static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
 {
-	sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
 }
 
 /* Mutex protecting device creation against CPU hotplug: */
@@ -257,6 +300,17 @@ device_initcall(thermal_throttle_init_device);
 
 #endif /* CONFIG_SYSFS */
 
+/*
+ * Set up the most two significant bit to notify mce log that this thermal
+ * event type.
+ * This is a temp solution. May be changed in the future with mce log
+ * infrasture.
+ */
+#define CORE_THROTTLED		(0)
+#define CORE_POWER_LIMIT	((__u64)1 << 62)
+#define PACKAGE_THROTTLED	((__u64)2 << 62)
+#define PACKAGE_POWER_LIMIT	((__u64)3 << 62)
+
 /* Thermal transition interrupt handler */
 static void intel_thermal_interrupt(void)
 {
@@ -264,21 +318,31 @@ static void intel_thermal_interrupt(void)
 	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+
 	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+				THERMAL_THROTTLING_EVENT,
 				CORE_LEVEL) != 0)
-		mce_log_therm_throt_event(msr_val);
+		mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+
+	if (cpu_has(c, X86_FEATURE_PLN))
+		if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+					POWER_LIMIT_EVENT,
+					CORE_LEVEL) != 0)
+			mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
 
 	if (cpu_has(c, X86_FEATURE_PTS)) {
 		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
 		if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+					THERMAL_THROTTLING_EVENT,
 					PACKAGE_LEVEL) != 0)
-			/*
-			 * Set up the most significant bit to notify mce log
-			 * that this thermal event is a package level event.
-			 * This is a temp solution. May be changed in the future
-			 * with mce log infrasture.
-			 */
-			mce_log_therm_throt_event(((__u64)1 << 63) | msr_val);
+			mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+		if (cpu_has(c, X86_FEATURE_PLN))
+			if (therm_throt_process(msr_val &
+					PACKAGE_THERM_STATUS_POWER_LIMIT,
+					POWER_LIMIT_EVENT,
+					PACKAGE_LEVEL) != 0)
+				mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
+							  | msr_val);
 	}
 }
 
@@ -381,14 +445,25 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	apic_write(APIC_LVTTHMR, h);
 
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr(MSR_IA32_THERM_INTERRUPT,
-		l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+	if (cpu_has(c, X86_FEATURE_PLN))
+		wrmsr(MSR_IA32_THERM_INTERRUPT,
+		      l | (THERM_INT_LOW_ENABLE
+			| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+	else
+		wrmsr(MSR_IA32_THERM_INTERRUPT,
+		      l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
 
 	if (cpu_has(c, X86_FEATURE_PTS)) {
 		rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
-		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
-			l | (PACKAGE_THERM_INT_LOW_ENABLE
-		  | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+		if (cpu_has(c, X86_FEATURE_PLN))
+			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+			      l | (PACKAGE_THERM_INT_LOW_ENABLE
+				| PACKAGE_THERM_INT_HIGH_ENABLE
+				| PACKAGE_THERM_INT_PLN_ENABLE), h);
+		else
+			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+			      l | (PACKAGE_THERM_INT_LOW_ENABLE
+				| PACKAGE_THERM_INT_HIGH_ENABLE), h);
 	}
 
 	smp_thermal_vector = intel_thermal_interrupt;
-- 
cgit v1.2.3-70-g09d2


From 12bfa3de63504d879ae427ec1f2884fc46556157 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 5 Aug 2010 09:22:20 -0500
Subject: kgdb,x86: Individual register get/set for x86

Implement the ability to individually get and set registers for kdb
and kgdb for x86.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: x86@kernel.org
---
 arch/x86/include/asm/kgdb.h |  20 +++---
 arch/x86/kernel/kgdb.c      | 168 ++++++++++++++++++++++----------------------
 2 files changed, 94 insertions(+), 94 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 006da3687cd..396f5b5fc4d 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -39,9 +39,11 @@ enum regnames {
 	GDB_FS,			/* 14 */
 	GDB_GS,			/* 15 */
 };
+#define GDB_ORIG_AX		41
+#define DBG_MAX_REG_NUM		16
 #define NUMREGBYTES		((GDB_GS+1)*4)
 #else /* ! CONFIG_X86_32 */
-enum regnames64 {
+enum regnames {
 	GDB_AX,			/* 0 */
 	GDB_BX,			/* 1 */
 	GDB_CX,			/* 2 */
@@ -59,15 +61,15 @@ enum regnames64 {
 	GDB_R14,		/* 14 */
 	GDB_R15,		/* 15 */
 	GDB_PC,			/* 16 */
+	GDB_PS,			/* 17 */
+	GDB_CS,			/* 18 */
+	GDB_SS,			/* 19 */
 };
-
-enum regnames32 {
-	GDB_PS = 34,
-	GDB_CS,
-	GDB_SS,
-};
-#define NUMREGBYTES		((GDB_SS+1)*4)
-#endif /* CONFIG_X86_32 */
+#define GDB_ORIG_AX		57
+#define DBG_MAX_REG_NUM		20
+/* 17 64 bit regs and 3 32 bit regs */
+#define NUMREGBYTES		((17 * 8) + (3 * 4))
+#endif /* ! CONFIG_X86_32 */
 
 static inline void arch_kgdb_breakpoint(void)
 {
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 01ab17ae2ae..bae89825e14 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -49,55 +49,94 @@
 #include <asm/system.h>
 #include <asm/apic.h>
 
-/**
- *	pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
- *	@gdb_regs: A pointer to hold the registers in the order GDB wants.
- *	@regs: The &struct pt_regs of the current process.
- *
- *	Convert the pt_regs in @regs into the format for registers that
- *	GDB expects, stored in @gdb_regs.
- */
-void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
 {
-#ifndef CONFIG_X86_32
-	u32 *gdb_regs32 = (u32 *)gdb_regs;
+#ifdef CONFIG_X86_32
+	{ "ax", 4, offsetof(struct pt_regs, ax) },
+	{ "cx", 4, offsetof(struct pt_regs, cx) },
+	{ "dx", 4, offsetof(struct pt_regs, dx) },
+	{ "bx", 4, offsetof(struct pt_regs, bx) },
+	{ "sp", 4, offsetof(struct pt_regs, sp) },
+	{ "bp", 4, offsetof(struct pt_regs, bp) },
+	{ "si", 4, offsetof(struct pt_regs, si) },
+	{ "di", 4, offsetof(struct pt_regs, di) },
+	{ "ip", 4, offsetof(struct pt_regs, ip) },
+	{ "flags", 4, offsetof(struct pt_regs, flags) },
+	{ "cs", 4, offsetof(struct pt_regs, cs) },
+	{ "ss", 4, offsetof(struct pt_regs, ss) },
+	{ "ds", 4, offsetof(struct pt_regs, ds) },
+	{ "es", 4, offsetof(struct pt_regs, es) },
+	{ "fs", 4, -1 },
+	{ "gs", 4, -1 },
+#else
+	{ "ax", 8, offsetof(struct pt_regs, ax) },
+	{ "bx", 8, offsetof(struct pt_regs, bx) },
+	{ "cx", 8, offsetof(struct pt_regs, cx) },
+	{ "dx", 8, offsetof(struct pt_regs, dx) },
+	{ "si", 8, offsetof(struct pt_regs, dx) },
+	{ "di", 8, offsetof(struct pt_regs, di) },
+	{ "bp", 8, offsetof(struct pt_regs, bp) },
+	{ "sp", 8, offsetof(struct pt_regs, sp) },
+	{ "r8", 8, offsetof(struct pt_regs, r8) },
+	{ "r9", 8, offsetof(struct pt_regs, r9) },
+	{ "r10", 8, offsetof(struct pt_regs, r10) },
+	{ "r11", 8, offsetof(struct pt_regs, r11) },
+	{ "r12", 8, offsetof(struct pt_regs, r12) },
+	{ "r13", 8, offsetof(struct pt_regs, r13) },
+	{ "r14", 8, offsetof(struct pt_regs, r14) },
+	{ "r15", 8, offsetof(struct pt_regs, r15) },
+	{ "ip", 8, offsetof(struct pt_regs, ip) },
+	{ "flags", 4, offsetof(struct pt_regs, flags) },
+	{ "cs", 4, offsetof(struct pt_regs, cs) },
+	{ "ss", 4, offsetof(struct pt_regs, ss) },
 #endif
-	gdb_regs[GDB_AX]	= regs->ax;
-	gdb_regs[GDB_BX]	= regs->bx;
-	gdb_regs[GDB_CX]	= regs->cx;
-	gdb_regs[GDB_DX]	= regs->dx;
-	gdb_regs[GDB_SI]	= regs->si;
-	gdb_regs[GDB_DI]	= regs->di;
-	gdb_regs[GDB_BP]	= regs->bp;
-	gdb_regs[GDB_PC]	= regs->ip;
+};
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (
 #ifdef CONFIG_X86_32
-	gdb_regs[GDB_PS]	= regs->flags;
-	gdb_regs[GDB_DS]	= regs->ds;
-	gdb_regs[GDB_ES]	= regs->es;
-	gdb_regs[GDB_CS]	= regs->cs;
-	gdb_regs[GDB_FS]	= 0xFFFF;
-	gdb_regs[GDB_GS]	= 0xFFFF;
-	if (user_mode_vm(regs)) {
-		gdb_regs[GDB_SS] = regs->ss;
-		gdb_regs[GDB_SP] = regs->sp;
-	} else {
-		gdb_regs[GDB_SS] = __KERNEL_DS;
-		gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+	    regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
+#endif
+	    regno == GDB_SP || regno == GDB_ORIG_AX)
+		return 0;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+		       dbg_reg_def[regno].size);
+	return 0;
+}
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno == GDB_ORIG_AX) {
+		memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+		return "orig_ax";
 	}
-#else
-	gdb_regs[GDB_R8]	= regs->r8;
-	gdb_regs[GDB_R9]	= regs->r9;
-	gdb_regs[GDB_R10]	= regs->r10;
-	gdb_regs[GDB_R11]	= regs->r11;
-	gdb_regs[GDB_R12]	= regs->r12;
-	gdb_regs[GDB_R13]	= regs->r13;
-	gdb_regs[GDB_R14]	= regs->r14;
-	gdb_regs[GDB_R15]	= regs->r15;
-	gdb_regs32[GDB_PS]	= regs->flags;
-	gdb_regs32[GDB_CS]	= regs->cs;
-	gdb_regs32[GDB_SS]	= regs->ss;
-	gdb_regs[GDB_SP]	= kernel_stack_pointer(regs);
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return NULL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+		       dbg_reg_def[regno].size);
+
+	switch (regno) {
+#ifdef CONFIG_X86_32
+	case GDB_SS:
+		if (!user_mode_vm(regs))
+			*(unsigned long *)mem = __KERNEL_DS;
+		break;
+	case GDB_SP:
+		if (!user_mode_vm(regs))
+			*(unsigned long *)mem = kernel_stack_pointer(regs);
+		break;
+	case GDB_GS:
+	case GDB_FS:
+		*(unsigned long *)mem = 0xFFFF;
+		break;
 #endif
+	}
+	return dbg_reg_def[regno].name;
 }
 
 /**
@@ -150,47 +189,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 	gdb_regs[GDB_SP]	= p->thread.sp;
 }
 
-/**
- *	gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
- *	@gdb_regs: A pointer to hold the registers we've received from GDB.
- *	@regs: A pointer to a &struct pt_regs to hold these values in.
- *
- *	Convert the GDB regs in @gdb_regs into the pt_regs, and store them
- *	in @regs.
- */
-void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
-{
-#ifndef CONFIG_X86_32
-	u32 *gdb_regs32 = (u32 *)gdb_regs;
-#endif
-	regs->ax		= gdb_regs[GDB_AX];
-	regs->bx		= gdb_regs[GDB_BX];
-	regs->cx		= gdb_regs[GDB_CX];
-	regs->dx		= gdb_regs[GDB_DX];
-	regs->si		= gdb_regs[GDB_SI];
-	regs->di		= gdb_regs[GDB_DI];
-	regs->bp		= gdb_regs[GDB_BP];
-	regs->ip		= gdb_regs[GDB_PC];
-#ifdef CONFIG_X86_32
-	regs->flags		= gdb_regs[GDB_PS];
-	regs->ds		= gdb_regs[GDB_DS];
-	regs->es		= gdb_regs[GDB_ES];
-	regs->cs		= gdb_regs[GDB_CS];
-#else
-	regs->r8		= gdb_regs[GDB_R8];
-	regs->r9		= gdb_regs[GDB_R9];
-	regs->r10		= gdb_regs[GDB_R10];
-	regs->r11		= gdb_regs[GDB_R11];
-	regs->r12		= gdb_regs[GDB_R12];
-	regs->r13		= gdb_regs[GDB_R13];
-	regs->r14		= gdb_regs[GDB_R14];
-	regs->r15		= gdb_regs[GDB_R15];
-	regs->flags		= gdb_regs32[GDB_PS];
-	regs->cs		= gdb_regs32[GDB_CS];
-	regs->ss		= gdb_regs32[GDB_SS];
-#endif
-}
-
 static struct hw_breakpoint {
 	unsigned		enabled;
 	unsigned long		addr;
-- 
cgit v1.2.3-70-g09d2


From 9264b278be42c031dc76517a0d4bb154f5dcf470 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 5 Aug 2010 09:22:24 -0500
Subject: KGDB: Remove set but unused newPC

Found by gcc 4.6's new warnings

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index bae89825e14..a8b80979ceb 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -456,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
 {
 	unsigned long addr;
 	char *ptr;
-	int newPC;
 
 	switch (remcomInBuffer[0]) {
 	case 'c':
@@ -467,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
 			linux_regs->ip = addr;
 	case 'D':
 	case 'k':
-		newPC = linux_regs->ip;
-
 		/* clear the trace bit */
 		linux_regs->flags &= ~X86_EFLAGS_TF;
 		atomic_set(&kgdb_cpu_doing_single_step, -1);
-- 
cgit v1.2.3-70-g09d2


From df4939350b345ebb44937902827aa75b8ad4998c Mon Sep 17 00:00:00 2001
From: Dongdong Deng <dongdong.deng@windriver.com>
Date: Thu, 5 Aug 2010 09:22:25 -0500
Subject: kgdb,x86: use macro HBP_NUM to replace magic number 4

Use the macros provided by the HW breakpoint API.

Signed-off-by: Dongdong Deng <dongdong.deng@windriver.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index a8b80979ceb..ef10940e1af 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -195,7 +195,7 @@ static struct hw_breakpoint {
 	int			len;
 	int			type;
 	struct perf_event	**pev;
-} breakinfo[4];
+} breakinfo[HBP_NUM];
 
 static unsigned long early_dr7;
 
@@ -203,7 +203,7 @@ static void kgdb_correct_hw_break(void)
 {
 	int breakno;
 
-	for (breakno = 0; breakno < 4; breakno++) {
+	for (breakno = 0; breakno < HBP_NUM; breakno++) {
 		struct perf_event *bp;
 		struct arch_hw_breakpoint *info;
 		int val;
@@ -290,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
 {
 	int i;
 
-	for (i = 0; i < 4; i++)
+	for (i = 0; i < HBP_NUM; i++)
 		if (breakinfo[i].addr == addr && breakinfo[i].enabled)
 			break;
-	if (i == 4)
+	if (i == HBP_NUM)
 		return -1;
 
 	if (hw_break_release_slot(i)) {
@@ -311,7 +311,7 @@ static void kgdb_remove_all_hw_break(void)
 	int cpu = raw_smp_processor_id();
 	struct perf_event *bp;
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < HBP_NUM; i++) {
 		if (!breakinfo[i].enabled)
 			continue;
 		bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -331,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
 {
 	int i;
 
-	for (i = 0; i < 4; i++)
+	for (i = 0; i < HBP_NUM; i++)
 		if (!breakinfo[i].enabled)
 			break;
-	if (i == 4)
+	if (i == HBP_NUM)
 		return -1;
 
 	switch (bptype) {
@@ -395,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
 
 	/* Disable hardware debugging while we are in kgdb: */
 	set_debugreg(0UL, 7);
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < HBP_NUM; i++) {
 		if (!breakinfo[i].enabled)
 			continue;
 		if (dbg_is_early) {
@@ -640,7 +640,7 @@ void kgdb_arch_late(void)
 	attr.bp_len = HW_BREAKPOINT_LEN_1;
 	attr.bp_type = HW_BREAKPOINT_W;
 	attr.disabled = 1;
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < HBP_NUM; i++) {
 		if (breakinfo[i].pev)
 			continue;
 		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
-- 
cgit v1.2.3-70-g09d2


From 5989cd6a1cbf86587edcc856791f960978087311 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 4 Aug 2010 13:30:27 -0700
Subject: x86, apic: Map the local apic when parsing the MP table.

This fixes a regression in 2.6.35 from 2.6.34, that is
present for select models of Intel cpus when people are
using an MP table.

The commit cf7500c0ea133d66f8449d86392d83f840102632
"x86, ioapic: In mpparse use mp_register_ioapic" started
calling mp_register_ioapic from MP_ioapic_info.  An extremely
simple change that was obviously correct.  Unfortunately
mp_register_ioapic did just a little more than the previous
hand crafted code and so we gained this call path.

The problem call path is:
MP_ioapic_info()
  mp_register_ioapic()
   io_apic_unique_id()
     io_apic_get_unique_id()
       get_physical_broadcast()
         modern_apic()
           lapic_get_version()
             apic_read(APIC_LVR)

Which turned out to be a problem because the local apic
was not mapped, at that point, unlike the similar point
in the ACPI parsing code.

This problem is fixed by mapping the local apic when
parsing the mptable as soon as we reasonably can.

Looking at the number of places we set up the fixmap for
the local apic, I see some serious simplification opportunities.
For the moment, apart from no longer duplicating the setting up of
the fixmap in init_apic_mappings, I have not acted on them.

The regression from 2.6.34 is tracked in bug
https://bugzilla.kernel.org/show_bug.cgi?id=16173

Cc: <stable@kernel.org> 2.6.35
Reported-by: David Hill <hilld@binarystorm.net>
Reported-by: Tvrtko Ursulin <tvrtko.ursulin@sophos.com>
Tested-by: Tvrtko Ursulin <tvrtko.ursulin@sophos.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <m1eiee86jg.fsf_-_@fess.ebiederm.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/apic.c |  2 +-
 arch/x86/kernel/mpparse.c   | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a96489ee6ca..c07e51391a3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void)
 		 * acpi lapic path already maps that address in
 		 * acpi_register_lapic_address()
 		 */
-		if (!acpi_lapic)
+		if (!acpi_lapic && !smp_found_config)
 			set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
 
 		apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d86dbf7e54b..d7b6f7fb4fe 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
 
 void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
 
+static void __init smp_register_lapic_address(unsigned long address)
+{
+	mp_lapic_addr = address;
+
+	set_fixmap_nocache(FIX_APIC_BASE, address);
+	if (boot_cpu_physical_apicid == -1U) {
+		boot_cpu_physical_apicid  = read_apic_id();
+		apic_version[boot_cpu_physical_apicid] =
+			 GET_APIC_VERSION(apic_read(APIC_LVR));
+	}
+}
+
 static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 {
 	char str[16];
@@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 	if (early)
 		return 1;
 
+	/* Initialize the lapic mapping */
+	if (!acpi_lapic)
+		smp_register_lapic_address(mpc->lapic);
+
 	if (mpc->oemptr)
 		x86_init.mpparse.smp_read_mpc_oem(mpc);
 
-- 
cgit v1.2.3-70-g09d2


From ad4ecef2f13c790f95b55320f2925c205d8f971f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 2 Aug 2010 15:48:23 +0800
Subject: ACPI, APEI, Rename CPER and GHES severity constants

The abbreviation of severity should be SEV instead of SER, so the CPER
severity constants are renamed accordingly. GHES severity constants
are renamed in the same way too.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c |  4 ++--
 drivers/acpi/apei/ghes.c              | 32 ++++++++++++++++----------------
 include/linux/cper.h                  |  8 ++++----
 3 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 745b54f9be8..8209472b27a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -80,7 +80,7 @@ int apei_write_mce(struct mce *m)
 	rcd.hdr.revision = CPER_RECORD_REV;
 	rcd.hdr.signature_end = CPER_SIG_END;
 	rcd.hdr.section_count = 1;
-	rcd.hdr.error_severity = CPER_SER_FATAL;
+	rcd.hdr.error_severity = CPER_SEV_FATAL;
 	/* timestamp, platform_id, partition_id are all invalid */
 	rcd.hdr.validation_bits = 0;
 	rcd.hdr.record_length = sizeof(rcd);
@@ -96,7 +96,7 @@ int apei_write_mce(struct mce *m)
 	rcd.sec_hdr.validation_bits = 0;
 	rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
 	rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
-	rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+	rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
 
 	memcpy(&rcd.mce, m, sizeof(*m));
 
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index fd0cc016a09..c76aff6c861 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -132,26 +132,26 @@ static void ghes_fini(struct ghes *ghes)
 }
 
 enum {
-	GHES_SER_NO = 0x0,
-	GHES_SER_CORRECTED = 0x1,
-	GHES_SER_RECOVERABLE = 0x2,
-	GHES_SER_PANIC = 0x3,
+	GHES_SEV_NO = 0x0,
+	GHES_SEV_CORRECTED = 0x1,
+	GHES_SEV_RECOVERABLE = 0x2,
+	GHES_SEV_PANIC = 0x3,
 };
 
 static inline int ghes_severity(int severity)
 {
 	switch (severity) {
-	case CPER_SER_INFORMATIONAL:
-		return GHES_SER_NO;
-	case CPER_SER_CORRECTED:
-		return GHES_SER_CORRECTED;
-	case CPER_SER_RECOVERABLE:
-		return GHES_SER_RECOVERABLE;
-	case CPER_SER_FATAL:
-		return GHES_SER_PANIC;
+	case CPER_SEV_INFORMATIONAL:
+		return GHES_SEV_NO;
+	case CPER_SEV_CORRECTED:
+		return GHES_SEV_CORRECTED;
+	case CPER_SEV_RECOVERABLE:
+		return GHES_SEV_RECOVERABLE;
+	case CPER_SEV_FATAL:
+		return GHES_SEV_PANIC;
 	default:
 		/* Unkown, go panic */
-		return GHES_SER_PANIC;
+		return GHES_SEV_PANIC;
 	}
 }
 
@@ -237,16 +237,16 @@ static void ghes_clear_estatus(struct ghes *ghes)
 
 static void ghes_do_proc(struct ghes *ghes)
 {
-	int ser, processed = 0;
+	int sev, processed = 0;
 	struct acpi_hest_generic_data *gdata;
 
-	ser = ghes_severity(ghes->estatus->error_severity);
+	sev = ghes_severity(ghes->estatus->error_severity);
 	apei_estatus_for_each_section(ghes->estatus, gdata) {
 #ifdef CONFIG_X86_MCE
 		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
 				 CPER_SEC_PLATFORM_MEM)) {
 			apei_mce_report_mem_error(
-				ser == GHES_SER_CORRECTED,
+				sev == GHES_SEV_CORRECTED,
 				(struct cper_sec_mem_err *)(gdata+1));
 			processed = 1;
 		}
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 4b38f905b70..bf972f81e2a 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -39,10 +39,10 @@
  * Severity difinition for error_severity in struct cper_record_header
  * and section_severity in struct cper_section_descriptor
  */
-#define CPER_SER_RECOVERABLE			0x0
-#define CPER_SER_FATAL				0x1
-#define CPER_SER_CORRECTED			0x2
-#define CPER_SER_INFORMATIONAL			0x3
+#define CPER_SEV_RECOVERABLE			0x0
+#define CPER_SEV_FATAL				0x1
+#define CPER_SEV_CORRECTED			0x2
+#define CPER_SEV_INFORMATIONAL			0x3
 
 /*
  * Validation bits difinition for validation_bits in struct
-- 
cgit v1.2.3-70-g09d2


From 1c250d709fdc8aa5bf42d90be99428a01a256a55 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 5 Aug 2010 19:09:17 +0400
Subject: perf, x86: P4 PMU -- update nmi irq statistics and unmask lvt entry
 properly

If the last active performance counter has not overflowed at the
moment an NMI is triggered by another counter, the irq statistics
may miss an update. As a more serious consequence, the apic quirk
may not be triggered, so the apic lvt entry stays masked.

Tested-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100805150917.GA6311@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 107711bf0ee..febb12cea79 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -656,6 +656,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		int overflow;
 
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -666,12 +667,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 		WARN_ON_ONCE(hwc->idx != idx);
 
 		/* it might be unflagged overflow */
-		handled = p4_pmu_clear_cccr_ovf(hwc);
+		overflow = p4_pmu_clear_cccr_ovf(hwc);
 
 		val = x86_perf_event_update(event);
-		if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+		if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
 			continue;
 
+		handled += overflow;
+
 		/* event overflow for sure */
 		data.period = event->hw.last_period;
 
@@ -687,7 +690,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 		inc_irq_stat(apic_perf_irqs);
 	}
 
-	return handled;
+	return handled > 0;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From d7a7c573936a86474c4a5090a45a4bc6e680c117 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 9 Aug 2010 17:20:33 -0700
Subject: x86, ia64, smp: use workqueues unconditionally during do_boot_cpu()

Workqueues are now initialized as part of the early_initcall(), so they
are available for use during the cold boot process as well.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/smpboot.c | 15 ++++++---------
 arch/x86/kernel/smpboot.c  |  8 ++------
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 1d85d8cfaa7..d003b502a43 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -508,21 +508,18 @@ do_boot_cpu (int sapicid, int cpu)
 		.done	= COMPLETION_INITIALIZER(c_idle.done),
 	};
 
+	/*
+	 * We can't use kernel_thread since we must avoid to
+	 * reschedule the child.
+	 */
  	c_idle.idle = get_idle_for_cpu(cpu);
  	if (c_idle.idle) {
 		init_idle(c_idle.idle, cpu);
  		goto do_rest;
 	}
 
-	/*
-	 * We can't use kernel_thread since we must avoid to reschedule the child.
-	 */
-	if (!keventd_up())
-		c_idle.work.func(&c_idle.work);
-	else {
-		schedule_work(&c_idle.work);
-		wait_for_completion(&c_idle.done);
-	}
+	schedule_work(&c_idle.work);
+	wait_for_completion(&c_idle.done);
 
 	if (IS_ERR(c_idle.idle))
 		panic("failed fork for CPU %d", cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 51620953b18..a5e928b0cb5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 		goto do_rest;
 	}
 
-	if (!keventd_up())
-		c_idle.work.func(&c_idle.work);
-	else {
-		schedule_work(&c_idle.work);
-		wait_for_completion(&c_idle.done);
-	}
+	schedule_work(&c_idle.work);
+	wait_for_completion(&c_idle.done);
 
 	if (IS_ERR(c_idle.idle)) {
 		printk("failed fork for CPU %d\n", cpu);
-- 
cgit v1.2.3-70-g09d2


From 8cbd84f2dd4e52a8771b191030c374ba3e56d291 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 10 Aug 2010 15:35:10 -0700
Subject: x86: fix up system call numbering nit

As pointed out by Jiri Slaby: when I resolved the 32-bit x86 system
call entry tables for prlimit (due to the conflict with fanotify), I
forgot to add the numbering in comments that we do for every fifth entry.

Reported-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/ia32/ia32entry.S          | 2 +-
 arch/x86/kernel/syscall_table_32.S | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 91dc4bb1303..b86feabed69 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -844,5 +844,5 @@ ia32_sys_call_table:
 	.quad compat_sys_recvmmsg
 	.quad sys_fanotify_init
 	.quad sys32_fanotify_mark
-	.quad sys_prlimit64
+	.quad sys_prlimit64		/* 340 */
 ia32_syscall_end:
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 4802accb9d8..b35786dc9b8 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -339,4 +339,4 @@ ENTRY(sys_call_table)
 	.long sys_recvmmsg
 	.long sys_fanotify_init
 	.long sys_fanotify_mark
-	.long sys_prlimit64
+	.long sys_prlimit64		/* 340 */
-- 
cgit v1.2.3-70-g09d2


From a3da323420d5aa6f7bd15efc7bf34cd6d19e1f1a Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sun, 8 Aug 2010 02:37:23 +0900
Subject: [CPUFREQ] add missing __percpu markup in pcc-cpufreq.c

pcc_cpu_info is a percpu pointer but was missing __percpu markup.
Add it.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index a36de5bbb62..994230d4dc4 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
 	u32 output_offset;
 };
 
-static struct pcc_cpu *pcc_cpu_info;
+static struct pcc_cpu __percpu *pcc_cpu_info;
 
 static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
 {
-- 
cgit v1.2.3-70-g09d2


From 4936a3b90d79dd8775c6ac23c2cf2dcebe29abde Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 9 Aug 2010 14:20:10 -0700
Subject: x86/hpet: Use the FSEC_PER_SEC constant for femto-second periods

The current computation, introduced with f12a15be63, of FSEC_PER_SEC using
the multiplication of (FSEC_PER_NSEC * NSEC_PER_SEC) is performed only
with 32-bit integers on small machines, resulting in an overflow and
*very* short intervals being programmed.  An interrupt storm follows.

Note that we also have to specify FSEC_PER_SEC as being long long to
overcome the same limitations.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/hpet.c | 4 ++--
 include/linux/time.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 33dbcc4ec5f..351f9c0fea1 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -582,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 	 * scaled math multiplication factor for nanosecond to hpet tick
 	 * conversion.
 	 */
-	hpet_freq = 1000000000000000ULL;
+	hpet_freq = FSEC_PER_SEC;
 	do_div(hpet_freq, hpet_period);
 	evt->mult = div_sc((unsigned long) hpet_freq,
 				      NSEC_PER_SEC, evt->shift);
@@ -837,7 +837,7 @@ static int hpet_clocksource_register(void)
 	 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
 	 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
 	 */
-	hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC;
+	hpet_freq = FSEC_PER_SEC;
 	do_div(hpet_freq, hpet_period);
 	clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
 
diff --git a/include/linux/time.h b/include/linux/time.h
index cb34e35faba..12612701b1a 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -38,7 +38,7 @@ extern struct timezone sys_tz;
 #define NSEC_PER_MSEC	1000000L
 #define USEC_PER_SEC	1000000L
 #define NSEC_PER_SEC	1000000000L
-#define FSEC_PER_SEC	1000000000000000L
+#define FSEC_PER_SEC	1000000000000000LL
 
 #define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
 
-- 
cgit v1.2.3-70-g09d2


From 3f6c4df7e1f05f2fdb3f20cac9c03f595a72b45d Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Fri, 13 Aug 2010 23:00:11 +0900
Subject: [CPUFREQ] acpi-cpufreq: add missing __percpu markup

acpi_perf_data is a percpu pointer but was missing __percpu markup.
Add it.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 246cd3afbb5..cd8da247dda 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -72,7 +72,7 @@ struct acpi_cpufreq_data {
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
 
 /* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
+static struct acpi_processor_performance __percpu *acpi_perf_data;
 
 static struct cpufreq_driver acpi_cpufreq_driver;
 
-- 
cgit v1.2.3-70-g09d2


From c7887325230aec47d47a32562a6e26014a0fafca Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 11 Aug 2010 11:26:22 +0100
Subject: Mark arguments to certain syscalls as being const

Mark arguments to certain system calls as being const where they should be but
aren't.  The list includes:

 (*) The filename arguments of various stat syscalls, execve(), various utimes
     syscalls and some mount syscalls.

 (*) The filename arguments of some syscall helpers relating to the above.

 (*) The buffer argument of various write syscalls.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/osf_sys.c             |  6 +++---
 arch/alpha/kernel/process.c             |  2 +-
 arch/arm/kernel/sys_arm.c               |  4 ++--
 arch/arm/kernel/sys_oabi-compat.c       |  6 +++---
 arch/avr32/include/asm/syscalls.h       |  2 +-
 arch/avr32/kernel/process.c             |  3 ++-
 arch/blackfin/kernel/process.c          |  2 +-
 arch/frv/kernel/process.c               |  3 ++-
 arch/h8300/kernel/process.c             |  2 +-
 arch/ia64/include/asm/unistd.h          |  2 +-
 arch/ia64/kernel/process.c              |  2 +-
 arch/m32r/kernel/process.c              |  3 ++-
 arch/m68k/kernel/process.c              |  2 +-
 arch/m68knommu/kernel/process.c         |  2 +-
 arch/microblaze/kernel/sys_microblaze.c |  2 +-
 arch/mips/kernel/syscall.c              |  2 +-
 arch/mn10300/kernel/process.c           |  2 +-
 arch/parisc/hpux/fs.c                   |  7 ++++---
 arch/powerpc/kernel/process.c           |  2 +-
 arch/powerpc/kernel/sys_ppc32.c         |  2 +-
 arch/s390/kernel/compat_linux.c         | 10 +++++-----
 arch/s390/kernel/compat_linux.h         | 10 +++++-----
 arch/s390/kernel/entry.h                |  2 +-
 arch/s390/kernel/process.c              |  2 +-
 arch/sh/include/asm/syscalls_32.h       |  2 +-
 arch/sh/include/asm/syscalls_64.h       |  2 +-
 arch/sh/kernel/process_64.c             |  2 +-
 arch/sparc/kernel/sys_sparc32.c         |  7 ++++---
 arch/um/kernel/exec.c                   |  6 +++---
 arch/um/kernel/internal.h               |  2 +-
 arch/um/kernel/syscall.c                |  2 +-
 arch/x86/ia32/sys_ia32.c                | 14 +++++++-------
 arch/x86/include/asm/sys_ia32.h         | 12 ++++++------
 arch/x86/include/asm/syscalls.h         |  2 +-
 arch/x86/kernel/entry_64.S              |  4 ++--
 arch/x86/kernel/process.c               |  2 +-
 arch/xtensa/kernel/process.c            |  2 +-
 fs/compat.c                             | 23 +++++++++++++----------
 fs/stat.c                               | 29 ++++++++++++++++++-----------
 fs/utimes.c                             |  7 ++++---
 include/linux/compat.h                  |  6 +++---
 include/linux/fs.h                      |  6 +++---
 include/linux/syscalls.h                | 20 ++++++++++----------
 include/linux/time.h                    |  2 +-
 44 files changed, 125 insertions(+), 109 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 88131c6e42e..fb58150a7e8 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -244,7 +244,7 @@ do_osf_statfs(struct path *path, struct osf_statfs __user *buffer,
 	return error;	
 }
 
-SYSCALL_DEFINE3(osf_statfs, char __user *, pathname,
+SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
 		struct osf_statfs __user *, buffer, unsigned long, bufsiz)
 {
 	struct path path;
@@ -358,7 +358,7 @@ osf_procfs_mount(char *dirname, struct procfs_args __user *args, int flags)
 	return do_mount("", dirname, "proc", flags, NULL);
 }
 
-SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path,
+SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path,
 		int, flag, void __user *, data)
 {
 	int retval;
@@ -932,7 +932,7 @@ SYSCALL_DEFINE3(osf_setitimer, int, which, struct itimerval32 __user *, in,
 
 }
 
-SYSCALL_DEFINE2(osf_utimes, char __user *, filename,
+SYSCALL_DEFINE2(osf_utimes, const char __user *, filename,
 		struct timeval32 __user *, tvs)
 {
 	struct timespec tv[2];
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 395a464353b..88e608aebc8 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -387,7 +387,7 @@ EXPORT_SYMBOL(dump_elf_task_fp);
  * sys_execve() executes a new program.
  */
 asmlinkage int
-do_sys_execve(char __user *ufilename, char __user * __user *argv,
+do_sys_execve(const char __user *ufilename, char __user * __user *argv,
 	      char __user * __user *envp, struct pt_regs *regs)
 {
 	int error;
diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c
index c23501842b9..5b7c541a4c6 100644
--- a/arch/arm/kernel/sys_arm.c
+++ b/arch/arm/kernel/sys_arm.c
@@ -62,7 +62,7 @@ asmlinkage int sys_vfork(struct pt_regs *regs)
 /* sys_execve() executes a new program.
  * This is called indirectly via a small wrapper
  */
-asmlinkage int sys_execve(char __user *filenamei, char __user * __user *argv,
+asmlinkage int sys_execve(const char __user *filenamei, char __user * __user *argv,
 			  char __user * __user *envp, struct pt_regs *regs)
 {
 	int error;
@@ -84,7 +84,7 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
 	int ret;
 
 	memset(&regs, 0, sizeof(struct pt_regs));
-	ret = do_execve((char *)filename, (char __user * __user *)argv,
+	ret = do_execve(filename, (char __user * __user *)argv,
 			(char __user * __user *)envp, &regs);
 	if (ret < 0)
 		goto out;
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 33ff678e32f..4ad8da15ef2 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -141,7 +141,7 @@ static long cp_oldabi_stat64(struct kstat *stat,
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_oabi_stat64(char __user * filename,
+asmlinkage long sys_oabi_stat64(const char __user * filename,
 				struct oldabi_stat64 __user * statbuf)
 {
 	struct kstat stat;
@@ -151,7 +151,7 @@ asmlinkage long sys_oabi_stat64(char __user * filename,
 	return error;
 }
 
-asmlinkage long sys_oabi_lstat64(char __user * filename,
+asmlinkage long sys_oabi_lstat64(const char __user * filename,
 				 struct oldabi_stat64 __user * statbuf)
 {
 	struct kstat stat;
@@ -172,7 +172,7 @@ asmlinkage long sys_oabi_fstat64(unsigned long fd,
 }
 
 asmlinkage long sys_oabi_fstatat64(int dfd,
-				   char __user *filename,
+				   const char __user *filename,
 				   struct oldabi_stat64  __user *statbuf,
 				   int flag)
 {
diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h
index 66a19726663..ab608b70b24 100644
--- a/arch/avr32/include/asm/syscalls.h
+++ b/arch/avr32/include/asm/syscalls.h
@@ -21,7 +21,7 @@ asmlinkage int sys_clone(unsigned long, unsigned long,
 			 unsigned long, unsigned long,
 			 struct pt_regs *);
 asmlinkage int sys_vfork(struct pt_regs *);
-asmlinkage int sys_execve(char __user *, char __user *__user *,
+asmlinkage int sys_execve(const char __user *, char __user *__user *,
 			  char __user *__user *, struct pt_regs *);
 
 /* kernel/signal.c */
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 2d76515745a..e5daddff397 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -383,7 +383,8 @@ asmlinkage int sys_vfork(struct pt_regs *regs)
 		       0, NULL, NULL);
 }
 
-asmlinkage int sys_execve(char __user *ufilename, char __user *__user *uargv,
+asmlinkage int sys_execve(const char __user *ufilename,
+			  char __user *__user *uargv,
 			  char __user *__user *uenvp, struct pt_regs *regs)
 {
 	int error;
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 93ec07da2e5..a566f61c002 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -209,7 +209,7 @@ copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp)
 {
 	int error;
 	char *filename;
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 21d0fd19276..428931cf2f0 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -250,7 +250,8 @@ int copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name, char __user * __user *argv,
+			  char __user * __user *envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 8c8b0ffa6ad..8b7b78d77d5 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -212,7 +212,7 @@ int copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char *name, char **argv, char **envp,int dummy,...)
+asmlinkage int sys_execve(const char *name, char **argv, char **envp,int dummy,...)
 {
 	int error;
 	char * filename;
diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h
index bb8b0fff32b..46f36fc5125 100644
--- a/arch/ia64/include/asm/unistd.h
+++ b/arch/ia64/include/asm/unistd.h
@@ -353,7 +353,7 @@ asmlinkage unsigned long sys_mmap2(
 				int fd, long pgoff);
 struct pt_regs;
 struct sigaction;
-long sys_execve(char __user *filename, char __user * __user *argv,
+long sys_execve(const char __user *filename, char __user * __user *argv,
 			   char __user * __user *envp, struct pt_regs *regs);
 asmlinkage long sys_ia64_pipe(void);
 asmlinkage long sys_rt_sigaction(int sig,
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 53f1648c8b8..a879c03b7f1 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -633,7 +633,7 @@ dump_fpu (struct pt_regs *pt, elf_fpregset_t dst)
 }
 
 long
-sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp,
+sys_execve (const char __user *filename, char __user * __user *argv, char __user * __user *envp,
 	    struct pt_regs *regs)
 {
 	char *fname;
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index bc8c8c1511b..8665a4d868e 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -288,7 +288,8 @@ asmlinkage int sys_vfork(unsigned long r0, unsigned long r1, unsigned long r2,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv,
+asmlinkage int sys_execve(const char __user *ufilename,
+			  char __user * __user *uargv,
 			  char __user * __user *uenvp,
 			  unsigned long r3, unsigned long r4, unsigned long r5,
 			  unsigned long r6, struct pt_regs regs)
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 1a6be27cf16..221d0b71ce3 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -315,7 +315,7 @@ EXPORT_SYMBOL(dump_fpu);
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index 6aa66134b43..6350f68cd02 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -350,7 +350,7 @@ void dump(struct pt_regs *fp)
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char *name, char **argv, char **envp)
+asmlinkage int sys_execve(const char *name, char **argv, char **envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c
index f4e00b7f125..6abab6ebedb 100644
--- a/arch/microblaze/kernel/sys_microblaze.c
+++ b/arch/microblaze/kernel/sys_microblaze.c
@@ -47,7 +47,7 @@ asmlinkage long microblaze_clone(int flags, unsigned long stack, struct pt_regs
 	return do_fork(flags, stack, regs, 0, NULL, NULL);
 }
 
-asmlinkage long microblaze_execve(char __user *filenamei, char __user *__user *argv,
+asmlinkage long microblaze_execve(const char __user *filenamei, char __user *__user *argv,
 			char __user *__user *envp, struct pt_regs *regs)
 {
 	int error;
diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index 58bab2ef257..bddce0bca19 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -254,7 +254,7 @@ asmlinkage int sys_execve(nabi_no_regargs struct pt_regs regs)
 	int error;
 	char * filename;
 
-	filename = getname((char __user *) (long)regs.regs[4]);
+	filename = getname((const char __user *) (long)regs.regs[4]);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 82b817c7f7b..762eb325b94 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -268,7 +268,7 @@ asmlinkage long sys_vfork(void)
 		       0, NULL, NULL);
 }
 
-asmlinkage long sys_execve(char __user *name,
+asmlinkage long sys_execve(const char __user *name,
 			   char __user * __user *argv,
 			   char __user * __user *envp)
 {
diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c
index 6935123178e..1444875a761 100644
--- a/arch/parisc/hpux/fs.c
+++ b/arch/parisc/hpux/fs.c
@@ -36,7 +36,7 @@ int hpux_execve(struct pt_regs *regs)
 	int error;
 	char *filename;
 
-	filename = getname((char __user *) regs->gr[26]);
+	filename = getname((const char __user *) regs->gr[26]);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
@@ -169,7 +169,7 @@ static int cp_hpux_stat(struct kstat *stat, struct hpux_stat64 __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-long hpux_stat64(char __user *filename, struct hpux_stat64 __user *statbuf)
+long hpux_stat64(const char __user *filename, struct hpux_stat64 __user *statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -191,7 +191,8 @@ long hpux_fstat64(unsigned int fd, struct hpux_stat64 __user *statbuf)
 	return error;
 }
 
-long hpux_lstat64(char __user *filename, struct hpux_stat64 __user *statbuf)
+long hpux_lstat64(const char __user *filename,
+		  struct hpux_stat64 __user *statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat(filename, &stat);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e78a5add7f1..feacfb78968 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1027,7 +1027,7 @@ int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
 	int error;
 	char *filename;
 
-	filename = getname((char __user *) a0);
+	filename = getname((const char __user *) a0);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index 19471a1cef1..20fd701a686 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -546,7 +546,7 @@ compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_siz
 	return sys_pread64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
 }
 
-compat_ssize_t compat_sys_pwrite64(unsigned int fd, char __user *ubuf, compat_size_t count,
+compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count,
 			      u32 reg6, u32 poshi, u32 poslo)
 {
 	return sys_pwrite64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 73b624ed9cd..1e6449c79ab 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -436,7 +436,7 @@ sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
  * sys32_execve() executes a new program after the asm stub has set
  * things up for us.  This should basically do what I want it to.
  */
-asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
 			     compat_uptr_t __user *envp)
 {
 	struct pt_regs *regs = task_pt_regs(current);
@@ -570,7 +570,7 @@ static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat)
 	return copy_to_user(ubuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 
 }
 
-asmlinkage long sys32_stat64(char __user * filename, struct stat64_emu31 __user * statbuf)
+asmlinkage long sys32_stat64(const char __user * filename, struct stat64_emu31 __user * statbuf)
 {
 	struct kstat stat;
 	int ret = vfs_stat(filename, &stat);
@@ -579,7 +579,7 @@ asmlinkage long sys32_stat64(char __user * filename, struct stat64_emu31 __user
 	return ret;
 }
 
-asmlinkage long sys32_lstat64(char __user * filename, struct stat64_emu31 __user * statbuf)
+asmlinkage long sys32_lstat64(const char __user * filename, struct stat64_emu31 __user * statbuf)
 {
 	struct kstat stat;
 	int ret = vfs_lstat(filename, &stat);
@@ -597,7 +597,7 @@ asmlinkage long sys32_fstat64(unsigned long fd, struct stat64_emu31 __user * sta
 	return ret;
 }
 
-asmlinkage long sys32_fstatat64(unsigned int dfd, char __user *filename,
+asmlinkage long sys32_fstatat64(unsigned int dfd, const char __user *filename,
 				struct stat64_emu31 __user* statbuf, int flag)
 {
 	struct kstat stat;
@@ -655,7 +655,7 @@ asmlinkage long sys32_read(unsigned int fd, char __user * buf, size_t count)
 	return sys_read(fd, buf, count);
 }
 
-asmlinkage long sys32_write(unsigned int fd, char __user * buf, size_t count)
+asmlinkage long sys32_write(unsigned int fd, const char __user * buf, size_t count)
 {
 	if ((compat_ssize_t) count < 0)
 		return -EINVAL; 
diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h
index cb97afc85c9..9635d759c2b 100644
--- a/arch/s390/kernel/compat_linux.h
+++ b/arch/s390/kernel/compat_linux.h
@@ -193,7 +193,7 @@ long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
 			  compat_sigset_t __user *oset, size_t sigsetsize);
 long sys32_rt_sigpending(compat_sigset_t __user *set, size_t sigsetsize);
 long sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo);
-long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
 		  compat_uptr_t __user *envp);
 long sys32_init_module(void __user *umod, unsigned long len,
 		       const char __user *uargs);
@@ -207,16 +207,16 @@ long sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset,
 		    size_t count);
 long sys32_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset,
 		      s32 count);
-long sys32_stat64(char __user * filename, struct stat64_emu31 __user * statbuf);
-long sys32_lstat64(char __user * filename,
+long sys32_stat64(const char __user * filename, struct stat64_emu31 __user * statbuf);
+long sys32_lstat64(const char __user * filename,
 		   struct stat64_emu31 __user * statbuf);
 long sys32_fstat64(unsigned long fd, struct stat64_emu31 __user * statbuf);
-long sys32_fstatat64(unsigned int dfd, char __user *filename,
+long sys32_fstatat64(unsigned int dfd, const char __user *filename,
 		     struct stat64_emu31 __user* statbuf, int flag);
 unsigned long old32_mmap(struct mmap_arg_struct_emu31 __user *arg);
 long sys32_mmap2(struct mmap_arg_struct_emu31 __user *arg);
 long sys32_read(unsigned int fd, char __user * buf, size_t count);
-long sys32_write(unsigned int fd, char __user * buf, size_t count);
+long sys32_write(unsigned int fd, const char __user * buf, size_t count);
 long sys32_fadvise64(int fd, loff_t offset, size_t len, int advise);
 long sys32_fadvise64_64(struct fadvise64_64_args __user *args);
 long sys32_sigaction(int sig, const struct old_sigaction32 __user *act,
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 5bb1e6b5db2..403fb430a89 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -42,7 +42,7 @@ long sys_clone(unsigned long newsp, unsigned long clone_flags,
 	       int __user *parent_tidptr, int __user *child_tidptr);
 long sys_vfork(void);
 void execve_tail(void);
-long sys_execve(char __user *name, char __user * __user *argv,
+long sys_execve(const char __user *name, char __user * __user *argv,
 		char __user * __user *envp);
 long sys_sigsuspend(int history0, int history1, old_sigset_t mask);
 long sys_sigaction(int sig, const struct old_sigaction __user *act,
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 1039fdea15b..7eafaf2662b 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -267,7 +267,7 @@ asmlinkage void execve_tail(void)
 /*
  * sys_execve() executes a new program.
  */
-SYSCALL_DEFINE3(execve, char __user *, name, char __user * __user *, argv,
+SYSCALL_DEFINE3(execve, const char __user *, name, char __user * __user *, argv,
 		char __user * __user *, envp)
 {
 	struct pt_regs *regs = task_pt_regs(current);
diff --git a/arch/sh/include/asm/syscalls_32.h b/arch/sh/include/asm/syscalls_32.h
index 8b30200305c..be201fdc97a 100644
--- a/arch/sh/include/asm/syscalls_32.h
+++ b/arch/sh/include/asm/syscalls_32.h
@@ -19,7 +19,7 @@ asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
 asmlinkage int sys_vfork(unsigned long r4, unsigned long r5,
 			 unsigned long r6, unsigned long r7,
 			 struct pt_regs __regs);
-asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv,
+asmlinkage int sys_execve(const char __user *ufilename, char __user * __user *uargv,
 			  char __user * __user *uenvp, unsigned long r7,
 			  struct pt_regs __regs);
 asmlinkage int sys_sigsuspend(old_sigset_t mask, unsigned long r5,
diff --git a/arch/sh/include/asm/syscalls_64.h b/arch/sh/include/asm/syscalls_64.h
index 751fd881136..ee519f41d95 100644
--- a/arch/sh/include/asm/syscalls_64.h
+++ b/arch/sh/include/asm/syscalls_64.h
@@ -21,7 +21,7 @@ asmlinkage int sys_vfork(unsigned long r2, unsigned long r3,
 			 unsigned long r4, unsigned long r5,
 			 unsigned long r6, unsigned long r7,
 			 struct pt_regs *pregs);
-asmlinkage int sys_execve(char *ufilename, char **uargv,
+asmlinkage int sys_execve(const char *ufilename, char **uargv,
 			  char **uenvp, unsigned long r5,
 			  unsigned long r6, unsigned long r7,
 			  struct pt_regs *pregs);
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index d4ca6480e35..68d128d651b 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -483,7 +483,7 @@ asmlinkage int sys_vfork(unsigned long r2, unsigned long r3,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char *ufilename, char **uargv,
+asmlinkage int sys_execve(const char *ufilename, char **uargv,
 			  char **uenvp, unsigned long r5,
 			  unsigned long r6, unsigned long r7,
 			  struct pt_regs *pregs)
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index c0ca87553e1..e6375a750d9 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -162,7 +162,7 @@ static int cp_compat_stat64(struct kstat *stat,
 	return err;
 }
 
-asmlinkage long compat_sys_stat64(char __user * filename,
+asmlinkage long compat_sys_stat64(const char __user * filename,
 		struct compat_stat64 __user *statbuf)
 {
 	struct kstat stat;
@@ -173,7 +173,7 @@ asmlinkage long compat_sys_stat64(char __user * filename,
 	return error;
 }
 
-asmlinkage long compat_sys_lstat64(char __user * filename,
+asmlinkage long compat_sys_lstat64(const char __user * filename,
 		struct compat_stat64 __user *statbuf)
 {
 	struct kstat stat;
@@ -195,7 +195,8 @@ asmlinkage long compat_sys_fstat64(unsigned int fd,
 	return error;
 }
 
-asmlinkage long compat_sys_fstatat64(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_fstatat64(unsigned int dfd,
+		const char __user *filename,
 		struct compat_stat64 __user * statbuf, int flag)
 {
 	struct kstat stat;
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 97974c1bdd1..59b20d93b6d 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -44,7 +44,7 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
 	PT_REGS_SP(regs) = esp;
 }
 
-static long execve1(char *file, char __user * __user *argv,
+static long execve1(const char *file, char __user * __user *argv,
 		    char __user *__user *env)
 {
 	long error;
@@ -61,7 +61,7 @@ static long execve1(char *file, char __user * __user *argv,
 	return error;
 }
 
-long um_execve(char *file, char __user *__user *argv, char __user *__user *env)
+long um_execve(const char *file, char __user *__user *argv, char __user *__user *env)
 {
 	long err;
 
@@ -71,7 +71,7 @@ long um_execve(char *file, char __user *__user *argv, char __user *__user *env)
 	return err;
 }
 
-long sys_execve(char __user *file, char __user *__user *argv,
+long sys_execve(const char __user *file, char __user *__user *argv,
 		char __user *__user *env)
 {
 	long error;
diff --git a/arch/um/kernel/internal.h b/arch/um/kernel/internal.h
index 3bda43c7a78..1303a105fe9 100644
--- a/arch/um/kernel/internal.h
+++ b/arch/um/kernel/internal.h
@@ -1 +1 @@
-extern long um_execve(char *file, char __user *__user *argv, char __user *__user *env);
+extern long um_execve(const char *file, char __user *__user *argv, char __user *__user *env);
diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c
index 4393173923f..7427c0b1930 100644
--- a/arch/um/kernel/syscall.c
+++ b/arch/um/kernel/syscall.c
@@ -58,7 +58,7 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
-	ret = um_execve((char *)filename, (char __user *__user *)argv,
+	ret = um_execve(filename, (char __user *__user *)argv,
 			(char __user *__user *) envp);
 	set_fs(fs);
 
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 3d093311d5e..849813f398e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -51,7 +51,7 @@
 #define AA(__x)		((unsigned long)(__x))
 
 
-asmlinkage long sys32_truncate64(char __user *filename,
+asmlinkage long sys32_truncate64(const char __user *filename,
 				 unsigned long offset_low,
 				 unsigned long offset_high)
 {
@@ -96,7 +96,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
 	return 0;
 }
 
-asmlinkage long sys32_stat64(char __user *filename,
+asmlinkage long sys32_stat64(const char __user *filename,
 			     struct stat64 __user *statbuf)
 {
 	struct kstat stat;
@@ -107,7 +107,7 @@ asmlinkage long sys32_stat64(char __user *filename,
 	return ret;
 }
 
-asmlinkage long sys32_lstat64(char __user *filename,
+asmlinkage long sys32_lstat64(const char __user *filename,
 			      struct stat64 __user *statbuf)
 {
 	struct kstat stat;
@@ -126,7 +126,7 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
 	return ret;
 }
 
-asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
+asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename,
 			      struct stat64 __user *statbuf, int flag)
 {
 	struct kstat stat;
@@ -408,8 +408,8 @@ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
 			 ((loff_t)AA(poshi) << 32) | AA(poslo));
 }
 
-asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count,
-			     u32 poslo, u32 poshi)
+asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf,
+			     u32 count, u32 poslo, u32 poshi)
 {
 	return sys_pwrite64(fd, ubuf, count,
 			  ((loff_t)AA(poshi) << 32) | AA(poslo));
@@ -449,7 +449,7 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
 	return ret;
 }
 
-asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
 			     compat_uptr_t __user *envp, struct pt_regs *regs)
 {
 	long error;
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index cf4e2e381cb..cb238526a9f 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -18,13 +18,13 @@
 #include <asm/ia32.h>
 
 /* ia32/sys_ia32.c */
-asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
+asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long);
 asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
 
-asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
-asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *);
+asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *);
 asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
-asmlinkage long sys32_fstatat(unsigned int, char __user *,
+asmlinkage long sys32_fstatat(unsigned int, const char __user *,
 			      struct stat64 __user *, int);
 struct mmap_arg_struct32;
 asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
@@ -49,12 +49,12 @@ asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
 asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
 
 asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
-asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
+asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
 
 asmlinkage long sys32_personality(unsigned long);
 asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
 
-asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
+asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *,
 			     compat_uptr_t __user *, struct pt_regs *);
 asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
 
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 5c044b43e9a..feb2ff9bfc2 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -23,7 +23,7 @@ long sys_iopl(unsigned int, struct pt_regs *);
 /* kernel/process.c */
 int sys_fork(struct pt_regs *);
 int sys_vfork(struct pt_regs *);
-long sys_execve(char __user *, char __user * __user *,
+long sys_execve(const char __user *, char __user * __user *,
 		char __user * __user *, struct pt_regs *);
 long sys_clone(unsigned long, unsigned long, void __user *,
 	       void __user *, struct pt_regs *);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c5ea5cdbe7b..17be5ec7cbb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper)
  * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
  *
  * C extern interface:
- *	 extern long execve(char *name, char **argv, char **envp)
+ *	 extern long execve(const char *name, char **argv, char **envp)
  *
  * asm input arguments:
  *	rdi: name, rsi: argv, rdx: envp
  *
  * We want to fallback into:
- *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ *	extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
  *
  * do_sys_execve asm fallback arguments:
  *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d401f1d2d06..64ecaf0af9a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(kernel_thread);
 /*
  * sys_execve() executes a new program.
  */
-long sys_execve(char __user *name, char __user * __user *argv,
+long sys_execve(const char __user *name, char __user * __user *argv,
 		char __user * __user *envp, struct pt_regs *regs)
 {
 	long error;
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index f167e0f5e05..7c2f38f68eb 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -318,7 +318,7 @@ long xtensa_clone(unsigned long clone_flags, unsigned long newsp,
  */
 
 asmlinkage
-long xtensa_execve(char __user *name, char __user * __user *argv,
+long xtensa_execve(const char __user *name, char __user * __user *argv,
                    char __user * __user *envp,
                    long a3, long a4, long a5,
                    struct pt_regs *regs)
diff --git a/fs/compat.c b/fs/compat.c
index e6d5d70cf3c..718c7062aec 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -77,7 +77,8 @@ int compat_printk(const char *fmt, ...)
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
  */
-asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __user *t)
+asmlinkage long compat_sys_utime(const char __user *filename,
+				 struct compat_utimbuf __user *t)
 {
 	struct timespec tv[2];
 
@@ -91,7 +92,7 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags)
+asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
 {
 	struct timespec tv[2];
 
@@ -106,7 +107,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
 	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }
 
-asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
 {
 	struct timespec tv[2];
 
@@ -125,7 +126,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, st
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
 {
 	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
@@ -169,7 +170,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 	return err;
 }
 
-asmlinkage long compat_sys_newstat(char __user * filename,
+asmlinkage long compat_sys_newstat(const char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
@@ -181,7 +182,7 @@ asmlinkage long compat_sys_newstat(char __user * filename,
 	return cp_compat_stat(&stat, statbuf);
 }
 
-asmlinkage long compat_sys_newlstat(char __user * filename,
+asmlinkage long compat_sys_newlstat(const char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
@@ -194,7 +195,8 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
 }
 
 #ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd,
+		const char __user *filename,
 		struct compat_stat __user *statbuf, int flag)
 {
 	struct kstat stat;
@@ -837,9 +839,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
 #define NCPFS_NAME      "ncpfs"
 #define NFS4_NAME	"nfs4"
 
-asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
-				 char __user * type, unsigned long flags,
-				 void __user * data)
+asmlinkage long compat_sys_mount(const char __user * dev_name,
+				 const char __user * dir_name,
+				 const char __user * type, unsigned long flags,
+				 const void __user * data)
 {
 	char *kernel_type;
 	unsigned long data_page;
diff --git a/fs/stat.c b/fs/stat.c
index c4ecd52c573..12e90e21390 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -68,7 +68,8 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 }
 EXPORT_SYMBOL(vfs_fstat);
 
-int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
+int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
+		int flag)
 {
 	struct path path;
 	int error = -EINVAL;
@@ -91,13 +92,13 @@ out:
 }
 EXPORT_SYMBOL(vfs_fstatat);
 
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_stat(const char __user *name, struct kstat *stat)
 {
 	return vfs_fstatat(AT_FDCWD, name, stat, 0);
 }
 EXPORT_SYMBOL(vfs_stat);
 
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_lstat(const char __user *name, struct kstat *stat)
 {
 	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
 }
@@ -147,7 +148,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(stat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -159,7 +161,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *
 	return cp_old_stat(&stat, statbuf);
 }
 
-SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(lstat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -234,7 +237,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -244,7 +248,8 @@ SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 	return cp_new_stat(&stat, statbuf);
 }
 
-SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -257,7 +262,7 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
@@ -355,7 +360,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(stat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -366,7 +372,8 @@ SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf
 	return error;
 }
 
-SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(lstat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat(filename, &stat);
@@ -388,7 +395,7 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 	return error;
 }
 
-SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db5d37..179b5869065 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -126,7 +126,8 @@ out:
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
+long do_utimes(int dfd, const char __user *filename, struct timespec *times,
+	       int flags)
 {
 	int error = -EINVAL;
 
@@ -170,7 +171,7 @@ out:
 	return error;
 }
 
-SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
 		struct timespec __user *, utimes, int, flags)
 {
 	struct timespec tstimes[2];
@@ -188,7 +189,7 @@ SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
 	return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
 }
 
-SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
 		struct timeval __user *, utimes)
 {
 	struct timeval times[2];
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 168f7daa7bd..9ddc8780e8d 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -331,7 +331,7 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
 			const compat_sigset_t __user *sigmask,
 			compat_size_t sigsetsize);
 
-asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename,
 				struct compat_timespec __user *t, int flags);
 
 asmlinkage long compat_sys_signalfd(int ufd,
@@ -348,9 +348,9 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page,
 				      const int __user *nodes,
 				      int __user *status,
 				      int flags);
-asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename,
 				     struct compat_timeval __user *t);
-asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd, const char __user * filename,
 				      struct compat_stat __user *statbuf,
 				      int flag);
 asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7a0625e26a3..5f0ca2fbb2a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2320,10 +2320,10 @@ void inode_set_bytes(struct inode *inode, loff_t bytes);
 
 extern int vfs_readdir(struct file *, filldir_t, void *);
 
-extern int vfs_stat(char __user *, struct kstat *);
-extern int vfs_lstat(char __user *, struct kstat *);
+extern int vfs_stat(const char __user *, struct kstat *);
+extern int vfs_lstat(const char __user *, struct kstat *);
 extern int vfs_fstat(unsigned int, struct kstat *);
-extern int vfs_fstatat(int , char __user *, struct kstat *, int);
+extern int vfs_fstatat(int , const char __user *, struct kstat *, int);
 
 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1b67bd333b5..6e5d1978863 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -394,7 +394,7 @@ asmlinkage long sys_umount(char __user *name, int flags);
 asmlinkage long sys_oldumount(char __user *name);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
-asmlinkage long sys_stat(char __user *filename,
+asmlinkage long sys_stat(const char __user *filename,
 			struct __old_kernel_stat __user *statbuf);
 asmlinkage long sys_statfs(const char __user * path,
 				struct statfs __user *buf);
@@ -403,21 +403,21 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 				struct statfs64 __user *buf);
-asmlinkage long sys_lstat(char __user *filename,
+asmlinkage long sys_lstat(const char __user *filename,
 			struct __old_kernel_stat __user *statbuf);
 asmlinkage long sys_fstat(unsigned int fd,
 			struct __old_kernel_stat __user *statbuf);
-asmlinkage long sys_newstat(char __user *filename,
+asmlinkage long sys_newstat(const char __user *filename,
 				struct stat __user *statbuf);
-asmlinkage long sys_newlstat(char __user *filename,
+asmlinkage long sys_newlstat(const char __user *filename,
 				struct stat __user *statbuf);
 asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
 asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf);
 #if BITS_PER_LONG == 32
-asmlinkage long sys_stat64(char __user *filename,
+asmlinkage long sys_stat64(const char __user *filename,
 				struct stat64 __user *statbuf);
 asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
-asmlinkage long sys_lstat64(char __user *filename,
+asmlinkage long sys_lstat64(const char __user *filename,
 				struct stat64 __user *statbuf);
 asmlinkage long sys_truncate64(const char __user *path, loff_t length);
 asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length);
@@ -760,7 +760,7 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 			   int newdfd, const char __user *newname, int flags);
 asmlinkage long sys_renameat(int olddfd, const char __user * oldname,
 			     int newdfd, const char __user * newname);
-asmlinkage long sys_futimesat(int dfd, char __user *filename,
+asmlinkage long sys_futimesat(int dfd, const char __user *filename,
 			      struct timeval __user *utimes);
 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
 asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
@@ -769,13 +769,13 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
 			     gid_t group, int flag);
 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 			   int mode);
-asmlinkage long sys_newfstatat(int dfd, char __user *filename,
+asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
 			       struct stat __user *statbuf, int flag);
-asmlinkage long sys_fstatat64(int dfd, char __user *filename,
+asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
 			       struct stat64 __user *statbuf, int flag);
 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
 			       int bufsiz);
-asmlinkage long sys_utimensat(int dfd, char __user *filename,
+asmlinkage long sys_utimensat(int dfd, const char __user *filename,
 				struct timespec __user *utimes, int flags);
 asmlinkage long sys_unshare(unsigned long unshare_flags);
 
diff --git a/include/linux/time.h b/include/linux/time.h
index 12612701b1a..9f15ac7ab92 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -150,7 +150,7 @@ extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
 #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
-extern long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags);
+extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
-- 
cgit v1.2.3-70-g09d2